/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.28 - (hide annotations) (download) (as text)
Sun Jul 5 04:38:45 2009 UTC (15 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.27: +13 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Jul 2009 04:38:11 -0000
2009-07-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Updated the result (c.f. HTML5 revision
	3121).

++ whatpm/Whatpm/HTML/ChangeLog	5 Jul 2009 04:38:33 -0000
2009-07-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Reduced the number of parse errors on broken
	DOCTYPE (HTML5 revision 3121).

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.28 our $VERSION=do{my @r=(q$Revision: 1.27 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with |$token->{tag_name}| set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
79     sub CM_ENTITY () { 0b001 } # bit: "&" starts a character reference in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # bit: "<" is markup in data, but only "</" (close tag) is acted on
81     sub CM_FULL_MARKUP () { 0b100 } # bit: "<" may start any tag, declaration, or comment
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no bits set: neither "&" nor "<" is markup
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # close tags only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # character references and close tags
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # character references and full markup
87    
88     ## Tokenizer states
89    
90     sub DATA_STATE () { 0 } # initial state (set by |_initialize_tokenizer|)
91     #sub ENTITY_DATA_STATE () { 1 } # unused: implemented by |ENTITY_STATE| and the other character reference states
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # unused: implemented by |ENTITY_STATE| and the other character reference states
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # bit 11 (2048)
202     sub FOREIGN_EL () { 0b1_00000000000 } # also bit 11 (2048); same numeric value as |IN_FOREIGN_CONTENT_IM|
203    
204     ## Character reference mappings
205    
206     my $charref_map = { # code point => replacement code point; NOTE(review): presumably applied to numeric character references - the use site is outside this view
207     0x0D => 0x000A, # CR is replaced by LF
208     0x80 => 0x20AC, # 0x80..0x9F: replaced by the Windows-1252 character at that position, or U+FFFD where Windows-1252 has none
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
241     $charref_map->{$_} = 0xFFFD # every code point listed below is replaced by U+FFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, # C0 controls (except HT, LF, FF, CR) and DEL
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## surrogates; ISSUE: Unicode noncharacters run through 0xFDEF, but this range stops at 0xFDDF
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, # last two code points of each plane
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
252     sub _initialize_tokenizer ($) { # Resets the tokenizer fields of $self (the only argument) to their initial values.
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword: recent characters, used in the data state to spot "<!--", "-->" and "]]>"
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name; close tags in CDATA/RCDATA are matched against it
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing}; # self-closing flag; checked at the top of |_get_next_token|
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character; -1 is handled as end of file by |_get_next_token|
275     #$self->{next_nc}
276     !!!next-input-character; # |!!!| macro, not plain Perl; NOTE(review): presumably loads the first input character into |nc| - confirm against the macro definition
277     $self->{token} = []; # pending tokens; |_get_next_token| shifts from the front before tokenizing further
278     # $self->{escape} (not initialized here)
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## A token's own |->{self_closing}| field saves the value of |$self->{self_closing}|
302     ## while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
315     my $is_space = { # code points treated as space characters; looked up with |$self->{nc}|
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT): deliberately not a space character
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR): disabled; NOTE(review): presumably CR never reaches the tokenizer - confirm
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec not explicitly
736     ## states that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
     ## Tag name state: appends characters to the tag name of the
     ## current start/end tag token ($self->{ct}).  White space leads to
     ## BEFORE_ATTRIBUTE_NAME_STATE, "/" to SELF_CLOSING_START_TAG_STATE,
     ## and ">" or nc == -1 emit the token and switch to DATA_STATE.
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
     ## For a start tag, remember its name in last_stag_name; for an
     ## end tag, the content model is reset to PCDATA.
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
     ## Uppercase ASCII letters are lowercased by adding 0x20, except
     ## when is_xml is set, in which case they are kept as is.
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): report the unclosed tag
     ## and emit the token collected so far; the current character is
     ## not consumed, so DATA_STATE sees it again.
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
     ## Any other character is appended to the tag name unchanged.
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
     ## Skips white space before an attribute name.  ">" or nc == -1
     ## emit the current tag token, "/" leads to
     ## SELF_CLOSING_START_TAG_STATE, and any other character starts a
     ## new attribute in $self->{ca} and enters ATTRIBUTE_NAME_STATE.
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
     ## An end tag that has collected attributes is a parse error.
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
     ## Start a new attribute whose name begins with this letter,
     ## lowercased unless is_xml is set.  The current line and column
     ## are stored in the attribute.
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): emit the unfinished tag
     ## without consuming the current character.
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
     ## A quote or "=" at the start of an attribute name is reported
     ## as a parse error, but the character still becomes the first
     ## character of the new attribute name.
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951     0x003D => 1, # =
952     }->{$self->{nc}}) {
953     !!!cp (55);
954 wakaba 1.11 ## XML5: Not a parse error.
955 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
956     } else {
957     !!!cp (56);
958 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
959 wakaba 1.1 }
960     $self->{ca}
961     = {name => chr ($self->{nc}),
962     value => '',
963     line => $self->{line}, column => $self->{column}};
964     $self->{state} = ATTRIBUTE_NAME_STATE;
965     !!!next-input-character;
966     redo A;
967     }
968     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 wakaba 1.11 ## XML5: "Tag attribute name state".
     ## Appends characters to the name of the current attribute
     ## ($self->{ca}).  Every branch that leaves this state calls
     ## $before_leave first, so that the finished attribute is either
     ## registered on the current token or discarded as a duplicate.
970    
     ## $before_leave: if the token already has an attribute with the
     ## same name, report 'duplicate attribute' at the position stored
     ## in $self->{ca}; otherwise store $self->{ca} in the token's
     ## attributes hash and give it the next value of last_index.
971 wakaba 1.1 my $before_leave = sub {
972     if (exists $self->{ct}->{attributes} # start tag or end tag
973     ->{$self->{ca}->{name}}) { # MUST
974     !!!cp (57);
975     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976     ## Discard $self->{ca} # MUST
977     } else {
978     !!!cp (58);
979     $self->{ct}->{attributes}->{$self->{ca}->{name}}
980     = $self->{ca};
981 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 wakaba 1.1 }
983     }; # $before_leave
984    
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (59);
987     $before_leave->();
988     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003D) { # =
992     !!!cp (60);
993     $before_leave->();
994     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003E) { # >
     ## In XML mode, an attribute without a value is a parse error.
998 wakaba 1.11 if ($self->{is_xml}) {
999     !!!cp (60.1);
1000     ## XML5: Not a parse error.
1001     !!!parse-error (type => 'no attr value'); ## TODO: type
1002     } else {
1003     !!!cp (60.2);
1004     }
1005    
1006 wakaba 1.1 $before_leave->();
1007     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008     !!!cp (61);
1009     $self->{last_stag_name} = $self->{ct}->{tag_name};
1010     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011     !!!cp (62);
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
     ## NOTE(review): $before_leave has just run, so the attributes
     ## hash should already exist here and this test is expected to
     ## be always true for end tags -- confirm.
1013     if ($self->{ct}->{attributes}) {
1014     !!!parse-error (type => 'end tag attribute');
1015     }
1016     } else {
1017     die "$0: $self->{ct}->{type}: Unknown token type";
1018     }
1019     $self->{state} = DATA_STATE;
1020 wakaba 1.5 $self->{s_kwd} = '';
1021 wakaba 1.1 !!!next-input-character;
1022    
1023     !!!emit ($self->{ct}); # start tag or end tag
1024    
1025     redo A;
1026     } elsif (0x0041 <= $self->{nc} and
1027     $self->{nc} <= 0x005A) { # A..Z
     ## Uppercase ASCII letters are lowercased unless is_xml is set.
1028     !!!cp (63);
1029 wakaba 1.4 $self->{ca}->{name}
1030     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 wakaba 1.1 ## Stay in the state
1032     !!!next-input-character;
1033     redo A;
1034     } elsif ($self->{nc} == 0x002F) { # /
1035 wakaba 1.11 if ($self->{is_xml}) {
1036     !!!cp (64);
1037     ## XML5: Not a parse error.
1038     !!!parse-error (type => 'no attr value'); ## TODO: type
1039     } else {
1040     !!!cp (64.1);
1041     }
1042    
1043 wakaba 1.1 $before_leave->();
1044     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045     !!!next-input-character;
1046     redo A;
1047     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): register the attribute,
     ## then emit the unfinished tag without consuming the character.
1048     !!!parse-error (type => 'unclosed tag');
1049     $before_leave->();
1050     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051     !!!cp (66);
1052     $self->{last_stag_name} = $self->{ct}->{tag_name};
1053     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055     if ($self->{ct}->{attributes}) {
1056     !!!cp (67);
1057     !!!parse-error (type => 'end tag attribute');
1058     } else {
1059     ## NOTE: This state should never be reached.
1060     !!!cp (68);
1061     }
1062     } else {
1063     die "$0: $self->{ct}->{type}: Unknown token type";
1064     }
1065     $self->{state} = DATA_STATE;
1066 wakaba 1.5 $self->{s_kwd} = '';
1067 wakaba 1.1 # reconsume
1068    
1069     !!!emit ($self->{ct}); # start tag or end tag
1070    
1071     redo A;
1072     } else {
     ## A quote inside an attribute name is a parse error, but the
     ## character is appended to the name anyway.
1073     if ($self->{nc} == 0x0022 or # "
1074     $self->{nc} == 0x0027) { # '
1075     !!!cp (69);
1076 wakaba 1.11 ## XML5: Not a parse error.
1077 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1078     } else {
1079     !!!cp (70);
1080     }
1081     $self->{ca}->{name} .= chr ($self->{nc});
1082     ## Stay in the state
1083     !!!next-input-character;
1084     redo A;
1085     }
1086     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 wakaba 1.11 ## XML5: "Tag attribute name after state".
     ## Entered from ATTRIBUTE_NAME_STATE after white space.  "=" leads
     ## to BEFORE_ATTRIBUTE_VALUE_STATE; ">" or nc == -1 emit the tag;
     ## "/" leads to SELF_CLOSING_START_TAG_STATE; any other character
     ## starts a new attribute.  In XML mode every exit other than "="
     ## reports 'no attr value'.
1088    
1089 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1090     !!!cp (71);
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003D) { # =
1095     !!!cp (72);
1096     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003E) { # >
1100 wakaba 1.11 if ($self->{is_xml}) {
1101     !!!cp (72.1);
1102     ## XML5: Not a parse error.
1103     !!!parse-error (type => 'no attr value'); ## TODO: type
1104     } else {
1105     !!!cp (72.2);
1106     }
1107    
1108 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109     !!!cp (73);
1110     $self->{last_stag_name} = $self->{ct}->{tag_name};
1111     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     !!!cp (74);
1115     !!!parse-error (type => 'end tag attribute');
1116     } else {
1117     ## NOTE: This state should never be reached.
1118     !!!cp (75);
1119     }
1120     } else {
1121     die "$0: $self->{ct}->{type}: Unknown token type";
1122     }
1123     $self->{state} = DATA_STATE;
1124 wakaba 1.5 $self->{s_kwd} = '';
1125 wakaba 1.1 !!!next-input-character;
1126    
1127     !!!emit ($self->{ct}); # start tag or end tag
1128    
1129     redo A;
1130     } elsif (0x0041 <= $self->{nc} and
1131     $self->{nc} <= 0x005A) { # A..Z
     ## Start a new attribute; the letter is lowercased unless is_xml
     ## is set.
     ## NOTE(review): unlike the final "else" branch below, this
     ## branch does not report 'no attr value' in XML mode -- confirm
     ## whether that is intended.
1132     !!!cp (76);
1133     $self->{ca}
1134 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 wakaba 1.1 value => '',
1136     line => $self->{line}, column => $self->{column}};
1137     $self->{state} = ATTRIBUTE_NAME_STATE;
1138     !!!next-input-character;
1139     redo A;
1140     } elsif ($self->{nc} == 0x002F) { # /
1141 wakaba 1.11 if ($self->{is_xml}) {
1142     !!!cp (77);
1143     ## XML5: Not a parse error.
1144     !!!parse-error (type => 'no attr value'); ## TODO: type
1145     } else {
1146     !!!cp (77.1);
1147     }
1148    
1149 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150     !!!next-input-character;
1151     redo A;
1152     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): emit the unfinished tag
     ## without consuming the current character.
1153     !!!parse-error (type => 'unclosed tag');
1154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155     !!!cp (79);
1156     $self->{last_stag_name} = $self->{ct}->{tag_name};
1157     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159     if ($self->{ct}->{attributes}) {
1160     !!!cp (80);
1161     !!!parse-error (type => 'end tag attribute');
1162     } else {
1163     ## NOTE: This state should never be reached.
1164     !!!cp (81);
1165     }
1166     } else {
1167     die "$0: $self->{ct}->{type}: Unknown token type";
1168     }
1169 wakaba 1.5 $self->{s_kwd} = '';
1170 wakaba 1.1 $self->{state} = DATA_STATE;
1171     # reconsume
1172    
1173     !!!emit ($self->{ct}); # start tag or end tag
1174    
1175     redo A;
1176     } else {
1177 wakaba 1.11 if ($self->{is_xml}) {
1178     !!!cp (78.1);
1179     ## XML5: Not a parse error.
1180     !!!parse-error (type => 'no attr value'); ## TODO: type
1181     } else {
1182     !!!cp (78.2);
1183     }
1184    
     ## A quote at the start of a new attribute name is a parse error,
     ## but it still becomes the first character of the name.
1185 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1186     $self->{nc} == 0x0027) { # '
1187     !!!cp (78);
1188 wakaba 1.11 ## XML5: Not a parse error.
1189 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1190     } else {
1191     !!!cp (82);
1192     }
1193     $self->{ca}
1194     = {name => chr ($self->{nc}),
1195     value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198     !!!next-input-character;
1199     redo A;
1200     }
1201     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 wakaba 1.11 ## XML5: "Tag attribute value before state".
     ## Entered after "=".  Skips white space and then selects the
     ## double-quoted, single-quoted or unquoted attribute value state
     ## depending on the first character of the value.
1203    
1204 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1205     !!!cp (83);
1206     ## Stay in the state
1207     !!!next-input-character;
1208     redo A;
1209     } elsif ($self->{nc} == 0x0022) { # "
1210     !!!cp (84);
1211     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x0026) { # &
     ## The "&" is not consumed here; the unquoted value state
     ## processes it.
1215     !!!cp (85);
1216     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217     ## reconsume
1218     redo A;
1219     } elsif ($self->{nc} == 0x0027) { # '
1220     !!!cp (86);
1221     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222     !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{nc} == 0x003E) { # >
     ## "=" immediately followed by ">": report the empty value and
     ## emit the tag.
1225     !!!parse-error (type => 'empty unquoted attribute value');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227     !!!cp (87);
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232     !!!cp (88);
1233     !!!parse-error (type => 'end tag attribute');
1234     } else {
1235     ## NOTE: This state should never be reached.
1236     !!!cp (89);
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 !!!next-input-character;
1244    
1245     !!!emit ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): emit the unfinished tag
     ## without consuming the current character.
1249     !!!parse-error (type => 'unclosed tag');
1250     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251     !!!cp (90);
1252     $self->{last_stag_name} = $self->{ct}->{tag_name};
1253     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255     if ($self->{ct}->{attributes}) {
1256     !!!cp (91);
1257     !!!parse-error (type => 'end tag attribute');
1258     } else {
1259     ## NOTE: This state should never be reached.
1260     !!!cp (92);
1261     }
1262     } else {
1263     die "$0: $self->{ct}->{type}: Unknown token type";
1264     }
1265     $self->{state} = DATA_STATE;
1266 wakaba 1.5 $self->{s_kwd} = '';
1267 wakaba 1.1 ## reconsume
1268    
1269     !!!emit ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } else {
     ## Any other character starts an unquoted value.  "=" and "<"
     ## are reported as 'bad attribute value'; otherwise, in XML
     ## mode, an unquoted value is itself reported as an error.  In
     ## every case the character becomes the first character of the
     ## value.
1273 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274 wakaba 1.1 !!!cp (93);
1275 wakaba 1.11 ## XML5: Not a parse error.
1276 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1277 wakaba 1.11 } elsif ($self->{is_xml}) {
1278     !!!cp (93.1);
1279     ## XML5: No parse error.
1280     !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 wakaba 1.1 } else {
1282     !!!cp (94);
1283     }
1284     $self->{ca}->{value} .= chr ($self->{nc});
1285     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286     !!!next-input-character;
1287     redo A;
1288     }
1289     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291     ## ATTLIST attribute value double quoted state".
     ## Collects a double-quoted attribute value into
     ## $self->{ca}->{value}.  The state is shared by start/end tag
     ## tokens and ATTLIST tokens, hence the checks on the token type.
1292 wakaba 1.11
1293 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
     ## Closing quote.  For an ATTLIST token the finished attribute
     ## definition is pushed onto the token's attrdefs list.
1294 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295     !!!cp (95.1);
1296     ## XML5: "DOCTYPE ATTLIST name after state".
1297     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299     } else {
1300     !!!cp (95);
1301     ## XML5: "Tag attribute name before state".
1302     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     }
1304 wakaba 1.1 !!!next-input-character;
1305     redo A;
1306     } elsif ($self->{nc} == 0x0026) { # &
1307     !!!cp (96);
1308 wakaba 1.11 ## XML5: Not defined yet.
1309    
1310 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1311     ## "entity in attribute value state". In this implementation, the
1312     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313     ## implementation of the "consume a character reference" algorithm.
     ## prev_state records this state and entity_add records the
     ## quote character before switching (presumably so that
     ## ENTITY_STATE can return here -- its code is not in this part
     ## of the file).
1314     $self->{prev_state} = $self->{state};
1315     $self->{entity_add} = 0x0022; # "
1316     $self->{state} = ENTITY_STATE;
1317     !!!next-input-character;
1318     redo A;
1319 wakaba 1.25 } elsif ($self->{is_xml} and
1320     $is_space->{$self->{nc}}) {
     ## In XML mode each white space character in the value is
     ## replaced by a single U+0020 SPACE.
1321     !!!cp (97.1);
1322     $self->{ca}->{value} .= ' ';
1323     ## Stay in the state.
1324     !!!next-input-character;
1325     redo A;
1326 wakaba 1.1 } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input) inside the value: report
     ## the error and emit the unfinished token without consuming
     ## the current character.
1327     !!!parse-error (type => 'unclosed attribute value');
1328     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329     !!!cp (97);
1330     $self->{last_stag_name} = $self->{ct}->{tag_name};
1331 wakaba 1.15
1332     $self->{state} = DATA_STATE;
1333     $self->{s_kwd} = '';
1334     ## reconsume
1335     !!!emit ($self->{ct}); # start tag
1336     redo A;
1337 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339     if ($self->{ct}->{attributes}) {
1340     !!!cp (98);
1341     !!!parse-error (type => 'end tag attribute');
1342     } else {
1343     ## NOTE: This state should never be reached.
1344     !!!cp (99);
1345     }
1346 wakaba 1.15
1347     $self->{state} = DATA_STATE;
1348     $self->{s_kwd} = '';
1349     ## reconsume
1350     !!!emit ($self->{ct}); # end tag
1351     redo A;
1352     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353     ## XML5: No parse error above; not defined yet.
1354     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356     ## Reconsume.
1357     !!!emit ($self->{ct}); # ATTLIST
1358     redo A;
1359 wakaba 1.1 } else {
1360     die "$0: $self->{ct}->{type}: Unknown token type";
1361     }
1362     } else {
1363 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1364 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365     !!!cp (100);
1366     ## XML5: Not a parse error.
1367     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368     } else {
1369     !!!cp (100.1);
1370     }
     ## Append the current character, then call read_until with the
     ## value, a list of delimiter characters and the current value
     ## length (presumably it appends the following run of
     ## non-delimiter characters in one step; read_until is defined
     ## outside this part of the file).
1371 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1372     $self->{read_until}->($self->{ca}->{value},
1373 wakaba 1.25 qq["&<\x09\x0C\x20],
1374 wakaba 1.1 length $self->{ca}->{value});
1375    
1376     ## Stay in the state
1377     !!!next-input-character;
1378     redo A;
1379     }
1380     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1381 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382     ## ATTLIST attribute value single quoted state".
     ## Collects a single-quoted attribute value into
     ## $self->{ca}->{value}.  This branch mirrors the double-quoted
     ## state, with "'" as the terminating quote.
1383 wakaba 1.11
1384 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
     ## Closing quote.  For an ATTLIST token the finished attribute
     ## definition is pushed onto the token's attrdefs list.
1385 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386     !!!cp (101.1);
1387     ## XML5: "DOCTYPE ATTLIST name after state".
1388     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390     } else {
1391     !!!cp (101);
1392     ## XML5: "Before attribute name state" (sic).
1393     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394     }
1395 wakaba 1.1 !!!next-input-character;
1396     redo A;
1397     } elsif ($self->{nc} == 0x0026) { # &
1398     !!!cp (102);
1399 wakaba 1.11 ## XML5: Not defined yet.
1400    
1401 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1402     ## "entity in attribute value state". In this implementation, the
1403     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404     ## implementation of the "consume a character reference" algorithm.
     ## entity_add records the quote character and prev_state records
     ## this state before switching (presumably so that ENTITY_STATE
     ## can return here -- its code is not in this part of the file).
1405     $self->{entity_add} = 0x0027; # '
1406     $self->{prev_state} = $self->{state};
1407     $self->{state} = ENTITY_STATE;
1408     !!!next-input-character;
1409     redo A;
1410 wakaba 1.25 } elsif ($self->{is_xml} and
1411     $is_space->{$self->{nc}}) {
     ## In XML mode each white space character in the value is
     ## replaced by a single U+0020 SPACE.
1412     !!!cp (103.1);
1413     $self->{ca}->{value} .= ' ';
1414     ## Stay in the state.
1415     !!!next-input-character;
1416     redo A;
1417 wakaba 1.1 } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input) inside the value: report
     ## the error and emit the unfinished token without consuming
     ## the current character.
1418     !!!parse-error (type => 'unclosed attribute value');
1419     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420     !!!cp (103);
1421     $self->{last_stag_name} = $self->{ct}->{tag_name};
1422 wakaba 1.15
1423     $self->{state} = DATA_STATE;
1424     $self->{s_kwd} = '';
1425     ## reconsume
1426     !!!emit ($self->{ct}); # start tag
1427     redo A;
1428 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430     if ($self->{ct}->{attributes}) {
1431     !!!cp (104);
1432     !!!parse-error (type => 'end tag attribute');
1433     } else {
1434     ## NOTE: This state should never be reached.
1435     !!!cp (105);
1436     }
1437 wakaba 1.15
1438     $self->{state} = DATA_STATE;
1439     $self->{s_kwd} = '';
1440     ## reconsume
1441     !!!emit ($self->{ct}); # end tag
1442     redo A;
1443     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444     ## XML5: No parse error above; not defined yet.
1445     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447     ## Reconsume.
1448     !!!emit ($self->{ct}); # ATTLIST
1449     redo A;
1450 wakaba 1.1 } else {
1451     die "$0: $self->{ct}->{type}: Unknown token type";
1452     }
1453     } else {
1454 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1455 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456     !!!cp (106);
1457     ## XML5: Not a parse error.
1458     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459     } else {
1460     !!!cp (106.1);
1461     }
     ## Append the current character, then call read_until with the
     ## value, a list of delimiter characters and the current value
     ## length (presumably it appends the following run of
     ## non-delimiter characters in one step; read_until is defined
     ## outside this part of the file).
1462 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1463     $self->{read_until}->($self->{ca}->{value},
1464 wakaba 1.25 qq['&<\x09\x0C\x20],
1465 wakaba 1.1 length $self->{ca}->{value});
1466    
1467     ## Stay in the state
1468     !!!next-input-character;
1469     redo A;
1470     }
1471     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1472 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
     ## Collects an unquoted attribute value into $self->{ca}->{value}.
     ## White space ends the value, ">" or nc == -1 end the value and
     ## emit the token, and "&" starts a character reference.  The
     ## state is shared by start/end tag tokens and ATTLIST tokens.
1473    
1474 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1475 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476     !!!cp (107.1);
1477     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479     } else {
1480     !!!cp (107);
1481     ## XML5: "Tag attribute name before state".
1482     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483     }
1484 wakaba 1.1 !!!next-input-character;
1485     redo A;
1486     } elsif ($self->{nc} == 0x0026) { # &
1487     !!!cp (108);
1488 wakaba 1.11
1489     ## XML5: Not defined yet.
1490    
1491 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1492     ## "entity in attribute value state". In this implementation, the
1493     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494     ## implementation of the "consume a character reference" algorithm.
     ## entity_add is -1 here because an unquoted value has no quote
     ## character; prev_state records this state before switching.
1495     $self->{entity_add} = -1;
1496     $self->{prev_state} = $self->{state};
1497     $self->{state} = ENTITY_STATE;
1498     !!!next-input-character;
1499     redo A;
1500     } elsif ($self->{nc} == 0x003E) { # >
1501     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502     !!!cp (109);
1503     $self->{last_stag_name} = $self->{ct}->{tag_name};
1504 wakaba 1.15
1505     $self->{state} = DATA_STATE;
1506     $self->{s_kwd} = '';
1507     !!!next-input-character;
1508     !!!emit ($self->{ct}); # start tag
1509     redo A;
1510 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512     if ($self->{ct}->{attributes}) {
1513     !!!cp (110);
1514     !!!parse-error (type => 'end tag attribute');
1515     } else {
1516     ## NOTE: This state should never be reached.
1517     !!!cp (111);
1518     }
1519 wakaba 1.15
1520     $self->{state} = DATA_STATE;
1521     $self->{s_kwd} = '';
1522     !!!next-input-character;
1523     !!!emit ($self->{ct}); # end tag
1524     redo A;
1525     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528     !!!next-input-character;
1529     !!!emit ($self->{ct}); # ATTLIST
1530     redo A;
1531 wakaba 1.1 } else {
1532     die "$0: $self->{ct}->{type}: Unknown token type";
1533     }
1534     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): report the error and emit
     ## the unfinished token without consuming the current character.
1535     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536     !!!cp (112);
1537 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1538 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539 wakaba 1.15
1540     $self->{state} = DATA_STATE;
1541     $self->{s_kwd} = '';
1542     ## reconsume
1543     !!!emit ($self->{ct}); # start tag
1544     redo A;
1545 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1547 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548     if ($self->{ct}->{attributes}) {
1549     !!!cp (113);
1550     !!!parse-error (type => 'end tag attribute');
1551     } else {
1552     ## NOTE: This state should never be reached.
1553     !!!cp (114);
1554     }
1555 wakaba 1.15
1556     $self->{state} = DATA_STATE;
1557     $self->{s_kwd} = '';
1558     ## reconsume
1559     !!!emit ($self->{ct}); # end tag
1560     redo A;
1561     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562     !!!parse-error (type => 'unclosed md'); ## TODO: type
1563     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565     ## Reconsume.
1566     !!!emit ($self->{ct}); # ATTLIST
1567     redo A;
1568 wakaba 1.1 } else {
1569     die "$0: $self->{ct}->{type}: Unknown token type";
1570     }
1571     } else {
     ## Quotes, "=" and "<" inside an unquoted value are parse
     ## errors, but the character is still appended to the value.
1572     if ({
1573     0x0022 => 1, # "
1574     0x0027 => 1, # '
1575     0x003D => 1, # =
1576 wakaba 1.26 0x003C => 1, # <
1577 wakaba 1.1 }->{$self->{nc}}) {
1578     !!!cp (115);
1579 wakaba 1.11 ## XML5: Not a parse error.
1580 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1581     } else {
1582     !!!cp (116);
1583     }
1584     $self->{ca}->{value} .= chr ($self->{nc});
     ## The delimiter list given to read_until has to contain every
     ## character that is handled specially by this state.  "<" is
     ## listed so that a "<" in the middle of the value comes back
     ## through the check above and is reported as 'bad attribute
     ## value', instead of being passed to read_until as an ordinary
     ## character.
1585     $self->{read_until}->($self->{ca}->{value},
1586 wakaba 1.25 qq["'=&< \x09\x0C>],
1587 wakaba 1.1 length $self->{ca}->{value});
1588    
1589     ## Stay in the state
1590     !!!next-input-character;
1591     redo A;
1592     }
1593     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
     ## Entered after the closing quote of a quoted attribute value.
     ## White space leads to BEFORE_ATTRIBUTE_NAME_STATE, ">" or
     ## nc == -1 emit the tag, "/" leads to
     ## SELF_CLOSING_START_TAG_STATE, and anything else is reported as
     ## 'no space between attributes' and reprocessed as the start of
     ## the next attribute.
1594     if ($is_space->{$self->{nc}}) {
1595     !!!cp (118);
1596     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1597     !!!next-input-character;
1598     redo A;
1599     } elsif ($self->{nc} == 0x003E) { # >
1600     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601     !!!cp (119);
1602     $self->{last_stag_name} = $self->{ct}->{tag_name};
1603     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605     if ($self->{ct}->{attributes}) {
1606     !!!cp (120);
1607     !!!parse-error (type => 'end tag attribute');
1608     } else {
1609     ## NOTE: This state should never be reached.
1610     !!!cp (121);
1611     }
1612     } else {
1613     die "$0: $self->{ct}->{type}: Unknown token type";
1614     }
1615     $self->{state} = DATA_STATE;
1616 wakaba 1.5 $self->{s_kwd} = '';
1617 wakaba 1.1 !!!next-input-character;
1618    
1619     !!!emit ($self->{ct}); # start tag or end tag
1620    
1621     redo A;
1622     } elsif ($self->{nc} == 0x002F) { # /
1623     !!!cp (122);
1624     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625     !!!next-input-character;
1626     redo A;
1627     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): emit the unfinished tag
     ## without consuming the current character.
1628     !!!parse-error (type => 'unclosed tag');
1629     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1630     !!!cp (122.3);
1631     $self->{last_stag_name} = $self->{ct}->{tag_name};
1632     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
     ## Reset the content model whenever an end tag is emitted, as
     ## the ">" branch of this state does.
     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1633     if ($self->{ct}->{attributes}) {
1634     !!!cp (122.1);
1635     !!!parse-error (type => 'end tag attribute');
1636     } else {
1637     ## NOTE: This state should never be reached.
1638     !!!cp (122.2);
1639     }
1640     } else {
1641     die "$0: $self->{ct}->{type}: Unknown token type";
1642     }
1643     $self->{state} = DATA_STATE;
1644 wakaba 1.5 $self->{s_kwd} = '';
1645 wakaba 1.1 ## Reconsume.
1646     !!!emit ($self->{ct}); # start tag or end tag
1647     redo A;
1648     } else {
     ## The character is not consumed; BEFORE_ATTRIBUTE_NAME_STATE
     ## processes it after the error has been reported.
1649     !!!cp ('124.1');
1650     !!!parse-error (type => 'no space between attributes');
1651     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1652     ## reconsume
1653     redo A;
1654     }
1655     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1656 wakaba 1.11 ## XML5: "Empty tag state".
     ## Entered after "/" inside a tag.  ">" emits the tag (a start
     ## tag gets the self_closing flag, an end tag gets a 'nestc' parse
     ## error), nc == -1 emits the unfinished tag, and any other
     ## character is reported and reprocessed in
     ## BEFORE_ATTRIBUTE_NAME_STATE.
     ## NOTE(review): checkpoint ids '124.4' and '124.5' are each used
     ## twice within this state -- confirm whether that is intended.
1657    
1658 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1659     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     !!!cp ('124.2');
1661     !!!parse-error (type => 'nestc', token => $self->{ct});
1662     ## TODO: Different type than slash in start tag
1663     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664     if ($self->{ct}->{attributes}) {
1665     !!!cp ('124.4');
1666     !!!parse-error (type => 'end tag attribute');
1667     } else {
1668     !!!cp ('124.5');
1669     }
1670     ## TODO: Test |<title></title/>|
1671     } else {
1672     !!!cp ('124.3');
1673     $self->{self_closing} = 1;
1674     }
1675    
1676     $self->{state} = DATA_STATE;
1677 wakaba 1.5 $self->{s_kwd} = '';
1678 wakaba 1.1 !!!next-input-character;
1679    
1680     !!!emit ($self->{ct}); # start tag or end tag
1681    
1682     redo A;
1683     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): emit the unfinished tag
     ## without consuming the current character.
1684     !!!parse-error (type => 'unclosed tag');
1685     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1686     !!!cp (124.7);
1687     $self->{last_stag_name} = $self->{ct}->{tag_name};
1688     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
     ## Reset the content model whenever an end tag is emitted, as
     ## the ">" branch of this state does.
     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1689     if ($self->{ct}->{attributes}) {
1690     !!!cp (124.5);
1691     !!!parse-error (type => 'end tag attribute');
1692     } else {
1693     ## NOTE: This state should never be reached.
1694     !!!cp (124.6);
1695     }
1696     } else {
1697     die "$0: $self->{ct}->{type}: Unknown token type";
1698     }
1699 wakaba 1.11 ## XML5: "Tag attribute name before state".
1700 wakaba 1.1 $self->{state} = DATA_STATE;
1701 wakaba 1.5 $self->{s_kwd} = '';
1702 wakaba 1.1 ## Reconsume.
1703     !!!emit ($self->{ct}); # start tag or end tag
1704     redo A;
1705     } else {
     ## "/" followed by something other than ">": report the error
     ## and let BEFORE_ATTRIBUTE_NAME_STATE process the character.
1706     !!!cp ('124.4');
1707     !!!parse-error (type => 'nestc');
1708     ## TODO: This error type is wrong.
1709     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1710     ## Reconsume.
1711     redo A;
1712     }
1713     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1714 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
     ## Collects everything up to the next ">" (or nc == -1) as the
     ## data of the current comment token ($self->{ct}) and then emits
     ## it.  When in_subset is set, the tokenizer continues in
     ## DOCTYPE_INTERNAL_SUBSET_STATE instead of DATA_STATE.
1715    
1716 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1717     ## consumes characters one-by-one basis.
1718    
1719     if ($self->{nc} == 0x003E) { # >
1720 wakaba 1.13 if ($self->{in_subset}) {
1721     !!!cp (123);
1722     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1723     } else {
1724     !!!cp (124);
1725     $self->{state} = DATA_STATE;
1726     $self->{s_kwd} = '';
1727     }
1728 wakaba 1.1 !!!next-input-character;
1729    
1730     !!!emit ($self->{ct}); # comment
1731     redo A;
1732     } elsif ($self->{nc} == -1) {
     ## nc == -1 (presumably end of input): emit the comment
     ## collected so far without consuming the current character.
1733 wakaba 1.13 if ($self->{in_subset}) {
1734     !!!cp (125.1);
1735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1736     } else {
1737     !!!cp (125);
1738     $self->{state} = DATA_STATE;
1739     $self->{s_kwd} = '';
1740     }
1741 wakaba 1.1 ## reconsume
1742    
1743     !!!emit ($self->{ct}); # comment
1744     redo A;
1745     } else {
     ## Append the current character, then call read_until with ">"
     ## as the only delimiter (presumably it appends the following
     ## run of characters up to the delimiter in one step;
     ## read_until is defined outside this part of the file).
1746     !!!cp (126);
1747     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748     $self->{read_until}->($self->{ct}->{data},
1749     q[>],
1750     length $self->{ct}->{data});
1751    
1752     ## Stay in the state.
1753     !!!next-input-character;
1754     redo A;
1755     }
1756     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1757 wakaba 1.14 ## XML5: "Markup declaration state".
1758 wakaba 1.1
1759     if ($self->{nc} == 0x002D) { # -
1760     !!!cp (133);
1761     $self->{state} = MD_HYPHEN_STATE;
1762     !!!next-input-character;
1763     redo A;
1764     } elsif ($self->{nc} == 0x0044 or # D
1765     $self->{nc} == 0x0064) { # d
1766     ## ASCII case-insensitive.
1767     !!!cp (130);
1768     $self->{state} = MD_DOCTYPE_STATE;
1769 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1770 wakaba 1.1 !!!next-input-character;
1771     redo A;
1772 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1773     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1774     $self->{is_xml}) and
1775 wakaba 1.1 $self->{nc} == 0x005B) { # [
1776     !!!cp (135.4);
1777     $self->{state} = MD_CDATA_STATE;
1778 wakaba 1.12 $self->{kwd} = '[';
1779 wakaba 1.1 !!!next-input-character;
1780     redo A;
1781     } else {
1782     !!!cp (136);
1783     }
1784    
1785     !!!parse-error (type => 'bogus comment',
1786     line => $self->{line_prev},
1787     column => $self->{column_prev} - 1);
1788     ## Reconsume.
1789     $self->{state} = BOGUS_COMMENT_STATE;
1790     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1791     line => $self->{line_prev},
1792     column => $self->{column_prev} - 1,
1793     };
1794     redo A;
1795     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1796     if ($self->{nc} == 0x002D) { # -
1797     !!!cp (127);
1798     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799     line => $self->{line_prev},
1800     column => $self->{column_prev} - 2,
1801     };
1802 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1803 wakaba 1.1 !!!next-input-character;
1804     redo A;
1805     } else {
1806     !!!cp (128);
1807     !!!parse-error (type => 'bogus comment',
1808     line => $self->{line_prev},
1809     column => $self->{column_prev} - 2);
1810     $self->{state} = BOGUS_COMMENT_STATE;
1811     ## Reconsume.
1812     $self->{ct} = {type => COMMENT_TOKEN,
1813     data => '-',
1814     line => $self->{line_prev},
1815     column => $self->{column_prev} - 2,
1816     };
1817     redo A;
1818     }
1819     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1820     ## ASCII case-insensitive.
1821     if ($self->{nc} == [
1822     undef,
1823     0x004F, # O
1824     0x0043, # C
1825     0x0054, # T
1826     0x0059, # Y
1827     0x0050, # P
1828 wakaba 1.12 ]->[length $self->{kwd}] or
1829 wakaba 1.1 $self->{nc} == [
1830     undef,
1831     0x006F, # o
1832     0x0063, # c
1833     0x0074, # t
1834     0x0079, # y
1835     0x0070, # p
1836 wakaba 1.12 ]->[length $self->{kwd}]) {
1837 wakaba 1.1 !!!cp (131);
1838     ## Stay in the state.
1839 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1840 wakaba 1.1 !!!next-input-character;
1841     redo A;
1842 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1843 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1844     $self->{nc} == 0x0065)) { # e
1845 wakaba 1.12 if ($self->{is_xml} and
1846     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1847 wakaba 1.10 !!!cp (129);
1848     ## XML5: case-sensitive.
1849     !!!parse-error (type => 'lowercase keyword', ## TODO
1850     text => 'DOCTYPE',
1851     line => $self->{line_prev},
1852     column => $self->{column_prev} - 5);
1853     } else {
1854     !!!cp (129.1);
1855     }
1856 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1857     $self->{ct} = {type => DOCTYPE_TOKEN,
1858     quirks => 1,
1859     line => $self->{line_prev},
1860     column => $self->{column_prev} - 7,
1861     };
1862     !!!next-input-character;
1863     redo A;
1864     } else {
1865     !!!cp (132);
1866     !!!parse-error (type => 'bogus comment',
1867     line => $self->{line_prev},
1868 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1869 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1870     ## Reconsume.
1871     $self->{ct} = {type => COMMENT_TOKEN,
1872 wakaba 1.12 data => $self->{kwd},
1873 wakaba 1.1 line => $self->{line_prev},
1874 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1875 wakaba 1.1 };
1876     redo A;
1877     }
1878     } elsif ($self->{state} == MD_CDATA_STATE) {
1879     if ($self->{nc} == {
1880     '[' => 0x0043, # C
1881     '[C' => 0x0044, # D
1882     '[CD' => 0x0041, # A
1883     '[CDA' => 0x0054, # T
1884     '[CDAT' => 0x0041, # A
1885 wakaba 1.12 }->{$self->{kwd}}) {
1886 wakaba 1.1 !!!cp (135.1);
1887     ## Stay in the state.
1888 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1889 wakaba 1.1 !!!next-input-character;
1890     redo A;
1891 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1892 wakaba 1.1 $self->{nc} == 0x005B) { # [
1893 wakaba 1.6 if ($self->{is_xml} and
1894     not $self->{tainted} and
1895     @{$self->{open_elements} or []} == 0) {
1896 wakaba 1.8 !!!cp (135.2);
1897 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1898     line => $self->{line_prev},
1899     column => $self->{column_prev} - 7);
1900     $self->{tainted} = 1;
1901 wakaba 1.8 } else {
1902     !!!cp (135.21);
1903 wakaba 1.6 }
1904    
1905 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1906     data => '',
1907     line => $self->{line_prev},
1908     column => $self->{column_prev} - 7};
1909     $self->{state} = CDATA_SECTION_STATE;
1910     !!!next-input-character;
1911     redo A;
1912     } else {
1913     !!!cp (135.3);
1914     !!!parse-error (type => 'bogus comment',
1915     line => $self->{line_prev},
1916 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1917 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1918     ## Reconsume.
1919     $self->{ct} = {type => COMMENT_TOKEN,
1920 wakaba 1.12 data => $self->{kwd},
1921 wakaba 1.1 line => $self->{line_prev},
1922 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1923 wakaba 1.1 };
1924     redo A;
1925     }
1926     } elsif ($self->{state} == COMMENT_START_STATE) {
1927     if ($self->{nc} == 0x002D) { # -
1928     !!!cp (137);
1929     $self->{state} = COMMENT_START_DASH_STATE;
1930     !!!next-input-character;
1931     redo A;
1932     } elsif ($self->{nc} == 0x003E) { # >
1933     !!!parse-error (type => 'bogus comment');
1934 wakaba 1.13 if ($self->{in_subset}) {
1935     !!!cp (138.1);
1936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937     } else {
1938     !!!cp (138);
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     }
1942 wakaba 1.1 !!!next-input-character;
1943    
1944     !!!emit ($self->{ct}); # comment
1945    
1946     redo A;
1947     } elsif ($self->{nc} == -1) {
1948     !!!parse-error (type => 'unclosed comment');
1949 wakaba 1.13 if ($self->{in_subset}) {
1950     !!!cp (139.1);
1951     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1952     } else {
1953     !!!cp (139);
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     }
1957 wakaba 1.1 ## reconsume
1958    
1959     !!!emit ($self->{ct}); # comment
1960    
1961     redo A;
1962     } else {
1963     !!!cp (140);
1964     $self->{ct}->{data} # comment
1965     .= chr ($self->{nc});
1966     $self->{state} = COMMENT_STATE;
1967     !!!next-input-character;
1968     redo A;
1969     }
1970     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1971     if ($self->{nc} == 0x002D) { # -
1972     !!!cp (141);
1973     $self->{state} = COMMENT_END_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     } elsif ($self->{nc} == 0x003E) { # >
1977     !!!parse-error (type => 'bogus comment');
1978 wakaba 1.13 if ($self->{in_subset}) {
1979     !!!cp (142.1);
1980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981     } else {
1982     !!!cp (142);
1983     $self->{state} = DATA_STATE;
1984     $self->{s_kwd} = '';
1985     }
1986 wakaba 1.1 !!!next-input-character;
1987    
1988     !!!emit ($self->{ct}); # comment
1989    
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (143.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (143);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (144);
2008     $self->{ct}->{data} # comment
2009     .= '-' . chr ($self->{nc});
2010     $self->{state} = COMMENT_STATE;
2011     !!!next-input-character;
2012     redo A;
2013     }
2014     } elsif ($self->{state} == COMMENT_STATE) {
2015 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2016    
2017 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2018     !!!cp (145);
2019     $self->{state} = COMMENT_END_DASH_STATE;
2020     !!!next-input-character;
2021     redo A;
2022     } elsif ($self->{nc} == -1) {
2023     !!!parse-error (type => 'unclosed comment');
2024 wakaba 1.13 if ($self->{in_subset}) {
2025     !!!cp (146.1);
2026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2027     } else {
2028     !!!cp (146);
2029     $self->{state} = DATA_STATE;
2030     $self->{s_kwd} = '';
2031     }
2032 wakaba 1.1 ## reconsume
2033    
2034     !!!emit ($self->{ct}); # comment
2035    
2036     redo A;
2037     } else {
2038     !!!cp (147);
2039     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2040     $self->{read_until}->($self->{ct}->{data},
2041     q[-],
2042     length $self->{ct}->{data});
2043    
2044     ## Stay in the state
2045     !!!next-input-character;
2046     redo A;
2047     }
2048     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2049 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2050 wakaba 1.10
2051 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2052     !!!cp (148);
2053     $self->{state} = COMMENT_END_STATE;
2054     !!!next-input-character;
2055     redo A;
2056     } elsif ($self->{nc} == -1) {
2057     !!!parse-error (type => 'unclosed comment');
2058 wakaba 1.13 if ($self->{in_subset}) {
2059     !!!cp (149.1);
2060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061     } else {
2062     !!!cp (149);
2063     $self->{state} = DATA_STATE;
2064     $self->{s_kwd} = '';
2065     }
2066 wakaba 1.1 ## reconsume
2067    
2068     !!!emit ($self->{ct}); # comment
2069    
2070     redo A;
2071     } else {
2072     !!!cp (150);
2073     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2074     $self->{state} = COMMENT_STATE;
2075     !!!next-input-character;
2076     redo A;
2077     }
2078     } elsif ($self->{state} == COMMENT_END_STATE) {
2079 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2080    
2081 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2082 wakaba 1.13 if ($self->{in_subset}) {
2083     !!!cp (151.1);
2084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085     } else {
2086     !!!cp (151);
2087     $self->{state} = DATA_STATE;
2088     $self->{s_kwd} = '';
2089     }
2090 wakaba 1.1 !!!next-input-character;
2091    
2092     !!!emit ($self->{ct}); # comment
2093    
2094     redo A;
2095     } elsif ($self->{nc} == 0x002D) { # -
2096     !!!cp (152);
2097 wakaba 1.10 ## XML5: Not a parse error.
2098 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2099     line => $self->{line_prev},
2100     column => $self->{column_prev});
2101     $self->{ct}->{data} .= '-'; # comment
2102     ## Stay in the state
2103     !!!next-input-character;
2104     redo A;
2105     } elsif ($self->{nc} == -1) {
2106     !!!parse-error (type => 'unclosed comment');
2107 wakaba 1.13 if ($self->{in_subset}) {
2108     !!!cp (153.1);
2109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2110     } else {
2111     !!!cp (153);
2112     $self->{state} = DATA_STATE;
2113     $self->{s_kwd} = '';
2114     }
2115 wakaba 1.1 ## reconsume
2116    
2117     !!!emit ($self->{ct}); # comment
2118    
2119     redo A;
2120     } else {
2121     !!!cp (154);
2122     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2123     $self->{state} = COMMENT_STATE;
2124     !!!next-input-character;
2125     redo A;
2126     }
2127     } elsif ($self->{state} == DOCTYPE_STATE) {
2128     if ($is_space->{$self->{nc}}) {
2129     !!!cp (155);
2130     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2131     !!!next-input-character;
2132     redo A;
2133 wakaba 1.28 } elsif ($self->{nc} == -1) {
2134     !!!cp (155.1);
2135     !!!parse-error (type => 'unclosed DOCTYPE');
2136     $self->{ct}->{quirks} = 1;
2137    
2138     $self->{state} = DATA_STATE;
2139     ## Reconsume.
2140     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2141    
2142     redo A;
2143 wakaba 1.1 } else {
2144     !!!cp (156);
2145 wakaba 1.28 ## XML5: Switch to the bogus comment state.
2146 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2147     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2148     ## reconsume
2149     redo A;
2150     }
2151     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2152 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2153    
2154 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2155     !!!cp (157);
2156     ## Stay in the state
2157     !!!next-input-character;
2158     redo A;
2159     } elsif ($self->{nc} == 0x003E) { # >
2160     !!!cp (158);
2161 wakaba 1.12 ## XML5: No parse error.
2162 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2163     $self->{state} = DATA_STATE;
2164 wakaba 1.5 $self->{s_kwd} = '';
2165 wakaba 1.1 !!!next-input-character;
2166    
2167     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2168    
2169     redo A;
2170     } elsif ($self->{nc} == -1) {
2171     !!!cp (159);
2172     !!!parse-error (type => 'no DOCTYPE name');
2173     $self->{state} = DATA_STATE;
2174 wakaba 1.5 $self->{s_kwd} = '';
2175 wakaba 1.1 ## reconsume
2176    
2177     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2178    
2179     redo A;
2180 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2181     !!!cp (159.1);
2182     !!!parse-error (type => 'no DOCTYPE name');
2183     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2184 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2185     $self->{in_subset} = 1;
2186 wakaba 1.12 !!!next-input-character;
2187 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2188 wakaba 1.12 redo A;
2189 wakaba 1.1 } else {
2190     !!!cp (160);
2191     $self->{ct}->{name} = chr $self->{nc};
2192     delete $self->{ct}->{quirks};
2193     $self->{state} = DOCTYPE_NAME_STATE;
2194     !!!next-input-character;
2195     redo A;
2196     }
2197     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2198 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2199    
2200     ## ISSUE: Redundant "First," in the spec.
2201    
2202 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2203     !!!cp (161);
2204     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2205     !!!next-input-character;
2206     redo A;
2207     } elsif ($self->{nc} == 0x003E) { # >
2208     !!!cp (162);
2209     $self->{state} = DATA_STATE;
2210 wakaba 1.5 $self->{s_kwd} = '';
2211 wakaba 1.1 !!!next-input-character;
2212    
2213     !!!emit ($self->{ct}); # DOCTYPE
2214    
2215     redo A;
2216     } elsif ($self->{nc} == -1) {
2217     !!!cp (163);
2218     !!!parse-error (type => 'unclosed DOCTYPE');
2219     $self->{state} = DATA_STATE;
2220 wakaba 1.5 $self->{s_kwd} = '';
2221 wakaba 1.1 ## reconsume
2222    
2223     $self->{ct}->{quirks} = 1;
2224     !!!emit ($self->{ct}); # DOCTYPE
2225    
2226     redo A;
2227 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2228     !!!cp (163.1);
2229     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2230 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2231     $self->{in_subset} = 1;
2232 wakaba 1.12 !!!next-input-character;
2233 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2234 wakaba 1.12 redo A;
2235 wakaba 1.1 } else {
2236     !!!cp (164);
2237     $self->{ct}->{name}
2238     .= chr ($self->{nc}); # DOCTYPE
2239     ## Stay in the state
2240     !!!next-input-character;
2241     redo A;
2242     }
2243     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2244 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2245     ## state", but implemented differently.
2246    
2247 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2248     !!!cp (165);
2249     ## Stay in the state
2250     !!!next-input-character;
2251     redo A;
2252     } elsif ($self->{nc} == 0x003E) { # >
2253 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2254     !!!cp (166);
2255     $self->{state} = DATA_STATE;
2256     $self->{s_kwd} = '';
2257     } else {
2258     !!!cp (166.1);
2259     !!!parse-error (type => 'no md def'); ## TODO: type
2260     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2261     }
2262    
2263 wakaba 1.1 !!!next-input-character;
2264 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2265 wakaba 1.1 redo A;
2266     } elsif ($self->{nc} == -1) {
2267 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2268     !!!cp (167);
2269     !!!parse-error (type => 'unclosed DOCTYPE');
2270     $self->{state} = DATA_STATE;
2271     $self->{s_kwd} = '';
2272     $self->{ct}->{quirks} = 1;
2273     } else {
2274     !!!cp (167.12);
2275     !!!parse-error (type => 'unclosed md'); ## TODO: type
2276     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2277     }
2278    
2279     ## Reconsume.
2280     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2281 wakaba 1.1 redo A;
2282     } elsif ($self->{nc} == 0x0050 or # P
2283     $self->{nc} == 0x0070) { # p
2284 wakaba 1.12 !!!cp (167.1);
2285 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2286 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2287 wakaba 1.1 !!!next-input-character;
2288     redo A;
2289     } elsif ($self->{nc} == 0x0053 or # S
2290     $self->{nc} == 0x0073) { # s
2291 wakaba 1.12 !!!cp (167.2);
2292 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2293 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2294     !!!next-input-character;
2295     redo A;
2296 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2297     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2298     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2299     !!!cp (167.21);
2300     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2301     $self->{ct}->{value} = ''; # ENTITY
2302     !!!next-input-character;
2303     redo A;
2304     } elsif ($self->{nc} == 0x0027 and # '
2305     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2306     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2307     !!!cp (167.22);
2308     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2309     $self->{ct}->{value} = ''; # ENTITY
2310     !!!next-input-character;
2311     redo A;
2312 wakaba 1.16 } elsif ($self->{is_xml} and
2313     $self->{ct}->{type} == DOCTYPE_TOKEN and
2314     $self->{nc} == 0x005B) { # [
2315 wakaba 1.12 !!!cp (167.3);
2316     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2317     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2318 wakaba 1.13 $self->{in_subset} = 1;
2319 wakaba 1.1 !!!next-input-character;
2320 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2321 wakaba 1.1 redo A;
2322     } else {
2323 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2324    
2325     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2326     !!!cp (180);
2327     $self->{ct}->{quirks} = 1;
2328     $self->{state} = BOGUS_DOCTYPE_STATE;
2329     } else {
2330     !!!cp (180.1);
2331     $self->{state} = BOGUS_MD_STATE;
2332     }
2333 wakaba 1.1
2334     !!!next-input-character;
2335     redo A;
2336     }
2337     } elsif ($self->{state} == PUBLIC_STATE) {
2338     ## ASCII case-insensitive
2339     if ($self->{nc} == [
2340     undef,
2341     0x0055, # U
2342     0x0042, # B
2343     0x004C, # L
2344     0x0049, # I
2345 wakaba 1.12 ]->[length $self->{kwd}] or
2346 wakaba 1.1 $self->{nc} == [
2347     undef,
2348     0x0075, # u
2349     0x0062, # b
2350     0x006C, # l
2351     0x0069, # i
2352 wakaba 1.12 ]->[length $self->{kwd}]) {
2353 wakaba 1.1 !!!cp (175);
2354     ## Stay in the state.
2355 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2356 wakaba 1.1 !!!next-input-character;
2357     redo A;
2358 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2359 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2360     $self->{nc} == 0x0063)) { # c
2361 wakaba 1.12 if ($self->{is_xml} and
2362     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2363     !!!cp (168.1);
2364     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2365     text => 'PUBLIC',
2366     line => $self->{line_prev},
2367     column => $self->{column_prev} - 4);
2368     } else {
2369     !!!cp (168);
2370     }
2371 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2372     !!!next-input-character;
2373     redo A;
2374     } else {
2375 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2376 wakaba 1.1 line => $self->{line_prev},
2377 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2378 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2379     !!!cp (169);
2380     $self->{ct}->{quirks} = 1;
2381     $self->{state} = BOGUS_DOCTYPE_STATE;
2382     } else {
2383     !!!cp (169.1);
2384     $self->{state} = BOGUS_MD_STATE;
2385     }
2386 wakaba 1.1 ## Reconsume.
2387     redo A;
2388     }
2389     } elsif ($self->{state} == SYSTEM_STATE) {
2390     ## ASCII case-insensitive
2391     if ($self->{nc} == [
2392     undef,
2393     0x0059, # Y
2394     0x0053, # S
2395     0x0054, # T
2396     0x0045, # E
2397 wakaba 1.12 ]->[length $self->{kwd}] or
2398 wakaba 1.1 $self->{nc} == [
2399     undef,
2400     0x0079, # y
2401     0x0073, # s
2402     0x0074, # t
2403     0x0065, # e
2404 wakaba 1.12 ]->[length $self->{kwd}]) {
2405 wakaba 1.1 !!!cp (170);
2406     ## Stay in the state.
2407 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2408 wakaba 1.1 !!!next-input-character;
2409     redo A;
2410 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2411 wakaba 1.1 ($self->{nc} == 0x004D or # M
2412     $self->{nc} == 0x006D)) { # m
2413 wakaba 1.12 if ($self->{is_xml} and
2414     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2415     !!!cp (171.1);
2416     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2417     text => 'SYSTEM',
2418     line => $self->{line_prev},
2419     column => $self->{column_prev} - 4);
2420     } else {
2421     !!!cp (171);
2422     }
2423 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2424     !!!next-input-character;
2425     redo A;
2426     } else {
2427 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2428 wakaba 1.1 line => $self->{line_prev},
2429 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2430 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2431     !!!cp (172);
2432     $self->{ct}->{quirks} = 1;
2433     $self->{state} = BOGUS_DOCTYPE_STATE;
2434     } else {
2435     !!!cp (172.1);
2436     $self->{state} = BOGUS_MD_STATE;
2437     }
2438 wakaba 1.1 ## Reconsume.
2439     redo A;
2440     }
2441     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2442     if ($is_space->{$self->{nc}}) {
2443     !!!cp (181);
2444     ## Stay in the state
2445     !!!next-input-character;
2446     redo A;
2447     } elsif ($self->{nc} eq 0x0022) { # "
2448     !!!cp (182);
2449     $self->{ct}->{pubid} = ''; # DOCTYPE
2450     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2451     !!!next-input-character;
2452     redo A;
2453     } elsif ($self->{nc} eq 0x0027) { # '
2454     !!!cp (183);
2455     $self->{ct}->{pubid} = ''; # DOCTYPE
2456     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2457     !!!next-input-character;
2458     redo A;
2459     } elsif ($self->{nc} eq 0x003E) { # >
2460     !!!parse-error (type => 'no PUBLIC literal');
2461 wakaba 1.16
2462     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2463     !!!cp (184);
2464     $self->{state} = DATA_STATE;
2465     $self->{s_kwd} = '';
2466     $self->{ct}->{quirks} = 1;
2467     } else {
2468     !!!cp (184.1);
2469     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2470     }
2471    
2472 wakaba 1.1 !!!next-input-character;
2473 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2474 wakaba 1.1 redo A;
2475     } elsif ($self->{nc} == -1) {
2476 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2477     !!!cp (185);
2478     !!!parse-error (type => 'unclosed DOCTYPE');
2479     $self->{state} = DATA_STATE;
2480     $self->{s_kwd} = '';
2481     $self->{ct}->{quirks} = 1;
2482     } else {
2483     !!!cp (185.1);
2484     !!!parse-error (type => 'unclosed md'); ## TODO: type
2485     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2486     }
2487    
2488 wakaba 1.1 ## reconsume
2489     !!!emit ($self->{ct}); # DOCTYPE
2490     redo A;
2491 wakaba 1.16 } elsif ($self->{is_xml} and
2492     $self->{ct}->{type} == DOCTYPE_TOKEN and
2493     $self->{nc} == 0x005B) { # [
2494 wakaba 1.12 !!!cp (186.1);
2495     !!!parse-error (type => 'no PUBLIC literal');
2496     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2497     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2498 wakaba 1.13 $self->{in_subset} = 1;
2499 wakaba 1.12 !!!next-input-character;
2500 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2501 wakaba 1.12 redo A;
2502 wakaba 1.1 } else {
2503     !!!parse-error (type => 'string after PUBLIC');
2504    
2505 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2506     !!!cp (186);
2507     $self->{ct}->{quirks} = 1;
2508     $self->{state} = BOGUS_DOCTYPE_STATE;
2509     } else {
2510     !!!cp (186.2);
2511     $self->{state} = BOGUS_MD_STATE;
2512     }
2513    
2514 wakaba 1.1 !!!next-input-character;
2515     redo A;
2516     }
2517     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2518     if ($self->{nc} == 0x0022) { # "
2519     !!!cp (187);
2520     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2521     !!!next-input-character;
2522     redo A;
2523     } elsif ($self->{nc} == 0x003E) { # >
2524     !!!parse-error (type => 'unclosed PUBLIC literal');
2525    
2526 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2527     !!!cp (188);
2528     $self->{state} = DATA_STATE;
2529     $self->{s_kwd} = '';
2530     $self->{ct}->{quirks} = 1;
2531     } else {
2532     !!!cp (188.1);
2533     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2534     }
2535    
2536 wakaba 1.1 !!!next-input-character;
2537 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2538 wakaba 1.1 redo A;
2539     } elsif ($self->{nc} == -1) {
2540     !!!parse-error (type => 'unclosed PUBLIC literal');
2541    
2542 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2543     !!!cp (189);
2544     $self->{state} = DATA_STATE;
2545     $self->{s_kwd} = '';
2546     $self->{ct}->{quirks} = 1;
2547     } else {
2548     !!!cp (189.1);
2549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2550     }
2551    
2552     ## Reconsume.
2553 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2554     redo A;
2555     } else {
2556     !!!cp (190);
2557 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2558 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2559     length $self->{ct}->{pubid});
2560    
2561     ## Stay in the state
2562     !!!next-input-character;
2563     redo A;
2564     }
2565     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2566     if ($self->{nc} == 0x0027) { # '
2567     !!!cp (191);
2568     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2569     !!!next-input-character;
2570     redo A;
2571     } elsif ($self->{nc} == 0x003E) { # >
2572     !!!parse-error (type => 'unclosed PUBLIC literal');
2573    
2574 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2575     !!!cp (192);
2576     $self->{state} = DATA_STATE;
2577     $self->{s_kwd} = '';
2578     $self->{ct}->{quirks} = 1;
2579     } else {
2580     !!!cp (192.1);
2581     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2582     }
2583    
2584 wakaba 1.1 !!!next-input-character;
2585 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2586 wakaba 1.1 redo A;
2587     } elsif ($self->{nc} == -1) {
2588     !!!parse-error (type => 'unclosed PUBLIC literal');
2589    
2590 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2591     !!!cp (193);
2592     $self->{state} = DATA_STATE;
2593     $self->{s_kwd} = '';
2594     $self->{ct}->{quirks} = 1;
2595     } else {
2596     !!!cp (193.1);
2597     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2598     }
2599    
2600 wakaba 1.1 ## reconsume
2601 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2602 wakaba 1.1 redo A;
2603     } else {
2604     !!!cp (194);
2605 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2606 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2607     length $self->{ct}->{pubid});
2608    
2609     ## Stay in the state
2610     !!!next-input-character;
2611     redo A;
2612     }
2613     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2614     if ($is_space->{$self->{nc}}) {
2615     !!!cp (195);
2616     ## Stay in the state
2617     !!!next-input-character;
2618     redo A;
2619     } elsif ($self->{nc} == 0x0022) { # "
2620     !!!cp (196);
2621 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2622 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2623     !!!next-input-character;
2624     redo A;
2625     } elsif ($self->{nc} == 0x0027) { # '
2626     !!!cp (197);
2627 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2628 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2629     !!!next-input-character;
2630     redo A;
2631     } elsif ($self->{nc} == 0x003E) { # >
2632 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2633     if ($self->{is_xml}) {
2634     !!!cp (198.1);
2635     !!!parse-error (type => 'no SYSTEM literal');
2636     } else {
2637     !!!cp (198);
2638     }
2639     $self->{state} = DATA_STATE;
2640     $self->{s_kwd} = '';
2641 wakaba 1.12 } else {
2642 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2643     !!!cp (198.2);
2644     } else {
2645     !!!cp (198.3);
2646     !!!parse-error (type => 'no SYSTEM literal');
2647     }
2648     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2649 wakaba 1.12 }
2650 wakaba 1.16
2651 wakaba 1.1 !!!next-input-character;
2652 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2653 wakaba 1.1 redo A;
2654     } elsif ($self->{nc} == -1) {
2655 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2656     !!!cp (199);
2657     !!!parse-error (type => 'unclosed DOCTYPE');
2658    
2659     $self->{state} = DATA_STATE;
2660     $self->{s_kwd} = '';
2661     $self->{ct}->{quirks} = 1;
2662     } else {
2663     !!!parse-error (type => 'unclosed md'); ## TODO: type
2664     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2665     }
2666    
2667 wakaba 1.1 ## reconsume
2668 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2669 wakaba 1.1 redo A;
2670 wakaba 1.16 } elsif ($self->{is_xml} and
2671     $self->{ct}->{type} == DOCTYPE_TOKEN and
2672     $self->{nc} == 0x005B) { # [
2673 wakaba 1.12 !!!cp (200.1);
2674     !!!parse-error (type => 'no SYSTEM literal');
2675     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2676     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2677 wakaba 1.13 $self->{in_subset} = 1;
2678 wakaba 1.12 !!!next-input-character;
2679 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2680 wakaba 1.12 redo A;
2681 wakaba 1.1 } else {
2682     !!!parse-error (type => 'string after PUBLIC literal');
2683    
2684 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2685     !!!cp (200);
2686     $self->{ct}->{quirks} = 1;
2687     $self->{state} = BOGUS_DOCTYPE_STATE;
2688     } else {
2689     !!!cp (200.2);
2690     $self->{state} = BOGUS_MD_STATE;
2691     }
2692    
2693 wakaba 1.1 !!!next-input-character;
2694     redo A;
2695     }
2696     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2697     if ($is_space->{$self->{nc}}) {
2698     !!!cp (201);
2699     ## Stay in the state
2700     !!!next-input-character;
2701     redo A;
2702     } elsif ($self->{nc} == 0x0022) { # "
2703     !!!cp (202);
2704     $self->{ct}->{sysid} = ''; # DOCTYPE
2705     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2706     !!!next-input-character;
2707     redo A;
2708     } elsif ($self->{nc} == 0x0027) { # '
2709     !!!cp (203);
2710     $self->{ct}->{sysid} = ''; # DOCTYPE
2711     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2712     !!!next-input-character;
2713     redo A;
2714     } elsif ($self->{nc} == 0x003E) { # >
2715     !!!parse-error (type => 'no SYSTEM literal');
2716     !!!next-input-character;
2717    
2718 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2719     !!!cp (204);
2720     $self->{state} = DATA_STATE;
2721     $self->{s_kwd} = '';
2722     $self->{ct}->{quirks} = 1;
2723     } else {
2724     !!!cp (204.1);
2725     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2726     }
2727 wakaba 1.1
2728 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2729 wakaba 1.1 redo A;
2730     } elsif ($self->{nc} == -1) {
2731 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2732     !!!cp (205);
2733     !!!parse-error (type => 'unclosed DOCTYPE');
2734     $self->{state} = DATA_STATE;
2735     $self->{s_kwd} = '';
2736     $self->{ct}->{quirks} = 1;
2737     } else {
2738     !!!cp (205.1);
2739     !!!parse-error (type => 'unclosed md'); ## TODO: type
2740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2741     }
2742    
2743 wakaba 1.1 ## reconsume
2744 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2745 wakaba 1.1 redo A;
2746 wakaba 1.16 } elsif ($self->{is_xml} and
2747     $self->{ct}->{type} == DOCTYPE_TOKEN and
2748     $self->{nc} == 0x005B) { # [
2749 wakaba 1.12 !!!cp (206.1);
2750     !!!parse-error (type => 'no SYSTEM literal');
2751    
2752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2753     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2754 wakaba 1.13 $self->{in_subset} = 1;
2755 wakaba 1.12 !!!next-input-character;
2756 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2757 wakaba 1.12 redo A;
2758 wakaba 1.1 } else {
2759     !!!parse-error (type => 'string after SYSTEM');
2760    
2761 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2762     !!!cp (206);
2763     $self->{ct}->{quirks} = 1;
2764     $self->{state} = BOGUS_DOCTYPE_STATE;
2765     } else {
2766     !!!cp (206.2);
2767     $self->{state} = BOGUS_MD_STATE;
2768     }
2769    
2770 wakaba 1.1 !!!next-input-character;
2771     redo A;
2772     }
2773     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2774     if ($self->{nc} == 0x0022) { # "
2775     !!!cp (207);
2776     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2777     !!!next-input-character;
2778     redo A;
2779 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2780 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2781    
2782 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2783     !!!cp (208);
2784     $self->{state} = DATA_STATE;
2785     $self->{s_kwd} = '';
2786     $self->{ct}->{quirks} = 1;
2787     } else {
2788     !!!cp (208.1);
2789     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2790     }
2791    
2792 wakaba 1.1 !!!next-input-character;
2793 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2794 wakaba 1.1 redo A;
2795     } elsif ($self->{nc} == -1) {
2796     !!!parse-error (type => 'unclosed SYSTEM literal');
2797    
2798 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2799     !!!cp (209);
2800     $self->{state} = DATA_STATE;
2801     $self->{s_kwd} = '';
2802     $self->{ct}->{quirks} = 1;
2803     } else {
2804     !!!cp (209.1);
2805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806     }
2807    
2808 wakaba 1.1 ## reconsume
2809 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2810 wakaba 1.1 redo A;
2811     } else {
2812     !!!cp (210);
2813 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2814 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2815     length $self->{ct}->{sysid});
2816    
2817     ## Stay in the state
2818     !!!next-input-character;
2819     redo A;
2820     }
2821     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2822     if ($self->{nc} == 0x0027) { # '
2823     !!!cp (211);
2824     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2825     !!!next-input-character;
2826     redo A;
2827 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2828 wakaba 1.1 !!!cp (212);
2829     !!!parse-error (type => 'unclosed SYSTEM literal');
2830    
2831     $self->{state} = DATA_STATE;
2832 wakaba 1.5 $self->{s_kwd} = '';
2833 wakaba 1.1 !!!next-input-character;
2834    
2835     $self->{ct}->{quirks} = 1;
2836     !!!emit ($self->{ct}); # DOCTYPE
2837    
2838     redo A;
2839     } elsif ($self->{nc} == -1) {
2840     !!!parse-error (type => 'unclosed SYSTEM literal');
2841    
2842 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2843     !!!cp (213);
2844     $self->{state} = DATA_STATE;
2845     $self->{s_kwd} = '';
2846     $self->{ct}->{quirks} = 1;
2847     } else {
2848     !!!cp (213.1);
2849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850     }
2851    
2852 wakaba 1.1 ## reconsume
2853 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2854 wakaba 1.1 redo A;
2855     } else {
2856     !!!cp (214);
2857 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2858 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2859     length $self->{ct}->{sysid});
2860    
2861     ## Stay in the state
2862     !!!next-input-character;
2863     redo A;
2864     }
2865     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2866     if ($is_space->{$self->{nc}}) {
2867 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2868     !!!cp (215.1);
2869     $self->{state} = BEFORE_NDATA_STATE;
2870     } else {
2871     !!!cp (215);
2872     ## Stay in the state
2873     }
2874 wakaba 1.1 !!!next-input-character;
2875     redo A;
2876     } elsif ($self->{nc} == 0x003E) { # >
2877 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2878     !!!cp (216);
2879     $self->{state} = DATA_STATE;
2880     $self->{s_kwd} = '';
2881     } else {
2882     !!!cp (216.1);
2883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2884     }
2885    
2886 wakaba 1.1 !!!next-input-character;
2887 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2888 wakaba 1.1 redo A;
2889 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2890     ($self->{nc} == 0x004E or # N
2891     $self->{nc} == 0x006E)) { # n
2892     !!!cp (216.2);
2893     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2894     $self->{state} = NDATA_STATE;
2895     $self->{kwd} = chr $self->{nc};
2896     !!!next-input-character;
2897     redo A;
2898 wakaba 1.1 } elsif ($self->{nc} == -1) {
2899 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900     !!!cp (217);
2901     !!!parse-error (type => 'unclosed DOCTYPE');
2902     $self->{state} = DATA_STATE;
2903     $self->{s_kwd} = '';
2904     $self->{ct}->{quirks} = 1;
2905     } else {
2906     !!!cp (217.1);
2907     !!!parse-error (type => 'unclosed md'); ## TODO: type
2908     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2909     }
2910    
2911 wakaba 1.1 ## reconsume
2912 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2913 wakaba 1.1 redo A;
2914 wakaba 1.16 } elsif ($self->{is_xml} and
2915     $self->{ct}->{type} == DOCTYPE_TOKEN and
2916     $self->{nc} == 0x005B) { # [
2917 wakaba 1.12 !!!cp (218.1);
2918     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2919     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2920 wakaba 1.13 $self->{in_subset} = 1;
2921 wakaba 1.12 !!!next-input-character;
2922 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2923 wakaba 1.12 redo A;
2924 wakaba 1.1 } else {
2925     !!!parse-error (type => 'string after SYSTEM literal');
2926    
2927 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2928     !!!cp (218);
2929     #$self->{ct}->{quirks} = 1;
2930     $self->{state} = BOGUS_DOCTYPE_STATE;
2931     } else {
2932     !!!cp (218.2);
2933     $self->{state} = BOGUS_MD_STATE;
2934     }
2935    
2936 wakaba 1.1 !!!next-input-character;
2937     redo A;
2938     }
2939 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2940     if ($is_space->{$self->{nc}}) {
2941     !!!cp (218.3);
2942     ## Stay in the state.
2943     !!!next-input-character;
2944     redo A;
2945     } elsif ($self->{nc} == 0x003E) { # >
2946     !!!cp (218.4);
2947     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2948     !!!next-input-character;
2949     !!!emit ($self->{ct}); # ENTITY
2950     redo A;
2951     } elsif ($self->{nc} == 0x004E or # N
2952     $self->{nc} == 0x006E) { # n
2953     !!!cp (218.5);
2954     $self->{state} = NDATA_STATE;
2955     $self->{kwd} = chr $self->{nc};
2956     !!!next-input-character;
2957     redo A;
2958     } elsif ($self->{nc} == -1) {
2959     !!!cp (218.6);
2960     !!!parse-error (type => 'unclosed md'); ## TODO: type
2961     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2962     ## reconsume
2963     !!!emit ($self->{ct}); # ENTITY
2964     redo A;
2965     } else {
2966     !!!cp (218.7);
2967     !!!parse-error (type => 'string after SYSTEM literal');
2968     $self->{state} = BOGUS_MD_STATE;
2969     !!!next-input-character;
2970     redo A;
2971     }
2972 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2973     if ($self->{nc} == 0x003E) { # >
2974     !!!cp (219);
2975     $self->{state} = DATA_STATE;
2976 wakaba 1.5 $self->{s_kwd} = '';
2977 wakaba 1.1 !!!next-input-character;
2978    
2979     !!!emit ($self->{ct}); # DOCTYPE
2980    
2981     redo A;
2982 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2983 wakaba 1.13 !!!cp (220.1);
2984     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2985     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2986     $self->{in_subset} = 1;
2987     !!!next-input-character;
2988     !!!emit ($self->{ct}); # DOCTYPE
2989     redo A;
2990 wakaba 1.1 } elsif ($self->{nc} == -1) {
2991     !!!cp (220);
2992     $self->{state} = DATA_STATE;
2993 wakaba 1.5 $self->{s_kwd} = '';
2994 wakaba 1.1 ## reconsume
2995    
2996     !!!emit ($self->{ct}); # DOCTYPE
2997    
2998     redo A;
2999     } else {
3000     !!!cp (221);
3001     my $s = '';
3002 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3003 wakaba 1.1
3004     ## Stay in the state
3005     !!!next-input-character;
3006     redo A;
3007     }
3008     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3009     ## NOTE: The "CDATA section state" in the spec is jointly implemented
3010     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3011     ## and |CDATA_SECTION_MSE2_STATE|.
3012 wakaba 1.10
3013     ## XML5: "CDATA state".
3014 wakaba 1.1
3015     if ($self->{nc} == 0x005D) { # ]
3016     !!!cp (221.1);
3017     $self->{state} = CDATA_SECTION_MSE1_STATE;
3018     !!!next-input-character;
3019     redo A;
3020     } elsif ($self->{nc} == -1) {
3021 wakaba 1.6 if ($self->{is_xml}) {
3022 wakaba 1.8 !!!cp (221.11);
3023 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3024 wakaba 1.8 } else {
3025     !!!cp (221.12);
3026 wakaba 1.6 }
3027    
3028 wakaba 1.1 $self->{state} = DATA_STATE;
3029 wakaba 1.5 $self->{s_kwd} = '';
3030 wakaba 1.10 ## Reconsume.
3031 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3032     !!!cp (221.2);
3033     !!!emit ($self->{ct}); # character
3034     } else {
3035     !!!cp (221.3);
3036     ## No token to emit. $self->{ct} is discarded.
3037     }
3038     redo A;
3039     } else {
3040     !!!cp (221.4);
3041     $self->{ct}->{data} .= chr $self->{nc};
3042     $self->{read_until}->($self->{ct}->{data},
3043     q<]>,
3044     length $self->{ct}->{data});
3045    
3046     ## Stay in the state.
3047     !!!next-input-character;
3048     redo A;
3049     }
3050    
3051     ## ISSUE: "text tokens" in spec.
3052     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3053 wakaba 1.10 ## XML5: "CDATA bracket state".
3054    
3055 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3056     !!!cp (221.5);
3057     $self->{state} = CDATA_SECTION_MSE2_STATE;
3058     !!!next-input-character;
3059     redo A;
3060     } else {
3061     !!!cp (221.6);
3062 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3063 wakaba 1.1 $self->{ct}->{data} .= ']';
3064 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3065 wakaba 1.1 ## Reconsume.
3066     redo A;
3067     }
3068     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3069 wakaba 1.10 ## XML5: "CDATA end state".
3070    
3071 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3072     $self->{state} = DATA_STATE;
3073 wakaba 1.5 $self->{s_kwd} = '';
3074 wakaba 1.1 !!!next-input-character;
3075     if (length $self->{ct}->{data}) { # character
3076     !!!cp (221.7);
3077     !!!emit ($self->{ct}); # character
3078     } else {
3079     !!!cp (221.8);
3080     ## No token to emit. $self->{ct} is discarded.
3081     }
3082     redo A;
3083     } elsif ($self->{nc} == 0x005D) { # ]
3084     !!!cp (221.9); # character
3085     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3086     ## Stay in the state.
3087     !!!next-input-character;
3088     redo A;
3089     } else {
3090     !!!cp (221.11);
3091     $self->{ct}->{data} .= ']]'; # character
3092     $self->{state} = CDATA_SECTION_STATE;
3093 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3094 wakaba 1.1 redo A;
3095     }
3096     } elsif ($self->{state} == ENTITY_STATE) {
3097     if ($is_space->{$self->{nc}} or
3098     {
3099     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3100     $self->{entity_add} => 1,
3101     }->{$self->{nc}}) {
3102 wakaba 1.22 if ($self->{is_xml}) {
3103     !!!cp (1001.1);
3104     !!!parse-error (type => 'bare ero',
3105     line => $self->{line_prev},
3106     column => $self->{column_prev}
3107     + ($self->{nc} == -1 ? 1 : 0));
3108     } else {
3109     !!!cp (1001);
3110     ## No error
3111     }
3112 wakaba 1.1 ## Don't consume
3113     ## Return nothing.
3114     #
3115     } elsif ($self->{nc} == 0x0023) { # #
3116     !!!cp (999);
3117     $self->{state} = ENTITY_HASH_STATE;
3118 wakaba 1.12 $self->{kwd} = '#';
3119 wakaba 1.1 !!!next-input-character;
3120     redo A;
3121 wakaba 1.22 } elsif ($self->{is_xml} or
3122     (0x0041 <= $self->{nc} and
3123 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3124     (0x0061 <= $self->{nc} and
3125     $self->{nc} <= 0x007A)) { # a..z
3126     !!!cp (998);
3127     require Whatpm::_NamedEntityList;
3128     $self->{state} = ENTITY_NAME_STATE;
3129 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3130     $self->{entity__value} = $self->{kwd};
3131 wakaba 1.1 $self->{entity__match} = 0;
3132     !!!next-input-character;
3133     redo A;
3134     } else {
3135     !!!cp (1027);
3136     !!!parse-error (type => 'bare ero');
3137     ## Return nothing.
3138     #
3139     }
3140    
3141     ## NOTE: No character is consumed by the "consume a character
3142     ## reference" algorithm. In other words, there is an "&" character
3143     ## that does not introduce a character reference, which would be
3144     ## appended to the parent element or the attribute value by the later
3145     ## processing of the tokenizer.
3146    
3147     if ($self->{prev_state} == DATA_STATE) {
3148     !!!cp (997);
3149     $self->{state} = $self->{prev_state};
3150 wakaba 1.5 $self->{s_kwd} = '';
3151 wakaba 1.1 ## Reconsume.
3152     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3153     line => $self->{line_prev},
3154     column => $self->{column_prev},
3155     });
3156     redo A;
3157     } else {
3158     !!!cp (996);
3159     $self->{ca}->{value} .= '&';
3160     $self->{state} = $self->{prev_state};
3161 wakaba 1.5 $self->{s_kwd} = '';
3162 wakaba 1.1 ## Reconsume.
3163     redo A;
3164     }
3165     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3166 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3167 wakaba 1.1 !!!cp (995);
3168     $self->{state} = HEXREF_X_STATE;
3169 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3170 wakaba 1.1 !!!next-input-character;
3171     redo A;
3172 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3173     !!!cp (995.1);
3174     if ($self->{is_xml}) {
3175     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3176     }
3177     $self->{state} = HEXREF_X_STATE;
3178     $self->{kwd} .= chr $self->{nc};
3179     !!!next-input-character;
3180     redo A;
3181 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3182     $self->{nc} <= 0x0039) { # 0..9
3183     !!!cp (994);
3184     $self->{state} = NCR_NUM_STATE;
3185 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3186 wakaba 1.1 !!!next-input-character;
3187     redo A;
3188     } else {
3189     !!!parse-error (type => 'bare nero',
3190     line => $self->{line_prev},
3191     column => $self->{column_prev} - 1);
3192    
3193     ## NOTE: According to the spec algorithm, nothing is returned,
3194     ## and then "&#" is appended to the parent element or the attribute
3195     ## value in the later processing.
3196    
3197     if ($self->{prev_state} == DATA_STATE) {
3198     !!!cp (1019);
3199     $self->{state} = $self->{prev_state};
3200 wakaba 1.5 $self->{s_kwd} = '';
3201 wakaba 1.1 ## Reconsume.
3202     !!!emit ({type => CHARACTER_TOKEN,
3203     data => '&#',
3204     line => $self->{line_prev},
3205     column => $self->{column_prev} - 1,
3206     });
3207     redo A;
3208     } else {
3209     !!!cp (993);
3210     $self->{ca}->{value} .= '&#';
3211     $self->{state} = $self->{prev_state};
3212 wakaba 1.5 $self->{s_kwd} = '';
3213 wakaba 1.1 ## Reconsume.
3214     redo A;
3215     }
3216     }
3217     } elsif ($self->{state} == NCR_NUM_STATE) {
3218     if (0x0030 <= $self->{nc} and
3219     $self->{nc} <= 0x0039) { # 0..9
3220     !!!cp (1012);
3221 wakaba 1.12 $self->{kwd} *= 10;
3222     $self->{kwd} += $self->{nc} - 0x0030;
3223 wakaba 1.1
3224     ## Stay in the state.
3225     !!!next-input-character;
3226     redo A;
3227     } elsif ($self->{nc} == 0x003B) { # ;
3228     !!!cp (1013);
3229     !!!next-input-character;
3230     #
3231     } else {
3232     !!!cp (1014);
3233     !!!parse-error (type => 'no refc');
3234     ## Reconsume.
3235     #
3236     }
3237    
3238 wakaba 1.12 my $code = $self->{kwd};
3239 wakaba 1.1 my $l = $self->{line_prev};
3240     my $c = $self->{column_prev};
3241 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3242     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3243     ($self->{is_xml} and $code == 0x0000)) {
3244 wakaba 1.1 !!!cp (1015);
3245     !!!parse-error (type => 'invalid character reference',
3246     text => (sprintf 'U+%04X', $code),
3247     line => $l, column => $c);
3248     $code = $charref_map->{$code};
3249     } elsif ($code > 0x10FFFF) {
3250     !!!cp (1016);
3251     !!!parse-error (type => 'invalid character reference',
3252     text => (sprintf 'U-%08X', $code),
3253     line => $l, column => $c);
3254     $code = 0xFFFD;
3255     }
3256    
3257     if ($self->{prev_state} == DATA_STATE) {
3258     !!!cp (992);
3259     $self->{state} = $self->{prev_state};
3260 wakaba 1.5 $self->{s_kwd} = '';
3261 wakaba 1.1 ## Reconsume.
3262     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3263 wakaba 1.7 has_reference => 1,
3264 wakaba 1.1 line => $l, column => $c,
3265     });
3266     redo A;
3267     } else {
3268     !!!cp (991);
3269     $self->{ca}->{value} .= chr $code;
3270     $self->{ca}->{has_reference} = 1;
3271     $self->{state} = $self->{prev_state};
3272 wakaba 1.5 $self->{s_kwd} = '';
3273 wakaba 1.1 ## Reconsume.
3274     redo A;
3275     }
3276     } elsif ($self->{state} == HEXREF_X_STATE) {
3277     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3278     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3279     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3280     # 0..9, A..F, a..f
3281     !!!cp (990);
3282     $self->{state} = HEXREF_HEX_STATE;
3283 wakaba 1.12 $self->{kwd} = 0;
3284 wakaba 1.1 ## Reconsume.
3285     redo A;
3286     } else {
3287     !!!parse-error (type => 'bare hcro',
3288     line => $self->{line_prev},
3289     column => $self->{column_prev} - 2);
3290    
3291     ## NOTE: According to the spec algorithm, nothing is returned,
3292     ## and then "&#" followed by "X" or "x" is appended to the parent
3293     ## element or the attribute value in the later processing.
3294    
3295     if ($self->{prev_state} == DATA_STATE) {
3296     !!!cp (1005);
3297     $self->{state} = $self->{prev_state};
3298 wakaba 1.5 $self->{s_kwd} = '';
3299 wakaba 1.1 ## Reconsume.
3300     !!!emit ({type => CHARACTER_TOKEN,
3301 wakaba 1.12 data => '&' . $self->{kwd},
3302 wakaba 1.1 line => $self->{line_prev},
3303 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3304 wakaba 1.1 });
3305     redo A;
3306     } else {
3307     !!!cp (989);
3308 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3309 wakaba 1.1 $self->{state} = $self->{prev_state};
3310 wakaba 1.5 $self->{s_kwd} = '';
3311 wakaba 1.1 ## Reconsume.
3312     redo A;
3313     }
3314     }
3315     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3316     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3317     # 0..9
3318     !!!cp (1002);
3319 wakaba 1.12 $self->{kwd} *= 0x10;
3320     $self->{kwd} += $self->{nc} - 0x0030;
3321 wakaba 1.1 ## Stay in the state.
3322     !!!next-input-character;
3323     redo A;
3324     } elsif (0x0061 <= $self->{nc} and
3325     $self->{nc} <= 0x0066) { # a..f
3326     !!!cp (1003);
3327 wakaba 1.12 $self->{kwd} *= 0x10;
3328     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3329 wakaba 1.1 ## Stay in the state.
3330     !!!next-input-character;
3331     redo A;
3332     } elsif (0x0041 <= $self->{nc} and
3333     $self->{nc} <= 0x0046) { # A..F
3334     !!!cp (1004);
3335 wakaba 1.12 $self->{kwd} *= 0x10;
3336     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3337 wakaba 1.1 ## Stay in the state.
3338     !!!next-input-character;
3339     redo A;
3340     } elsif ($self->{nc} == 0x003B) { # ;
3341     !!!cp (1006);
3342     !!!next-input-character;
3343     #
3344     } else {
3345     !!!cp (1007);
3346     !!!parse-error (type => 'no refc',
3347     line => $self->{line},
3348     column => $self->{column});
3349     ## Reconsume.
3350     #
3351     }
3352    
3353 wakaba 1.12 my $code = $self->{kwd};
3354 wakaba 1.1 my $l = $self->{line_prev};
3355     my $c = $self->{column_prev};
3356 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3357     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3358     ($self->{is_xml} and $code == 0x0000)) {
3359 wakaba 1.1 !!!cp (1008);
3360     !!!parse-error (type => 'invalid character reference',
3361     text => (sprintf 'U+%04X', $code),
3362     line => $l, column => $c);
3363     $code = $charref_map->{$code};
3364     } elsif ($code > 0x10FFFF) {
3365     !!!cp (1009);
3366     !!!parse-error (type => 'invalid character reference',
3367     text => (sprintf 'U-%08X', $code),
3368     line => $l, column => $c);
3369     $code = 0xFFFD;
3370     }
3371    
3372     if ($self->{prev_state} == DATA_STATE) {
3373     !!!cp (988);
3374     $self->{state} = $self->{prev_state};
3375 wakaba 1.5 $self->{s_kwd} = '';
3376 wakaba 1.1 ## Reconsume.
3377     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3378 wakaba 1.7 has_reference => 1,
3379 wakaba 1.1 line => $l, column => $c,
3380     });
3381     redo A;
3382     } else {
3383     !!!cp (987);
3384     $self->{ca}->{value} .= chr $code;
3385     $self->{ca}->{has_reference} = 1;
3386     $self->{state} = $self->{prev_state};
3387 wakaba 1.5 $self->{s_kwd} = '';
3388 wakaba 1.1 ## Reconsume.
3389     redo A;
3390     }
3391     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3392 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3393     $self->{nc} <= 0x005A) or # Z
3394     (0x0061 <= $self->{nc} and # a
3395     $self->{nc} <= 0x007A) or # z
3396     (0x0030 <= $self->{nc} and # 0
3397     $self->{nc} <= 0x0039) or # 9
3398 wakaba 1.22 $self->{nc} == 0x003B or # ;
3399     ($self->{is_xml} and
3400     not ($is_space->{$self->{nc}} or
3401     {
3402     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3403     $self->{entity_add} => 1,
3404     }->{$self->{nc}}))) {
3405 wakaba 1.1 our $EntityChar;
3406 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3407 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3408     $self->{ge}->{$self->{kwd}}) {
3409 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3410 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3411     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3412     !!!cp (1020.1);
3413     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3414     } else {
3415     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3416     !!!cp (1020.2);
3417     !!!parse-error (type => 'unparsed entity', ## TODO: type
3418     value => $self->{kwd});
3419     } else {
3420     !!!cp (1020.3);
3421     }
3422     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3423     }
3424     } else {
3425     if ($self->{is_xml}) {
3426     !!!cp (1020.4);
3427     !!!parse-error (type => 'entity not declared', ## TODO: type
3428     value => $self->{kwd},
3429     level => {
3430     'amp;' => $self->{level}->{warn},
3431     'quot;' => $self->{level}->{warn},
3432     'lt;' => $self->{level}->{warn},
3433     'gt;' => $self->{level}->{warn},
3434     'apos;' => $self->{level}->{warn},
3435     }->{$self->{kwd}} ||
3436     $self->{level}->{must});
3437     } else {
3438     !!!cp (1020);
3439     }
3440     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3441     }
3442 wakaba 1.1 $self->{entity__match} = 1;
3443     !!!next-input-character;
3444     #
3445     } else {
3446     !!!cp (1021);
3447 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3448 wakaba 1.1 $self->{entity__match} = -1;
3449     ## Stay in the state.
3450     !!!next-input-character;
3451     redo A;
3452     }
3453     } else {
3454     !!!cp (1022);
3455     $self->{entity__value} .= chr $self->{nc};
3456     $self->{entity__match} *= 2;
3457     ## Stay in the state.
3458     !!!next-input-character;
3459     redo A;
3460     }
3461     }
3462    
3463     my $data;
3464     my $has_ref;
3465     if ($self->{entity__match} > 0) {
3466     !!!cp (1023);
3467     $data = $self->{entity__value};
3468     $has_ref = 1;
3469     #
3470     } elsif ($self->{entity__match} < 0) {
3471     !!!parse-error (type => 'no refc');
3472     if ($self->{prev_state} != DATA_STATE and # in attribute
3473     $self->{entity__match} < -1) {
3474     !!!cp (1024);
3475 wakaba 1.12 $data = '&' . $self->{kwd};
3476 wakaba 1.1 #
3477     } else {
3478     !!!cp (1025);
3479     $data = $self->{entity__value};
3480     $has_ref = 1;
3481     #
3482     }
3483     } else {
3484     !!!cp (1026);
3485     !!!parse-error (type => 'bare ero',
3486     line => $self->{line_prev},
3487 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3488     $data = '&' . $self->{kwd};
3489 wakaba 1.1 #
3490     }
3491    
3492     ## NOTE: In these cases, when a character reference is found,
3493     ## it is consumed and a character token is returned, or, otherwise,
3494     ## nothing is consumed and returned, according to the spec algorithm.
3495     ## In this implementation, anything that has been examined by the
3496     ## tokenizer is appended to the parent element or the attribute value
3497     ## as a string at this stage (the literal string when there is no
3498     ## character reference, or the entity-replaced string otherwise), since
3499     ## any characters that would not be consumed are appended in the data
3500     ## state or in an appropriate attribute value state anyway.
3501    
3502     if ($self->{prev_state} == DATA_STATE) {
3503     !!!cp (986);
3504     $self->{state} = $self->{prev_state};
3505 wakaba 1.5 $self->{s_kwd} = '';
3506 wakaba 1.1 ## Reconsume.
3507     !!!emit ({type => CHARACTER_TOKEN,
3508     data => $data,
3509 wakaba 1.7 has_reference => $has_ref,
3510 wakaba 1.1 line => $self->{line_prev},
3511 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3512 wakaba 1.1 });
3513     redo A;
3514     } else {
3515     !!!cp (985);
3516     $self->{ca}->{value} .= $data;
3517     $self->{ca}->{has_reference} = 1 if $has_ref;
3518     $self->{state} = $self->{prev_state};
3519 wakaba 1.5 $self->{s_kwd} = '';
3520 wakaba 1.1 ## Reconsume.
3521     redo A;
3522     }
3523 wakaba 1.8
3524     ## XML-only states
3525    
3526     } elsif ($self->{state} == PI_STATE) {
3527 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3528    
3529 wakaba 1.8 if ($is_space->{$self->{nc}} or
3530 wakaba 1.14 $self->{nc} == 0x003F or # ?
3531 wakaba 1.8 $self->{nc} == -1) {
3532 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3533     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3534     ## "DOCTYPE pi state": Parse error, switch to the "data
3535     ## state".
3536 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3537     line => $self->{line_prev},
3538     column => $self->{column_prev}
3539     - 1 * ($self->{nc} != -1));
3540     $self->{state} = BOGUS_COMMENT_STATE;
3541     ## Reconsume.
3542     $self->{ct} = {type => COMMENT_TOKEN,
3543     data => '?',
3544     line => $self->{line_prev},
3545     column => $self->{column_prev}
3546     - 1 * ($self->{nc} != -1),
3547     };
3548     redo A;
3549     } else {
3550 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3551 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3552     target => chr $self->{nc},
3553     data => '',
3554     line => $self->{line_prev},
3555     column => $self->{column_prev} - 1,
3556     };
3557     $self->{state} = PI_TARGET_STATE;
3558     !!!next-input-character;
3559     redo A;
3560     }
3561     } elsif ($self->{state} == PI_TARGET_STATE) {
3562     if ($is_space->{$self->{nc}}) {
3563     $self->{state} = PI_TARGET_AFTER_STATE;
3564     !!!next-input-character;
3565     redo A;
3566     } elsif ($self->{nc} == -1) {
3567     !!!parse-error (type => 'no pic'); ## TODO: type
3568 wakaba 1.13 if ($self->{in_subset}) {
3569     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3570     } else {
3571     $self->{state} = DATA_STATE;
3572     $self->{s_kwd} = '';
3573     }
3574 wakaba 1.8 ## Reconsume.
3575     !!!emit ($self->{ct}); # pi
3576     redo A;
3577     } elsif ($self->{nc} == 0x003F) { # ?
3578     $self->{state} = PI_AFTER_STATE;
3579     !!!next-input-character;
3580     redo A;
3581     } else {
3582     ## XML5: typo ("tag name" -> "target")
3583     $self->{ct}->{target} .= chr $self->{nc}; # pi
3584     !!!next-input-character;
3585     redo A;
3586     }
3587     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3588     if ($is_space->{$self->{nc}}) {
3589     ## Stay in the state.
3590     !!!next-input-character;
3591     redo A;
3592     } else {
3593     $self->{state} = PI_DATA_STATE;
3594     ## Reprocess.
3595     redo A;
3596     }
3597     } elsif ($self->{state} == PI_DATA_STATE) {
3598     if ($self->{nc} == 0x003F) { # ?
3599     $self->{state} = PI_DATA_AFTER_STATE;
3600     !!!next-input-character;
3601     redo A;
3602     } elsif ($self->{nc} == -1) {
3603     !!!parse-error (type => 'no pic'); ## TODO: type
3604 wakaba 1.13 if ($self->{in_subset}) {
3605 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3606 wakaba 1.13 } else {
3607     $self->{state} = DATA_STATE;
3608     $self->{s_kwd} = '';
3609     }
3610 wakaba 1.8 ## Reprocess.
3611     !!!emit ($self->{ct}); # pi
3612     redo A;
3613     } else {
3614     $self->{ct}->{data} .= chr $self->{nc}; # pi
3615     $self->{read_until}->($self->{ct}->{data}, q[?],
3616     length $self->{ct}->{data});
3617     ## Stay in the state.
3618     !!!next-input-character;
3619     ## Reprocess.
3620     redo A;
3621     }
3622     } elsif ($self->{state} == PI_AFTER_STATE) {
3623 wakaba 1.14 ## XML5: Part of "Pi after state".
3624    
3625 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3626 wakaba 1.13 if ($self->{in_subset}) {
3627     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3628     } else {
3629     $self->{state} = DATA_STATE;
3630     $self->{s_kwd} = '';
3631     }
3632 wakaba 1.8 !!!next-input-character;
3633     !!!emit ($self->{ct}); # pi
3634     redo A;
3635     } elsif ($self->{nc} == 0x003F) { # ?
3636     !!!parse-error (type => 'no s after target', ## TODO: type
3637     line => $self->{line_prev},
3638     column => $self->{column_prev}); ## XML5: no error
3639     $self->{ct}->{data} .= '?';
3640     $self->{state} = PI_DATA_AFTER_STATE;
3641     !!!next-input-character;
3642     redo A;
3643     } else {
3644     !!!parse-error (type => 'no s after target', ## TODO: type
3645     line => $self->{line_prev},
3646     column => $self->{column_prev}
3647     + 1 * ($self->{nc} == -1)); ## XML5: no error
3648     $self->{ct}->{data} .= '?'; ## XML5: not appended
3649     $self->{state} = PI_DATA_STATE;
3650     ## Reprocess.
3651     redo A;
3652     }
3653     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3654 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3655    
3656 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3657 wakaba 1.13 if ($self->{in_subset}) {
3658     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3659     } else {
3660     $self->{state} = DATA_STATE;
3661     $self->{s_kwd} = '';
3662     }
3663 wakaba 1.8 !!!next-input-character;
3664     !!!emit ($self->{ct}); # pi
3665     redo A;
3666     } elsif ($self->{nc} == 0x003F) { # ?
3667     $self->{ct}->{data} .= '?';
3668     ## Stay in the state.
3669     !!!next-input-character;
3670     redo A;
3671     } else {
3672     $self->{ct}->{data} .= '?'; ## XML5: not appended
3673     $self->{state} = PI_DATA_STATE;
3674     ## Reprocess.
3675     redo A;
3676     }
3677 wakaba 1.12
3678     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3679     if ($self->{nc} == 0x003C) { # <
3680 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3681 wakaba 1.12 !!!next-input-character;
3682     redo A;
3683     } elsif ($self->{nc} == 0x0025) { # %
3684     ## XML5: Not defined yet.
3685    
3686     ## TODO:
3687 wakaba 1.24
3688     if (not $self->{stop_processing} and
3689     not $self->{document}->xml_standalone) {
3690     !!!parse-error (type => 'stop processing', ## TODO: type
3691     level => $self->{level}->{info});
3692     $self->{stop_processing} = 1;
3693     }
3694    
3695 wakaba 1.12 !!!next-input-character;
3696     redo A;
3697     } elsif ($self->{nc} == 0x005D) { # ]
3698 wakaba 1.13 delete $self->{in_subset};
3699 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3700     !!!next-input-character;
3701     redo A;
3702     } elsif ($is_space->{$self->{nc}}) {
3703     ## Stay in the state.
3704     !!!next-input-character;
3705     redo A;
3706     } elsif ($self->{nc} == -1) {
3707     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3708 wakaba 1.13 delete $self->{in_subset};
3709 wakaba 1.12 $self->{state} = DATA_STATE;
3710     $self->{s_kwd} = '';
3711     ## Reconsume.
3712 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3713 wakaba 1.12 redo A;
3714     } else {
3715     unless ($self->{internal_subset_tainted}) {
3716     ## XML5: No parse error.
3717     !!!parse-error (type => 'string in internal subset');
3718     $self->{internal_subset_tainted} = 1;
3719     }
3720     ## Stay in the state.
3721     !!!next-input-character;
3722     redo A;
3723     }
3724     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3725     if ($self->{nc} == 0x003E) { # >
3726     $self->{state} = DATA_STATE;
3727     $self->{s_kwd} = '';
3728     !!!next-input-character;
3729 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3730 wakaba 1.12 redo A;
3731     } elsif ($self->{nc} == -1) {
3732     !!!parse-error (type => 'unclosed DOCTYPE');
3733     $self->{state} = DATA_STATE;
3734     $self->{s_kwd} = '';
3735     ## Reconsume.
3736 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3737 wakaba 1.12 redo A;
3738     } else {
3739     ## XML5: No parse error and stay in the state.
3740     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3741    
3742 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3743     !!!next-input-character;
3744     redo A;
3745     }
3746     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3747     if ($self->{nc} == 0x003E) { # >
3748     $self->{state} = DATA_STATE;
3749     $self->{s_kwd} = '';
3750     !!!next-input-character;
3751     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3752     redo A;
3753     } elsif ($self->{nc} == -1) {
3754     $self->{state} = DATA_STATE;
3755     $self->{s_kwd} = '';
3756     ## Reconsume.
3757     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3758     redo A;
3759     } else {
3760     ## Stay in the state.
3761     !!!next-input-character;
3762     redo A;
3763     }
3764     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3765     if ($self->{nc} == 0x0021) { # !
3766 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3767 wakaba 1.13 !!!next-input-character;
3768     redo A;
3769     } elsif ($self->{nc} == 0x003F) { # ?
3770     $self->{state} = PI_STATE;
3771     !!!next-input-character;
3772     redo A;
3773     } elsif ($self->{nc} == -1) {
3774     !!!parse-error (type => 'bare stago');
3775     $self->{state} = DATA_STATE;
3776     $self->{s_kwd} = '';
3777     ## Reconsume.
3778     redo A;
3779     } else {
3780     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3781     line => $self->{line_prev},
3782     column => $self->{column_prev});
3783     $self->{state} = BOGUS_COMMENT_STATE;
3784     $self->{ct} = {type => COMMENT_TOKEN,
3785     data => '',
3786     }; ## NOTE: Will be discarded.
3787 wakaba 1.12 !!!next-input-character;
3788     redo A;
3789     }
3790 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3791     ## XML5: "DOCTYPE markup declaration state".
3792    
3793     if ($self->{nc} == 0x002D) { # -
3794     $self->{state} = MD_HYPHEN_STATE;
3795     !!!next-input-character;
3796     redo A;
3797 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3798     $self->{nc} == 0x0065) { # e
3799 wakaba 1.14 $self->{state} = MD_E_STATE;
3800     $self->{kwd} = chr $self->{nc};
3801     !!!next-input-character;
3802     redo A;
3803 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3804     $self->{nc} == 0x0061) { # a
3805 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3806     $self->{kwd} = chr $self->{nc};
3807     !!!next-input-character;
3808     redo A;
3809 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3810     $self->{nc} == 0x006E) { # n
3811 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3812     $self->{kwd} = chr $self->{nc};
3813     !!!next-input-character;
3814     redo A;
3815     } else {
3816     #
3817     }
3818    
3819     ## XML5: No parse error.
3820     !!!parse-error (type => 'bogus comment',
3821     line => $self->{line_prev},
3822     column => $self->{column_prev} - 1);
3823     ## Reconsume.
3824     $self->{state} = BOGUS_COMMENT_STATE;
3825     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3826     redo A;
3827     } elsif ($self->{state} == MD_E_STATE) {
3828 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3829     $self->{nc} == 0x006E) { # n
3830 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3831     $self->{kwd} .= chr $self->{nc};
3832     !!!next-input-character;
3833     redo A;
3834 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3835     $self->{nc} == 0x006C) { # l
3836 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3837     $self->{state} = MD_ELEMENT_STATE;
3838     $self->{kwd} .= chr $self->{nc};
3839     !!!next-input-character;
3840     redo A;
3841     } else {
3842     ## XML5: No parse error.
3843     !!!parse-error (type => 'bogus comment',
3844     line => $self->{line_prev},
3845     column => $self->{column_prev} - 2
3846     + 1 * ($self->{nc} == -1));
3847     ## Reconsume.
3848     $self->{state} = BOGUS_COMMENT_STATE;
3849     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3850     redo A;
3851     }
3852     } elsif ($self->{state} == MD_ENTITY_STATE) {
3853 wakaba 1.17 if ($self->{nc} == [
3854     undef,
3855     undef,
3856     0x0054, # T
3857     0x0049, # I
3858     0x0054, # T
3859     ]->[length $self->{kwd}] or
3860     $self->{nc} == [
3861     undef,
3862     undef,
3863     0x0074, # t
3864     0x0069, # i
3865     0x0074, # t
3866     ]->[length $self->{kwd}]) {
3867 wakaba 1.14 ## Stay in the state.
3868     $self->{kwd} .= chr $self->{nc};
3869     !!!next-input-character;
3870     redo A;
3871 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3872     ($self->{nc} == 0x0059 or # Y
3873     $self->{nc} == 0x0079)) { # y
3874     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3875     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3876     text => 'ENTITY',
3877     line => $self->{line_prev},
3878     column => $self->{column_prev} - 4);
3879     }
3880     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3881 wakaba 1.14 line => $self->{line_prev},
3882     column => $self->{column_prev} - 6};
3883     $self->{state} = DOCTYPE_MD_STATE;
3884     !!!next-input-character;
3885     redo A;
3886     } else {
3887     !!!parse-error (type => 'bogus comment',
3888     line => $self->{line_prev},
3889     column => $self->{column_prev} - 1
3890     - (length $self->{kwd})
3891     + 1 * ($self->{nc} == -1));
3892     $self->{state} = BOGUS_COMMENT_STATE;
3893     ## Reconsume.
3894     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3895     redo A;
3896     }
3897     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3898 wakaba 1.17 if ($self->{nc} == [
3899     undef,
3900     undef,
3901     0x0045, # E
3902     0x004D, # M
3903     0x0045, # E
3904     0x004E, # N
3905     ]->[length $self->{kwd}] or
3906     $self->{nc} == [
3907     undef,
3908     undef,
3909     0x0065, # e
3910     0x006D, # m
3911     0x0065, # e
3912     0x006E, # n
3913     ]->[length $self->{kwd}]) {
3914 wakaba 1.14 ## Stay in the state.
3915     $self->{kwd} .= chr $self->{nc};
3916     !!!next-input-character;
3917     redo A;
3918 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3919     ($self->{nc} == 0x0054 or # T
3920     $self->{nc} == 0x0074)) { # t
3921     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3922     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3923     text => 'ELEMENT',
3924     line => $self->{line_prev},
3925     column => $self->{column_prev} - 5);
3926     }
3927 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3928     line => $self->{line_prev},
3929 wakaba 1.23 column => $self->{column_prev} - 7};
3930 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3931     !!!next-input-character;
3932     redo A;
3933     } else {
3934     !!!parse-error (type => 'bogus comment',
3935     line => $self->{line_prev},
3936     column => $self->{column_prev} - 1
3937     - (length $self->{kwd})
3938     + 1 * ($self->{nc} == -1));
3939     $self->{state} = BOGUS_COMMENT_STATE;
3940     ## Reconsume.
3941     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3942     redo A;
3943     }
3944     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3945 wakaba 1.17 if ($self->{nc} == [
3946     undef,
3947     0x0054, # T
3948     0x0054, # T
3949     0x004C, # L
3950     0x0049, # I
3951     0x0053, # S
3952     ]->[length $self->{kwd}] or
3953     $self->{nc} == [
3954     undef,
3955     0x0074, # t
3956     0x0074, # t
3957     0x006C, # l
3958     0x0069, # i
3959     0x0073, # s
3960     ]->[length $self->{kwd}]) {
3961 wakaba 1.14 ## Stay in the state.
3962     $self->{kwd} .= chr $self->{nc};
3963     !!!next-input-character;
3964     redo A;
3965 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3966     ($self->{nc} == 0x0054 or # T
3967     $self->{nc} == 0x0074)) { # t
3968     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3969     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3970     text => 'ATTLIST',
3971     line => $self->{line_prev},
3972     column => $self->{column_prev} - 5);
3973     }
3974 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3975 wakaba 1.15 attrdefs => [],
3976 wakaba 1.14 line => $self->{line_prev},
3977 wakaba 1.23 column => $self->{column_prev} - 7};
3978 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3979     !!!next-input-character;
3980     redo A;
3981     } else {
3982     !!!parse-error (type => 'bogus comment',
3983     line => $self->{line_prev},
3984     column => $self->{column_prev} - 1
3985     - (length $self->{kwd})
3986     + 1 * ($self->{nc} == -1));
3987     $self->{state} = BOGUS_COMMENT_STATE;
3988     ## Reconsume.
3989     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3990     redo A;
3991     }
3992     } elsif ($self->{state} == MD_NOTATION_STATE) {
3993 wakaba 1.17 if ($self->{nc} == [
3994     undef,
3995     0x004F, # O
3996     0x0054, # T
3997     0x0041, # A
3998     0x0054, # T
3999     0x0049, # I
4000     0x004F, # O
4001     ]->[length $self->{kwd}] or
4002     $self->{nc} == [
4003     undef,
4004     0x006F, # o
4005     0x0074, # t
4006     0x0061, # a
4007     0x0074, # t
4008     0x0069, # i
4009     0x006F, # o
4010     ]->[length $self->{kwd}]) {
4011 wakaba 1.14 ## Stay in the state.
4012     $self->{kwd} .= chr $self->{nc};
4013     !!!next-input-character;
4014     redo A;
4015 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4016     ($self->{nc} == 0x004E or # N
4017     $self->{nc} == 0x006E)) { # n
4018     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4019     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4020     text => 'NOTATION',
4021     line => $self->{line_prev},
4022     column => $self->{column_prev} - 6);
4023     }
4024 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4025     line => $self->{line_prev},
4026 wakaba 1.23 column => $self->{column_prev} - 8};
4027 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4028     !!!next-input-character;
4029     redo A;
4030     } else {
4031     !!!parse-error (type => 'bogus comment',
4032     line => $self->{line_prev},
4033     column => $self->{column_prev} - 1
4034     - (length $self->{kwd})
4035     + 1 * ($self->{nc} == -1));
4036     $self->{state} = BOGUS_COMMENT_STATE;
4037     ## Reconsume.
4038     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4039     redo A;
4040     }
4041     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4042     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4043     ## "DOCTYPE NOTATION state".
4044    
4045     if ($is_space->{$self->{nc}}) {
4046     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4047     $self->{state} = BEFORE_MD_NAME_STATE;
4048     !!!next-input-character;
4049     redo A;
4050     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4051     $self->{nc} == 0x0025) { # %
4052     ## XML5: Switch to the "DOCTYPE bogus comment state".
4053     !!!parse-error (type => 'no space before md name'); ## TODO: type
4054     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4055     !!!next-input-character;
4056     redo A;
4057     } elsif ($self->{nc} == -1) {
4058     !!!parse-error (type => 'unclosed md'); ## TODO: type
4059     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4060     ## Reconsume.
4061     redo A;
4062     } elsif ($self->{nc} == 0x003E) { # >
4063     ## XML5: Switch to the "DOCTYPE bogus comment state".
4064     !!!parse-error (type => 'no md name'); ## TODO: type
4065     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066     !!!next-input-character;
4067     redo A;
4068     } else {
4069     ## XML5: Switch to the "DOCTYPE bogus comment state".
4070     !!!parse-error (type => 'no space before md name'); ## TODO: type
4071     $self->{state} = BEFORE_MD_NAME_STATE;
4072     redo A;
4073     }
4074     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4075     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4076     ## before state", "DOCTYPE ATTLIST name before state".
4077    
4078     if ($is_space->{$self->{nc}}) {
4079     ## Stay in the state.
4080     !!!next-input-character;
4081     redo A;
4082     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4083     $self->{nc} == 0x0025) { # %
4084     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4085     !!!next-input-character;
4086     redo A;
4087     } elsif ($self->{nc} == 0x003E) { # >
4088     ## XML5: Same as "Anything else".
4089     !!!parse-error (type => 'no md name'); ## TODO: type
4090     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4091     !!!next-input-character;
4092     redo A;
4093     } elsif ($self->{nc} == -1) {
4094     !!!parse-error (type => 'unclosed md'); ## TODO: type
4095     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4096     ## Reconsume.
4097     redo A;
4098     } else {
4099     ## XML5: [ATTLIST] Not defined yet.
4100     $self->{ct}->{name} .= chr $self->{nc};
4101     $self->{state} = MD_NAME_STATE;
4102     !!!next-input-character;
4103     redo A;
4104     }
4105     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4106     if ($is_space->{$self->{nc}}) {
4107     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4108     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4109     $self->{state} = BEFORE_MD_NAME_STATE;
4110     !!!next-input-character;
4111     redo A;
4112     } elsif ($self->{nc} == 0x003E) { # >
4113     ## XML5: Same as "Anything else".
4114     !!!parse-error (type => 'no md name'); ## TODO: type
4115     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4116     !!!next-input-character;
4117     redo A;
4118     } elsif ($self->{nc} == -1) {
4119     !!!parse-error (type => 'unclosed md');
4120     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4121     ## Reconsume.
4122     redo A;
4123     } else {
4124     ## XML5: No parse error.
4125     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4126     $self->{state} = BOGUS_COMMENT_STATE;
4127     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4128     ## Reconsume.
4129     redo A;
4130     }
4131     } elsif ($self->{state} == MD_NAME_STATE) {
4132     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4133    
4134     if ($is_space->{$self->{nc}}) {
4135 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4136     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4137     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4138 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4139 wakaba 1.16 } else { # ENTITY/NOTATION
4140     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4141     }
4142 wakaba 1.14 !!!next-input-character;
4143     redo A;
4144     } elsif ($self->{nc} == 0x003E) { # >
4145     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4146     #
4147     } else {
4148 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4149 wakaba 1.14 }
4150     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4151     !!!next-input-character;
4152     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4153     redo A;
4154     } elsif ($self->{nc} == -1) {
4155     ## XML5: [ATTLIST] No parse error.
4156     !!!parse-error (type => 'unclosed md');
4157     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4158     ## Reconsume.
4159     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4160     redo A;
4161     } else {
4162     ## XML5: [ATTLIST] Not defined yet.
4163     $self->{ct}->{name} .= chr $self->{nc};
4164     ## Stay in the state.
4165     !!!next-input-character;
4166     redo A;
4167     }
4168     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4169     if ($is_space->{$self->{nc}}) {
4170     ## Stay in the state.
4171     !!!next-input-character;
4172     redo A;
4173     } elsif ($self->{nc} == 0x003E) { # >
4174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4175     !!!next-input-character;
4176     !!!emit ($self->{ct}); # ATTLIST
4177     redo A;
4178     } elsif ($self->{nc} == -1) {
4179     ## XML5: No parse error.
4180     !!!parse-error (type => 'unclosed md'); ## TODO: type
4181     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4182 wakaba 1.15 !!!emit ($self->{ct});
4183     redo A;
4184     } else {
4185     ## XML5: Not defined yet.
4186     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4187     tokens => [],
4188     line => $self->{line}, column => $self->{column}};
4189     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4190     !!!next-input-character;
4191     redo A;
4192     }
4193     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4194     if ($is_space->{$self->{nc}}) {
4195     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4196     !!!next-input-character;
4197     redo A;
4198     } elsif ($self->{nc} == 0x003E) { # >
4199     ## XML5: Same as "anything else".
4200     !!!parse-error (type => 'no attr type'); ## TODO: type
4201     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4202     !!!next-input-character;
4203     !!!emit ($self->{ct}); # ATTLIST
4204     redo A;
4205     } elsif ($self->{nc} == 0x0028) { # (
4206     ## XML5: Same as "anything else".
4207     !!!parse-error (type => 'no space before paren'); ## TODO: type
4208     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4209     !!!next-input-character;
4210     redo A;
4211     } elsif ($self->{nc} == -1) {
4212     ## XML5: No parse error.
4213     !!!parse-error (type => 'unclosed md'); ## TODO: type
4214     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4215     !!!next-input-character;
4216     !!!emit ($self->{ct}); # ATTLIST
4217     redo A;
4218     } else {
4219     ## XML5: Not defined yet.
4220     $self->{ca}->{name} .= chr $self->{nc};
4221     ## Stay in the state.
4222     !!!next-input-character;
4223     redo A;
4224     }
4225     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4226     if ($is_space->{$self->{nc}}) {
4227     ## Stay in the state.
4228     !!!next-input-character;
4229     redo A;
4230     } elsif ($self->{nc} == 0x003E) { # >
4231     ## XML5: Same as "anything else".
4232     !!!parse-error (type => 'no attr type'); ## TODO: type
4233     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234     !!!next-input-character;
4235     !!!emit ($self->{ct}); # ATTLIST
4236     redo A;
4237     } elsif ($self->{nc} == 0x0028) { # (
4238     ## XML5: Same as "anything else".
4239     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4240     !!!next-input-character;
4241     redo A;
4242     } elsif ($self->{nc} == -1) {
4243     ## XML5: No parse error.
4244     !!!parse-error (type => 'unclosed md'); ## TODO: type
4245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4246     !!!next-input-character;
4247     !!!emit ($self->{ct});
4248 wakaba 1.14 redo A;
4249     } else {
4250     ## XML5: Not defined yet.
4251 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4252     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4253     !!!next-input-character;
4254     redo A;
4255     }
4256     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4257     if ($is_space->{$self->{nc}}) {
4258     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4259     !!!next-input-character;
4260     redo A;
4261     } elsif ($self->{nc} == 0x0023) { # #
4262     ## XML5: Same as "anything else".
4263     !!!parse-error (type => 'no space before default value'); ## TODO: type
4264     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4265     !!!next-input-character;
4266     redo A;
4267     } elsif ($self->{nc} == 0x0022) { # "
4268     ## XML5: Same as "anything else".
4269     !!!parse-error (type => 'no space before default value'); ## TODO: type
4270     $self->{ca}->{value} = '';
4271     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4272     !!!next-input-character;
4273     redo A;
4274     } elsif ($self->{nc} == 0x0027) { # '
4275     ## XML5: Same as "anything else".
4276     !!!parse-error (type => 'no space before default value'); ## TODO: type
4277     $self->{ca}->{value} = '';
4278     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4279     !!!next-input-character;
4280     redo A;
4281     } elsif ($self->{nc} == 0x003E) { # >
4282     ## XML5: Same as "anything else".
4283     !!!parse-error (type => 'no attr default'); ## TODO: type
4284     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4285     !!!next-input-character;
4286     !!!emit ($self->{ct}); # ATTLIST
4287     redo A;
4288     } elsif ($self->{nc} == 0x0028) { # (
4289     ## XML5: Same as "anything else".
4290     !!!parse-error (type => 'no space before paren'); ## TODO: type
4291     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4292     !!!next-input-character;
4293     redo A;
4294     } elsif ($self->{nc} == -1) {
4295     ## XML5: No parse error.
4296     !!!parse-error (type => 'unclosed md'); ## TODO: type
4297     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4298     !!!next-input-character;
4299     !!!emit ($self->{ct});
4300     redo A;
4301     } else {
4302     ## XML5: Not defined yet.
4303     $self->{ca}->{type} .= chr $self->{nc};
4304     ## Stay in the state.
4305     !!!next-input-character;
4306     redo A;
4307     }
4308     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4309     if ($is_space->{$self->{nc}}) {
4310     ## Stay in the state.
4311     !!!next-input-character;
4312     redo A;
4313     } elsif ($self->{nc} == 0x0028) { # (
4314     ## XML5: Same as "anything else".
4315     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4316     !!!next-input-character;
4317     redo A;
4318     } elsif ($self->{nc} == 0x0023) { # #
4319     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4320     !!!next-input-character;
4321     redo A;
4322     } elsif ($self->{nc} == 0x0022) { # "
4323     ## XML5: Same as "anything else".
4324     $self->{ca}->{value} = '';
4325     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4326     !!!next-input-character;
4327     redo A;
4328     } elsif ($self->{nc} == 0x0027) { # '
4329     ## XML5: Same as "anything else".
4330     $self->{ca}->{value} = '';
4331     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4332     !!!next-input-character;
4333     redo A;
4334     } elsif ($self->{nc} == 0x003E) { # >
4335     ## XML5: Same as "anything else".
4336     !!!parse-error (type => 'no attr default'); ## TODO: type
4337     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4338     !!!next-input-character;
4339     !!!emit ($self->{ct}); # ATTLIST
4340     redo A;
4341     } elsif ($self->{nc} == -1) {
4342     ## XML5: No parse error.
4343     !!!parse-error (type => 'unclosed md'); ## TODO: type
4344     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4345     !!!next-input-character;
4346     !!!emit ($self->{ct});
4347     redo A;
4348     } else {
4349     ## XML5: Switch to the "DOCTYPE bogus comment state".
4350     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4351     $self->{ca}->{value} = '';
4352     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4353     ## Reconsume.
4354     redo A;
4355     }
4356     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4357     if ($is_space->{$self->{nc}}) {
4358     ## Stay in the state.
4359     !!!next-input-character;
4360     redo A;
4361     } elsif ($self->{nc} == 0x007C) { # |
4362     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4363     ## Stay in the state.
4364     !!!next-input-character;
4365     redo A;
4366     } elsif ($self->{nc} == 0x0029) { # )
4367     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4368     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4369     !!!next-input-character;
4370     redo A;
4371     } elsif ($self->{nc} == 0x003E) { # >
4372     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4373     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4374     !!!next-input-character;
4375     !!!emit ($self->{ct}); # ATTLIST
4376     redo A;
4377     } elsif ($self->{nc} == -1) {
4378     ## XML5: No parse error.
4379     !!!parse-error (type => 'unclosed md'); ## TODO: type
4380     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4381     !!!next-input-character;
4382     !!!emit ($self->{ct});
4383     redo A;
4384     } else {
4385     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4386     $self->{state} = ALLOWED_TOKEN_STATE;
4387     !!!next-input-character;
4388     redo A;
4389     }
4390     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4391     if ($is_space->{$self->{nc}}) {
4392     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4393     !!!next-input-character;
4394     redo A;
4395     } elsif ($self->{nc} == 0x007C) { # |
4396     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4397     !!!next-input-character;
4398     redo A;
4399     } elsif ($self->{nc} == 0x0029) { # )
4400     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4401     !!!next-input-character;
4402     redo A;
4403     } elsif ($self->{nc} == 0x003E) { # >
4404     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4406     !!!next-input-character;
4407     !!!emit ($self->{ct}); # ATTLIST
4408     redo A;
4409     } elsif ($self->{nc} == -1) {
4410     ## XML5: No parse error.
4411     !!!parse-error (type => 'unclosed md'); ## TODO: type
4412     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4413     !!!next-input-character;
4414     !!!emit ($self->{ct});
4415     redo A;
4416     } else {
4417     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4418     ## Stay in the state.
4419     !!!next-input-character;
4420     redo A;
4421     }
4422     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4423     if ($is_space->{$self->{nc}}) {
4424     ## Stay in the state.
4425     !!!next-input-character;
4426     redo A;
4427     } elsif ($self->{nc} == 0x007C) { # |
4428     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4429     !!!next-input-character;
4430     redo A;
4431     } elsif ($self->{nc} == 0x0029) { # )
4432     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4433     !!!next-input-character;
4434     redo A;
4435     } elsif ($self->{nc} == 0x003E) { # >
4436     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4437     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4438     !!!next-input-character;
4439     !!!emit ($self->{ct}); # ATTLIST
4440     redo A;
4441     } elsif ($self->{nc} == -1) {
4442     ## XML5: No parse error.
4443     !!!parse-error (type => 'unclosed md'); ## TODO: type
4444     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4445     !!!next-input-character;
4446     !!!emit ($self->{ct});
4447     redo A;
4448     } else {
4449     !!!parse-error (type => 'space in allowed token', ## TODO: type
4450     line => $self->{line_prev},
4451     column => $self->{column_prev});
4452     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4453     $self->{state} = ALLOWED_TOKEN_STATE;
4454     !!!next-input-character;
4455     redo A;
4456     }
4457     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4458     if ($is_space->{$self->{nc}}) {
4459     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4460     !!!next-input-character;
4461     redo A;
4462     } elsif ($self->{nc} == 0x0023) { # #
4463     !!!parse-error (type => 'no space before default value'); ## TODO: type
4464     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4465     !!!next-input-character;
4466     redo A;
4467     } elsif ($self->{nc} == 0x0022) { # "
4468     !!!parse-error (type => 'no space before default value'); ## TODO: type
4469     $self->{ca}->{value} = '';
4470     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4471     !!!next-input-character;
4472     redo A;
4473     } elsif ($self->{nc} == 0x0027) { # '
4474     !!!parse-error (type => 'no space before default value'); ## TODO: type
4475     $self->{ca}->{value} = '';
4476     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4477     !!!next-input-character;
4478     redo A;
4479     } elsif ($self->{nc} == 0x003E) { # >
4480     !!!parse-error (type => 'no attr default'); ## TODO: type
4481     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4482     !!!next-input-character;
4483     !!!emit ($self->{ct}); # ATTLIST
4484     redo A;
4485     } elsif ($self->{nc} == -1) {
4486     !!!parse-error (type => 'unclosed md'); ## TODO: type
4487     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4488     !!!next-input-character;
4489     !!!emit ($self->{ct});
4490     redo A;
4491     } else {
4492     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4493     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4494     ## Reconsume.
4495     redo A;
4496     }
4497     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4498     if ($is_space->{$self->{nc}}) {
4499     ## Stay in the state.
4500     !!!next-input-character;
4501     redo A;
4502     } elsif ($self->{nc} == 0x0023) { # #
4503     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4504     !!!next-input-character;
4505     redo A;
4506     } elsif ($self->{nc} == 0x0022) { # "
4507     $self->{ca}->{value} = '';
4508     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4509     !!!next-input-character;
4510     redo A;
4511     } elsif ($self->{nc} == 0x0027) { # '
4512     $self->{ca}->{value} = '';
4513     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4514     !!!next-input-character;
4515     redo A;
4516     } elsif ($self->{nc} == 0x003E) { # >
4517     !!!parse-error (type => 'no attr default'); ## TODO: type
4518     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4519     !!!next-input-character;
4520     !!!emit ($self->{ct}); # ATTLIST
4521     redo A;
4522     } elsif ($self->{nc} == -1) {
4523     !!!parse-error (type => 'unclosed md'); ## TODO: type
4524     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4525     !!!next-input-character;
4526     !!!emit ($self->{ct});
4527     redo A;
4528     } else {
4529     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4530     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4531     ## Reconsume.
4532     redo A;
4533     }
4534     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4535     if ($is_space->{$self->{nc}}) {
4536     ## XML5: No parse error.
4537     !!!parse-error (type => 'no default type'); ## TODO: type
4538 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4539 wakaba 1.14 ## Reconsume.
4540     redo A;
4541 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4542     ## XML5: Same as "anything else".
4543     $self->{ca}->{value} = '';
4544     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4545     !!!next-input-character;
4546     redo A;
4547     } elsif ($self->{nc} == 0x0027) { # '
4548     ## XML5: Same as "anything else".
4549     $self->{ca}->{value} = '';
4550     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4551     !!!next-input-character;
4552     redo A;
4553     } elsif ($self->{nc} == 0x003E) { # >
4554     ## XML5: Same as "anything else".
4555     !!!parse-error (type => 'no attr default'); ## TODO: type
4556     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4557     !!!next-input-character;
4558     !!!emit ($self->{ct}); # ATTLIST
4559     redo A;
4560     } elsif ($self->{nc} == -1) {
4561     ## XML5: No parse error.
4562     !!!parse-error (type => 'unclosed md'); ## TODO: type
4563     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4564     !!!next-input-character;
4565     !!!emit ($self->{ct});
4566     redo A;
4567     } else {
4568     $self->{ca}->{default} = chr $self->{nc};
4569     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4570     !!!next-input-character;
4571     redo A;
4572 wakaba 1.14 }
4573 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4574     if ($is_space->{$self->{nc}}) {
4575     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4576     !!!next-input-character;
4577     redo A;
4578     } elsif ($self->{nc} == 0x0022) { # "
4579     ## XML5: Same as "anything else".
4580     !!!parse-error (type => 'no space before default value'); ## TODO: type
4581     $self->{ca}->{value} = '';
4582     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4583     !!!next-input-character;
4584     redo A;
4585     } elsif ($self->{nc} == 0x0027) { # '
4586     ## XML5: Same as "anything else".
4587     !!!parse-error (type => 'no space before default value'); ## TODO: type
4588     $self->{ca}->{value} = '';
4589     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4590     !!!next-input-character;
4591     redo A;
4592     } elsif ($self->{nc} == 0x003E) { # >
4593     ## XML5: Same as "anything else".
4594     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4595     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4596     !!!next-input-character;
4597     !!!emit ($self->{ct}); # ATTLIST
4598     redo A;
4599     } elsif ($self->{nc} == -1) {
4600     ## XML5: No parse error.
4601     !!!parse-error (type => 'unclosed md'); ## TODO: type
4602     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4603     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4604     !!!next-input-character;
4605     !!!emit ($self->{ct});
4606     redo A;
4607     } else {
4608     $self->{ca}->{default} .= chr $self->{nc};
4609     ## Stay in the state.
4610     !!!next-input-character;
4611     redo A;
4612     }
4613     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4614     if ($is_space->{$self->{nc}}) {
4615     ## Stay in the state.
4616     !!!next-input-character;
4617     redo A;
4618     } elsif ($self->{nc} == 0x0022) { # "
4619     $self->{ca}->{value} = '';
4620     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4621     !!!next-input-character;
4622     redo A;
4623     } elsif ($self->{nc} == 0x0027) { # '
4624     $self->{ca}->{value} = '';
4625     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4626     !!!next-input-character;
4627     redo A;
4628     } elsif ($self->{nc} == 0x003E) { # >
4629     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4630     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4631     !!!next-input-character;
4632     !!!emit ($self->{ct}); # ATTLIST
4633     redo A;
4634     } elsif ($self->{nc} == -1) {
4635     ## XML5: No parse error.
4636     !!!parse-error (type => 'unclosed md'); ## TODO: type
4637     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4638     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4639     !!!next-input-character;
4640     !!!emit ($self->{ct});
4641     redo A;
4642     } else {
4643     ## XML5: Not defined yet.
4644     if ($self->{ca}->{default} eq 'FIXED') {
4645     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4646     } else {
4647     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4648     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4649     }
4650     ## Reconsume.
4651     redo A;
4652     }
4653     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4654     if ($is_space->{$self->{nc}} or
4655     $self->{nc} == -1 or
4656     $self->{nc} == 0x003E) { # >
4657     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4658     ## Reconsume.
4659     redo A;
4660     } else {
4661     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4662     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4663     ## Reconsume.
4664     redo A;
4665 wakaba 1.16 }
4666 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4667     ## ASCII case-insensitive
4668     if ($self->{nc} == [
4669     undef,
4670     0x0044, # D
4671     0x0041, # A
4672     0x0054, # T
4673     ]->[length $self->{kwd}] or
4674     $self->{nc} == [
4675     undef,
4676     0x0064, # d
4677     0x0061, # a
4678     0x0074, # t
4679     ]->[length $self->{kwd}]) {
4680     !!!cp (172.2);
4681     ## Stay in the state.
4682     $self->{kwd} .= chr $self->{nc};
4683     !!!next-input-character;
4684     redo A;
4685     } elsif ((length $self->{kwd}) == 4 and
4686     ($self->{nc} == 0x0041 or # A
4687     $self->{nc} == 0x0061)) { # a
4688     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4689     !!!cp (172.3);
4690     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4691     text => 'NDATA',
4692     line => $self->{line_prev},
4693     column => $self->{column_prev} - 4);
4694     } else {
4695     !!!cp (172.4);
4696     }
4697     $self->{state} = AFTER_NDATA_STATE;
4698     !!!next-input-character;
4699     redo A;
4700     } else {
4701     !!!parse-error (type => 'string after literal', ## TODO: type
4702     line => $self->{line_prev},
4703     column => $self->{column_prev} + 1
4704     - length $self->{kwd});
4705     !!!cp (172.5);
4706     $self->{state} = BOGUS_MD_STATE;
4707     ## Reconsume.
4708     redo A;
4709     }
4710     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4711     if ($is_space->{$self->{nc}}) {
4712     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4713     !!!next-input-character;
4714     redo A;
4715     } elsif ($self->{nc} == 0x003E) { # >
4716     !!!parse-error (type => 'no notation name'); ## TODO: type
4717     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4718     !!!next-input-character;
4719     !!!emit ($self->{ct}); # ENTITY
4720     redo A;
4721     } elsif ($self->{nc} == -1) {
4722     !!!parse-error (type => 'unclosed md'); ## TODO: type
4723     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4724     !!!next-input-character;
4725     !!!emit ($self->{ct}); # ENTITY
4726     redo A;
4727     } else {
4728     !!!parse-error (type => 'string after literal', ## TODO: type
4729     line => $self->{line_prev},
4730     column => $self->{column_prev} + 1
4731     - length $self->{kwd});
4732     $self->{state} = BOGUS_MD_STATE;
4733     ## Reconsume.
4734     redo A;
4735     }
4736     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4737     if ($is_space->{$self->{nc}}) {
4738     ## Stay in the state.
4739     !!!next-input-character;
4740     redo A;
4741     } elsif ($self->{nc} == 0x003E) { # >
4742     !!!parse-error (type => 'no notation name'); ## TODO: type
4743     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4744     !!!next-input-character;
4745     !!!emit ($self->{ct}); # ENTITY
4746     redo A;
4747     } elsif ($self->{nc} == -1) {
4748     !!!parse-error (type => 'unclosed md'); ## TODO: type
4749     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4750     !!!next-input-character;
4751     !!!emit ($self->{ct}); # ENTITY
4752     redo A;
4753     } else {
4754     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4755     $self->{state} = NOTATION_NAME_STATE;
4756     !!!next-input-character;
4757     redo A;
4758     }
4759     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4760     if ($is_space->{$self->{nc}}) {
4761 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4762 wakaba 1.18 !!!next-input-character;
4763     redo A;
4764     } elsif ($self->{nc} == 0x003E) { # >
4765     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4766     !!!next-input-character;
4767     !!!emit ($self->{ct}); # ENTITY
4768     redo A;
4769     } elsif ($self->{nc} == -1) {
4770     !!!parse-error (type => 'unclosed md'); ## TODO: type
4771     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4772     !!!next-input-character;
4773     !!!emit ($self->{ct}); # ENTITY
4774     redo A;
4775     } else {
4776     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4777     ## Stay in the state.
4778     !!!next-input-character;
4779     redo A;
4780     }
4781 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4782     if ($self->{nc} == 0x0022) { # "
4783 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4784 wakaba 1.19 !!!next-input-character;
4785     redo A;
4786     } elsif ($self->{nc} == 0x0026) { # &
4787     $self->{prev_state} = $self->{state};
4788     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4789     $self->{entity_add} = 0x0022; # "
4790     !!!next-input-character;
4791     redo A;
4792     ## TODO: %
4793     } elsif ($self->{nc} == -1) {
4794     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4795     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4796     ## Reconsume.
4797     !!!emit ($self->{ct}); # ENTITY
4798     redo A;
4799     } else {
4800     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4801     !!!next-input-character;
4802     redo A;
4803     }
4804     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4805     if ($self->{nc} == 0x0027) { # '
4806 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4807 wakaba 1.19 !!!next-input-character;
4808     redo A;
4809     } elsif ($self->{nc} == 0x0026) { # &
4810     $self->{prev_state} = $self->{state};
4811     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4812     $self->{entity_add} = 0x0027; # '
4813     !!!next-input-character;
4814     redo A;
4815     ## TODO: %
4816     } elsif ($self->{nc} == -1) {
4817     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4818     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4819     ## Reconsume.
4820     !!!emit ($self->{ct}); # ENTITY
4821     redo A;
4822     } else {
4823     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4824     !!!next-input-character;
4825     redo A;
4826     }
4827     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4828     if ($is_space->{$self->{nc}} or
4829     {
4830     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4831     $self->{entity_add} => 1,
4832     }->{$self->{nc}}) {
4833 wakaba 1.22 !!!parse-error (type => 'bare ero',
4834     line => $self->{line_prev},
4835     column => $self->{column_prev}
4836     + ($self->{nc} == -1 ? 1 : 0));
4837 wakaba 1.19 ## Don't consume
4838     ## Return nothing.
4839     #
4840     } elsif ($self->{nc} == 0x0023) { # #
4841     $self->{ca} = $self->{ct};
4842     $self->{state} = ENTITY_HASH_STATE;
4843     $self->{kwd} = '#';
4844     !!!next-input-character;
4845     redo A;
4846     } else {
4847     #
4848     }
4849    
4850     $self->{ct}->{value} .= '&';
4851     $self->{state} = $self->{prev_state};
4852     ## Reconsume.
4853     redo A;
4854 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4855     if ($is_space->{$self->{nc}}) {
4856     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4857     !!!next-input-character;
4858     redo A;
4859     } elsif ($self->{nc} == 0x0028) { # (
4860     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4861     $self->{ct}->{content} = ['('];
4862     $self->{group_depth} = 1;
4863     !!!next-input-character;
4864     redo A;
4865     } elsif ($self->{nc} == 0x003E) { # >
4866     !!!parse-error (type => 'no md def'); ## TODO: type
4867     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4868     !!!next-input-character;
4869     !!!emit ($self->{ct}); # ELEMENT
4870     redo A;
4871     } elsif ($self->{nc} == -1) {
4872     !!!parse-error (type => 'unclosed md'); ## TODO: type
4873     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4874     !!!next-input-character;
4875     !!!emit ($self->{ct}); # ELEMENT
4876     redo A;
4877     } else {
4878     $self->{ct}->{content} = [chr $self->{nc}];
4879     $self->{state} = CONTENT_KEYWORD_STATE;
4880     !!!next-input-character;
4881     redo A;
4882     }
4883     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4884     if ($is_space->{$self->{nc}}) {
4885     $self->{state} = AFTER_MD_DEF_STATE;
4886     !!!next-input-character;
4887     redo A;
4888     } elsif ($self->{nc} == 0x003E) { # >
4889     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4890     !!!next-input-character;
4891     !!!emit ($self->{ct}); # ELEMENT
4892     redo A;
4893     } elsif ($self->{nc} == -1) {
4894     !!!parse-error (type => 'unclosed md'); ## TODO: type
4895     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4896     !!!next-input-character;
4897     !!!emit ($self->{ct}); # ELEMENT
4898     redo A;
4899     } else {
4900     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4901     ## Stay in the state.
4902     !!!next-input-character;
4903     redo A;
4904     }
4905     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4906     if ($is_space->{$self->{nc}}) {
4907     ## Stay in the state.
4908     !!!next-input-character;
4909     redo A;
4910     } elsif ($self->{nc} == 0x0028) { # (
4911     $self->{group_depth}++;
4912     push @{$self->{ct}->{content}}, chr $self->{nc};
4913     ## Stay in the state.
4914     !!!next-input-character;
4915     redo A;
4916     } elsif ($self->{nc} == 0x007C or # |
4917     $self->{nc} == 0x002C) { # ,
4918     !!!parse-error (type => 'empty element name'); ## TODO: type
4919     ## Stay in the state.
4920     !!!next-input-character;
4921     redo A;
4922     } elsif ($self->{nc} == 0x0029) { # )
4923     !!!parse-error (type => 'empty element name'); ## TODO: type
4924     push @{$self->{ct}->{content}}, chr $self->{nc};
4925     $self->{group_depth}--;
4926     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4927     !!!next-input-character;
4928     redo A;
4929     } elsif ($self->{nc} == 0x003E) { # >
4930     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4931     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4932     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4933     !!!next-input-character;
4934     !!!emit ($self->{ct}); # ELEMENT
4935     redo A;
4936     } elsif ($self->{nc} == -1) {
4937     !!!parse-error (type => 'unclosed md'); ## TODO: type
4938     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4939     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4940     !!!next-input-character;
4941     !!!emit ($self->{ct}); # ELEMENT
4942     redo A;
4943     } else {
4944     push @{$self->{ct}->{content}}, chr $self->{nc};
4945     $self->{state} = CM_ELEMENT_NAME_STATE;
4946     !!!next-input-character;
4947     redo A;
4948     }
4949     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4950     if ($is_space->{$self->{nc}}) {
4951     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4952     !!!next-input-character;
4953     redo A;
4954     } elsif ($self->{nc} == 0x002A or # *
4955     $self->{nc} == 0x002B or # +
4956     $self->{nc} == 0x003F) { # ?
4957     push @{$self->{ct}->{content}}, chr $self->{nc};
4958     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4959     !!!next-input-character;
4960     redo A;
4961     } elsif ($self->{nc} == 0x007C or # |
4962     $self->{nc} == 0x002C) { # ,
4963     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4964     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4965     !!!next-input-character;
4966     redo A;
4967     } elsif ($self->{nc} == 0x0029) { # )
4968     $self->{group_depth}--;
4969     push @{$self->{ct}->{content}}, chr $self->{nc};
4970     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4971     !!!next-input-character;
4972     redo A;
4973     } elsif ($self->{nc} == 0x003E) { # >
4974     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4975     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4976     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4977     !!!next-input-character;
4978     !!!emit ($self->{ct}); # ELEMENT
4979     redo A;
4980     } elsif ($self->{nc} == -1) {
4981     !!!parse-error (type => 'unclosed md'); ## TODO: type
4982     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4983     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4984     !!!next-input-character;
4985     !!!emit ($self->{ct}); # ELEMENT
4986     redo A;
4987     } else {
4988     $self->{ct}->{content}->[-1] .= chr $self->{nc};
4989     ## Stay in the state.
4990     !!!next-input-character;
4991     redo A;
4992     }
4993     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4994     if ($is_space->{$self->{nc}}) {
4995     ## Stay in the state.
4996     !!!next-input-character;
4997     redo A;
4998     } elsif ($self->{nc} == 0x007C or # |
4999     $self->{nc} == 0x002C) { # ,
5000     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5001     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5002     !!!next-input-character;
5003     redo A;
5004     } elsif ($self->{nc} == 0x0029) { # )
5005     $self->{group_depth}--;
5006     push @{$self->{ct}->{content}}, chr $self->{nc};
5007     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5008     !!!next-input-character;
5009     redo A;
5010     } elsif ($self->{nc} == 0x003E) { # >
5011     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5012     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5013     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5014     !!!next-input-character;
5015     !!!emit ($self->{ct}); # ELEMENT
5016     redo A;
5017     } elsif ($self->{nc} == -1) {
5018     !!!parse-error (type => 'unclosed md'); ## TODO: type
5019     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5020     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5021     !!!next-input-character;
5022     !!!emit ($self->{ct}); # ELEMENT
5023     redo A;
5024     } else {
5025     !!!parse-error (type => 'after element name'); ## TODO: type
5026     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5027     $self->{state} = BOGUS_MD_STATE;
5028     !!!next-input-character;
5029     redo A;
5030     }
5031     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5032     if ($is_space->{$self->{nc}}) {
5033     if ($self->{group_depth}) {
5034     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5035     } else {
5036     $self->{state} = AFTER_MD_DEF_STATE;
5037     }
5038     !!!next-input-character;
5039     redo A;
5040     } elsif ($self->{nc} == 0x002A or # *
5041     $self->{nc} == 0x002B or # +
5042     $self->{nc} == 0x003F) { # ?
5043     push @{$self->{ct}->{content}}, chr $self->{nc};
5044     if ($self->{group_depth}) {
5045     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5046     } else {
5047     $self->{state} = AFTER_MD_DEF_STATE;
5048     }
5049     !!!next-input-character;
5050     redo A;
5051     } elsif ($self->{nc} == 0x0029) { # )
5052     if ($self->{group_depth}) {
5053     $self->{group_depth}--;
5054     push @{$self->{ct}->{content}}, chr $self->{nc};
5055     ## Stay in the state.
5056     !!!next-input-character;
5057     redo A;
5058     } else {
5059     !!!parse-error (type => 'string after md def'); ## TODO: type
5060     $self->{state} = BOGUS_MD_STATE;
5061     ## Reconsume.
5062     redo A;
5063     }
5064     } elsif ($self->{nc} == 0x003E) { # >
5065     if ($self->{group_depth}) {
5066     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5067     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5068     }
5069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5070     !!!next-input-character;
5071     !!!emit ($self->{ct}); # ELEMENT
5072     redo A;
5073     } elsif ($self->{nc} == -1) {
5074     !!!parse-error (type => 'unclosed md'); ## TODO: type
5075     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5076     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5077     !!!next-input-character;
5078     !!!emit ($self->{ct}); # ELEMENT
5079     redo A;
5080     } else {
5081     if ($self->{group_depth}) {
5082     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5083     } else {
5084     !!!parse-error (type => 'string after md def'); ## TODO: type
5085     $self->{state} = BOGUS_MD_STATE;
5086     }
5087     ## Reconsume.
5088     redo A;
5089     }
5090     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5091 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5092     ## Stay in the state.
5093     !!!next-input-character;
5094     redo A;
5095     } elsif ($self->{nc} == 0x003E) { # >
5096     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5097     !!!next-input-character;
5098 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5099 wakaba 1.18 redo A;
5100     } elsif ($self->{nc} == -1) {
5101     !!!parse-error (type => 'unclosed md'); ## TODO: type
5102     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5103     !!!next-input-character;
5104 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5105 wakaba 1.18 redo A;
5106     } else {
5107 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5108 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5109     ## Reconsume.
5110     redo A;
5111     }
5112 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5113     if ($self->{nc} == 0x003E) { # >
5114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5115     !!!next-input-character;
5116     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5117     redo A;
5118     } elsif ($self->{nc} == -1) {
5119     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5120     ## Reconsume.
5121     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5122     redo A;
5123     } else {
5124     ## Stay in the state.
5125     !!!next-input-character;
5126     redo A;
5127     }
5128 wakaba 1.1 } else {
5129     die "$0: $self->{state}: Unknown state";
5130     }
5131     } # A
5132    
5133     die "$0: _get_next_token: unexpected case";
5134     } # _get_next_token
5135    
5136     1;
5137 wakaba 1.28 ## $Date: 2009/07/02 22:24:28 $
5138 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24