/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.26 - (hide annotations) (download) (as text)
Thu Jul 2 21:42:43 2009 UTC (15 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.25: +4 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	2 Jul 2009 21:41:03 -0000
2009-07-03  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: < in unquoted attribute value is no
	longer allowed (HTML5 revision 3206).

++ whatpm/Whatpm/HTML/ChangeLog	2 Jul 2009 21:42:34 -0000
2009-07-03  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: "<" in unquoted attribute values is now
	treated as parse error (HTML5 revision 3206).

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.26 our $VERSION=do{my @r=(q$Revision: 1.25 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
      ## Exporter is wired up inside BEGIN so that the export lists exist at
      ## compile time; package Whatpm::HTML further down imports the |:token|
      ## tag from its own BEGIN block.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
      ## Token type constants that a caller may import by name.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
      ## The |:token| tag names the same fourteen constants as @EXPORT_OK;
      ## a constant added to one list has to be added to the other as well.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
      ## Each constant is a distinct small integer stored in the |type|
      ## field of a token hash.  The empty prototype |()| lets perl inline
      ## the value wherever the name is used.
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
      ## The three CM_* values are single bits; the tokenizer tests them
      ## with |&| against |$self->{content_model}|.  The four *_CONTENT_MODEL
      ## values are the bit combinations actually assigned to that field.
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # 0b000: no flag set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
87    
88     ## Tokenizer states
      ## Each constant is a distinct integer that |_get_next_token| compares
      ## against |$self->{state}| with |==|.  The values 1 and 12 are unused:
      ## the states carrying them appear only as commented-out lines below.
      ## A trailing |"... state" in the spec| remark marks a constant that
      ## implements part of the named specification state.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
      ## NOTE: Both literals below denote the same integer, 2048 (bit 11);
      ## the underscore in |0b1_00000000000| is only a digit separator.
      ## NOTE(review): presumably the two constants are tested against
      ## different fields (insertion mode vs. element category) -- confirm
      ## against Whatpm::HTML before relying on them being distinguishable.
200    
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
      ## |$charref_map| maps a code point to the code point that replaces
      ## it.  CR (0x0D) becomes LF, and 0x80..0x9F take their Windows-1252
      ## assignments; the five positions Windows-1252 leaves undefined
      ## (0x81, 0x8D, 0x8F, 0x90, 0x9D) become U+FFFD.
      ## NOTE(review): presumably consulted when a numeric character
      ## reference is resolved; the consumer is not visible in this part of
      ## the file -- confirm.
205    
206     my $charref_map = {
207     0x0D => 0x000A,
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
      ## Additionally map to U+FFFD: the C0 controls other than HT, LF, FF
      ## and CR, DEL (0x7F), the surrogate range, and the noncharacters
      ## listed below.
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (the Unicode noncharacter block is U+FDD0..U+FDEF; 0xFDE0..0xFDEF are not mapped here)
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
252     sub _initialize_tokenizer ($) {
      ## Puts the tokenizer fields of $self into their start-of-input
      ## values: DATA_STATE, the PCDATA content model, no current token or
      ## attribute, no remembered start tag name, an empty character buffer
      ## and an empty queue of pending tokens.
      ## Takes the object as its only argument.
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model is PCDATA
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing};
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character
275     #$self->{next_nc}
      ## The |!!!| line below is not Perl syntax; this .src file is
      ## presumably expanded by a preprocessor.  NOTE(review): the macro
      ## presumably loads the first input character into |$self->{nc}| --
      ## its expansion is not visible here.
276     !!!next-input-character;
277     $self->{token} = []; # queue of tokens waiting to be returned
      ## NOTE(review): |$self->{escape}| is only mentioned below, not reset;
      ## confirm that a re-initialized tokenizer cannot keep a stale flag.
278     # $self->{escape}
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
302     ## while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
315     my $is_space = {
      ## Set of code points the tokenizer treats as white space; it is
      ## looked up as |$is_space->{$self->{nc}}|.  VT and CR are absent on
      ## purpose (their entries are kept only as commented-out lines).
      ## NOTE(review): CR presumably never reaches this lookup because
      ## newlines are normalized earlier -- confirm.
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec does not explicitly
736     ## state that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951     0x003D => 1, # =
952     }->{$self->{nc}}) {
953     !!!cp (55);
954 wakaba 1.11 ## XML5: Not a parse error.
955 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
956     } else {
957     !!!cp (56);
958 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
959 wakaba 1.1 }
960     $self->{ca}
961     = {name => chr ($self->{nc}),
962     value => '',
963     line => $self->{line}, column => $self->{column}};
964     $self->{state} = ATTRIBUTE_NAME_STATE;
965     !!!next-input-character;
966     redo A;
967     }
968     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 wakaba 1.11 ## XML5: "Tag attribute name state".
970    
971 wakaba 1.1 my $before_leave = sub {
972     if (exists $self->{ct}->{attributes} # start tag or end tag
973     ->{$self->{ca}->{name}}) { # MUST
974     !!!cp (57);
975     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976     ## Discard $self->{ca} # MUST
977     } else {
978     !!!cp (58);
979     $self->{ct}->{attributes}->{$self->{ca}->{name}}
980     = $self->{ca};
981 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 wakaba 1.1 }
983     }; # $before_leave
984    
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (59);
987     $before_leave->();
988     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003D) { # =
992     !!!cp (60);
993     $before_leave->();
994     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003E) { # >
998 wakaba 1.11 if ($self->{is_xml}) {
999     !!!cp (60.1);
1000     ## XML5: Not a parse error.
1001     !!!parse-error (type => 'no attr value'); ## TODO: type
1002     } else {
1003     !!!cp (60.2);
1004     }
1005    
1006 wakaba 1.1 $before_leave->();
1007     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008     !!!cp (61);
1009     $self->{last_stag_name} = $self->{ct}->{tag_name};
1010     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011     !!!cp (62);
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013     if ($self->{ct}->{attributes}) {
1014     !!!parse-error (type => 'end tag attribute');
1015     }
1016     } else {
1017     die "$0: $self->{ct}->{type}: Unknown token type";
1018     }
1019     $self->{state} = DATA_STATE;
1020 wakaba 1.5 $self->{s_kwd} = '';
1021 wakaba 1.1 !!!next-input-character;
1022    
1023     !!!emit ($self->{ct}); # start tag or end tag
1024    
1025     redo A;
1026     } elsif (0x0041 <= $self->{nc} and
1027     $self->{nc} <= 0x005A) { # A..Z
1028     !!!cp (63);
1029 wakaba 1.4 $self->{ca}->{name}
1030     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 wakaba 1.1 ## Stay in the state
1032     !!!next-input-character;
1033     redo A;
1034     } elsif ($self->{nc} == 0x002F) { # /
1035 wakaba 1.11 if ($self->{is_xml}) {
1036     !!!cp (64);
1037     ## XML5: Not a parse error.
1038     !!!parse-error (type => 'no attr value'); ## TODO: type
1039     } else {
1040     !!!cp (64.1);
1041     }
1042    
1043 wakaba 1.1 $before_leave->();
1044     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045     !!!next-input-character;
1046     redo A;
1047     } elsif ($self->{nc} == -1) {
1048     !!!parse-error (type => 'unclosed tag');
1049     $before_leave->();
1050     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051     !!!cp (66);
1052     $self->{last_stag_name} = $self->{ct}->{tag_name};
1053     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055     if ($self->{ct}->{attributes}) {
1056     !!!cp (67);
1057     !!!parse-error (type => 'end tag attribute');
1058     } else {
1059     ## NOTE: This state should never be reached.
1060     !!!cp (68);
1061     }
1062     } else {
1063     die "$0: $self->{ct}->{type}: Unknown token type";
1064     }
1065     $self->{state} = DATA_STATE;
1066 wakaba 1.5 $self->{s_kwd} = '';
1067 wakaba 1.1 # reconsume
1068    
1069     !!!emit ($self->{ct}); # start tag or end tag
1070    
1071     redo A;
1072     } else {
1073     if ($self->{nc} == 0x0022 or # "
1074     $self->{nc} == 0x0027) { # '
1075     !!!cp (69);
1076 wakaba 1.11 ## XML5: Not a parse error.
1077 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1078     } else {
1079     !!!cp (70);
1080     }
1081     $self->{ca}->{name} .= chr ($self->{nc});
1082     ## Stay in the state
1083     !!!next-input-character;
1084     redo A;
1085     }
1086     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 wakaba 1.11 ## XML5: "Tag attribute name after state".
1088    
1089 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1090     !!!cp (71);
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003D) { # =
1095     !!!cp (72);
1096     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003E) { # >
1100 wakaba 1.11 if ($self->{is_xml}) {
1101     !!!cp (72.1);
1102     ## XML5: Not a parse error.
1103     !!!parse-error (type => 'no attr value'); ## TODO: type
1104     } else {
1105     !!!cp (72.2);
1106     }
1107    
1108 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109     !!!cp (73);
1110     $self->{last_stag_name} = $self->{ct}->{tag_name};
1111     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     !!!cp (74);
1115     !!!parse-error (type => 'end tag attribute');
1116     } else {
1117     ## NOTE: This state should never be reached.
1118     !!!cp (75);
1119     }
1120     } else {
1121     die "$0: $self->{ct}->{type}: Unknown token type";
1122     }
1123     $self->{state} = DATA_STATE;
1124 wakaba 1.5 $self->{s_kwd} = '';
1125 wakaba 1.1 !!!next-input-character;
1126    
1127     !!!emit ($self->{ct}); # start tag or end tag
1128    
1129     redo A;
1130     } elsif (0x0041 <= $self->{nc} and
1131     $self->{nc} <= 0x005A) { # A..Z
1132     !!!cp (76);
1133     $self->{ca}
1134 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 wakaba 1.1 value => '',
1136     line => $self->{line}, column => $self->{column}};
1137     $self->{state} = ATTRIBUTE_NAME_STATE;
1138     !!!next-input-character;
1139     redo A;
1140     } elsif ($self->{nc} == 0x002F) { # /
1141 wakaba 1.11 if ($self->{is_xml}) {
1142     !!!cp (77);
1143     ## XML5: Not a parse error.
1144     !!!parse-error (type => 'no attr value'); ## TODO: type
1145     } else {
1146     !!!cp (77.1);
1147     }
1148    
1149 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150     !!!next-input-character;
1151     redo A;
1152     } elsif ($self->{nc} == -1) {
1153     !!!parse-error (type => 'unclosed tag');
1154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155     !!!cp (79);
1156     $self->{last_stag_name} = $self->{ct}->{tag_name};
1157     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159     if ($self->{ct}->{attributes}) {
1160     !!!cp (80);
1161     !!!parse-error (type => 'end tag attribute');
1162     } else {
1163     ## NOTE: This state should never be reached.
1164     !!!cp (81);
1165     }
1166     } else {
1167     die "$0: $self->{ct}->{type}: Unknown token type";
1168     }
1169 wakaba 1.5 $self->{s_kwd} = '';
1170 wakaba 1.1 $self->{state} = DATA_STATE;
1171     # reconsume
1172    
1173     !!!emit ($self->{ct}); # start tag or end tag
1174    
1175     redo A;
1176     } else {
1177 wakaba 1.11 if ($self->{is_xml}) {
1178     !!!cp (78.1);
1179     ## XML5: Not a parse error.
1180     !!!parse-error (type => 'no attr value'); ## TODO: type
1181     } else {
1182     !!!cp (78.2);
1183     }
1184    
1185 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1186     $self->{nc} == 0x0027) { # '
1187     !!!cp (78);
1188 wakaba 1.11 ## XML5: Not a parse error.
1189 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1190     } else {
1191     !!!cp (82);
1192     }
1193     $self->{ca}
1194     = {name => chr ($self->{nc}),
1195     value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198     !!!next-input-character;
1199     redo A;
1200     }
1201     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 wakaba 1.11 ## XML5: "Tag attribute value before state".
1203    
1204 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1205     !!!cp (83);
1206     ## Stay in the state
1207     !!!next-input-character;
1208     redo A;
1209     } elsif ($self->{nc} == 0x0022) { # "
1210     !!!cp (84);
1211     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x0026) { # &
1215     !!!cp (85);
1216     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217     ## reconsume
1218     redo A;
1219     } elsif ($self->{nc} == 0x0027) { # '
1220     !!!cp (86);
1221     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222     !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{nc} == 0x003E) { # >
1225     !!!parse-error (type => 'empty unquoted attribute value');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227     !!!cp (87);
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232     !!!cp (88);
1233     !!!parse-error (type => 'end tag attribute');
1234     } else {
1235     ## NOTE: This state should never be reached.
1236     !!!cp (89);
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 !!!next-input-character;
1244    
1245     !!!emit ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } elsif ($self->{nc} == -1) {
1249     !!!parse-error (type => 'unclosed tag');
1250     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251     !!!cp (90);
1252     $self->{last_stag_name} = $self->{ct}->{tag_name};
1253     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255     if ($self->{ct}->{attributes}) {
1256     !!!cp (91);
1257     !!!parse-error (type => 'end tag attribute');
1258     } else {
1259     ## NOTE: This state should never be reached.
1260     !!!cp (92);
1261     }
1262     } else {
1263     die "$0: $self->{ct}->{type}: Unknown token type";
1264     }
1265     $self->{state} = DATA_STATE;
1266 wakaba 1.5 $self->{s_kwd} = '';
1267 wakaba 1.1 ## reconsume
1268    
1269     !!!emit ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } else {
1273 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274 wakaba 1.1 !!!cp (93);
1275 wakaba 1.11 ## XML5: Not a parse error.
1276 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1277 wakaba 1.11 } elsif ($self->{is_xml}) {
1278     !!!cp (93.1);
1279     ## XML5: No parse error.
1280     !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 wakaba 1.1 } else {
1282     !!!cp (94);
1283     }
1284     $self->{ca}->{value} .= chr ($self->{nc});
1285     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286     !!!next-input-character;
1287     redo A;
1288     }
1289     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291     ## ATTLIST attribute value double quoted state".
1292 wakaba 1.11
1293 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1294 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295     !!!cp (95.1);
1296     ## XML5: "DOCTYPE ATTLIST name after state".
1297     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299     } else {
1300     !!!cp (95);
1301     ## XML5: "Tag attribute name before state".
1302     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     }
1304 wakaba 1.1 !!!next-input-character;
1305     redo A;
1306     } elsif ($self->{nc} == 0x0026) { # &
1307     !!!cp (96);
1308 wakaba 1.11 ## XML5: Not defined yet.
1309    
1310 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1311     ## "entity in attribute value state". In this implementation, the
1312     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313     ## implementation of the "consume a character reference" algorithm.
1314     $self->{prev_state} = $self->{state};
1315     $self->{entity_add} = 0x0022; # "
1316     $self->{state} = ENTITY_STATE;
1317     !!!next-input-character;
1318     redo A;
1319 wakaba 1.25 } elsif ($self->{is_xml} and
1320     $is_space->{$self->{nc}}) {
1321     !!!cp (97.1);
1322     $self->{ca}->{value} .= ' ';
1323     ## Stay in the state.
1324     !!!next-input-character;
1325     redo A;
1326 wakaba 1.1 } elsif ($self->{nc} == -1) {
1327     !!!parse-error (type => 'unclosed attribute value');
1328     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329     !!!cp (97);
1330     $self->{last_stag_name} = $self->{ct}->{tag_name};
1331 wakaba 1.15
1332     $self->{state} = DATA_STATE;
1333     $self->{s_kwd} = '';
1334     ## reconsume
1335     !!!emit ($self->{ct}); # start tag
1336     redo A;
1337 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339     if ($self->{ct}->{attributes}) {
1340     !!!cp (98);
1341     !!!parse-error (type => 'end tag attribute');
1342     } else {
1343     ## NOTE: This state should never be reached.
1344     !!!cp (99);
1345     }
1346 wakaba 1.15
1347     $self->{state} = DATA_STATE;
1348     $self->{s_kwd} = '';
1349     ## reconsume
1350     !!!emit ($self->{ct}); # end tag
1351     redo A;
1352     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353     ## XML5: No parse error above; not defined yet.
1354     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356     ## Reconsume.
1357     !!!emit ($self->{ct}); # ATTLIST
1358     redo A;
1359 wakaba 1.1 } else {
1360     die "$0: $self->{ct}->{type}: Unknown token type";
1361     }
1362     } else {
1363 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1364 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365     !!!cp (100);
1366     ## XML5: Not a parse error.
1367     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368     } else {
1369     !!!cp (100.1);
1370     }
1371 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1372     $self->{read_until}->($self->{ca}->{value},
1373 wakaba 1.25 qq["&<\x09\x0C\x20],
1374 wakaba 1.1 length $self->{ca}->{value});
1375    
1376     ## Stay in the state
1377     !!!next-input-character;
1378     redo A;
1379     }
1380     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1381 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382     ## ATTLIST attribute value single quoted state".
1383 wakaba 1.11
1384 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1385 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386     !!!cp (101.1);
1387     ## XML5: "DOCTYPE ATTLIST name after state".
1388     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390     } else {
1391     !!!cp (101);
1392     ## XML5: "Before attribute name state" (sic).
1393     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394     }
1395 wakaba 1.1 !!!next-input-character;
1396     redo A;
1397     } elsif ($self->{nc} == 0x0026) { # &
1398     !!!cp (102);
1399 wakaba 1.11 ## XML5: Not defined yet.
1400    
1401 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1402     ## "entity in attribute value state". In this implementation, the
1403     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404     ## implementation of the "consume a character reference" algorithm.
1405     $self->{entity_add} = 0x0027; # '
1406     $self->{prev_state} = $self->{state};
1407     $self->{state} = ENTITY_STATE;
1408     !!!next-input-character;
1409     redo A;
1410 wakaba 1.25 } elsif ($self->{is_xml} and
1411     $is_space->{$self->{nc}}) {
1412     !!!cp (103.1);
1413     $self->{ca}->{value} .= ' ';
1414     ## Stay in the state.
1415     !!!next-input-character;
1416     redo A;
1417 wakaba 1.1 } elsif ($self->{nc} == -1) {
1418     !!!parse-error (type => 'unclosed attribute value');
1419     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420     !!!cp (103);
1421     $self->{last_stag_name} = $self->{ct}->{tag_name};
1422 wakaba 1.15
1423     $self->{state} = DATA_STATE;
1424     $self->{s_kwd} = '';
1425     ## reconsume
1426     !!!emit ($self->{ct}); # start tag
1427     redo A;
1428 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430     if ($self->{ct}->{attributes}) {
1431     !!!cp (104);
1432     !!!parse-error (type => 'end tag attribute');
1433     } else {
1434     ## NOTE: This state should never be reached.
1435     !!!cp (105);
1436     }
1437 wakaba 1.15
1438     $self->{state} = DATA_STATE;
1439     $self->{s_kwd} = '';
1440     ## reconsume
1441     !!!emit ($self->{ct}); # end tag
1442     redo A;
1443     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444     ## XML5: No parse error above; not defined yet.
1445     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447     ## Reconsume.
1448     !!!emit ($self->{ct}); # ATTLIST
1449     redo A;
1450 wakaba 1.1 } else {
1451     die "$0: $self->{ct}->{type}: Unknown token type";
1452     }
1453     } else {
1454 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1455 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456     !!!cp (106);
1457     ## XML5: Not a parse error.
1458     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459     } else {
1460     !!!cp (106.1);
1461     }
1462 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1463     $self->{read_until}->($self->{ca}->{value},
1464 wakaba 1.25 qq['&<\x09\x0C\x20],
1465 wakaba 1.1 length $self->{ca}->{value});
1466    
1467     ## Stay in the state
1468     !!!next-input-character;
1469     redo A;
1470     }
1471     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1472 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1473    
1474 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1475 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476     !!!cp (107.1);
1477     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479     } else {
1480     !!!cp (107);
1481     ## XML5: "Tag attribute name before state".
1482     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483     }
1484 wakaba 1.1 !!!next-input-character;
1485     redo A;
1486     } elsif ($self->{nc} == 0x0026) { # &
1487     !!!cp (108);
1488 wakaba 1.11
1489     ## XML5: Not defined yet.
1490    
1491 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1492     ## "entity in attribute value state". In this implementation, the
1493     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494     ## implementation of the "consume a character reference" algorithm.
1495     $self->{entity_add} = -1;
1496     $self->{prev_state} = $self->{state};
1497     $self->{state} = ENTITY_STATE;
1498     !!!next-input-character;
1499     redo A;
1500     } elsif ($self->{nc} == 0x003E) { # >
1501     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502     !!!cp (109);
1503     $self->{last_stag_name} = $self->{ct}->{tag_name};
1504 wakaba 1.15
1505     $self->{state} = DATA_STATE;
1506     $self->{s_kwd} = '';
1507     !!!next-input-character;
1508     !!!emit ($self->{ct}); # start tag
1509     redo A;
1510 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512     if ($self->{ct}->{attributes}) {
1513     !!!cp (110);
1514     !!!parse-error (type => 'end tag attribute');
1515     } else {
1516     ## NOTE: This state should never be reached.
1517     !!!cp (111);
1518     }
1519 wakaba 1.15
1520     $self->{state} = DATA_STATE;
1521     $self->{s_kwd} = '';
1522     !!!next-input-character;
1523     !!!emit ($self->{ct}); # end tag
1524     redo A;
1525     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528     !!!next-input-character;
1529     !!!emit ($self->{ct}); # ATTLIST
1530     redo A;
1531 wakaba 1.1 } else {
1532     die "$0: $self->{ct}->{type}: Unknown token type";
1533     }
1534     } elsif ($self->{nc} == -1) {
1535     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536     !!!cp (112);
1537 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1538 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539 wakaba 1.15
1540     $self->{state} = DATA_STATE;
1541     $self->{s_kwd} = '';
1542     ## reconsume
1543     !!!emit ($self->{ct}); # start tag
1544     redo A;
1545 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1547 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548     if ($self->{ct}->{attributes}) {
1549     !!!cp (113);
1550     !!!parse-error (type => 'end tag attribute');
1551     } else {
1552     ## NOTE: This state should never be reached.
1553     !!!cp (114);
1554     }
1555 wakaba 1.15
1556     $self->{state} = DATA_STATE;
1557     $self->{s_kwd} = '';
1558     ## reconsume
1559     !!!emit ($self->{ct}); # end tag
1560     redo A;
1561     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562     !!!parse-error (type => 'unclosed md'); ## TODO: type
1563     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565     ## Reconsume.
1566     !!!emit ($self->{ct}); # ATTLIST
1567     redo A;
1568 wakaba 1.1 } else {
1569     die "$0: $self->{ct}->{type}: Unknown token type";
1570     }
1571     } else {
1572     if ({
1573     0x0022 => 1, # "
1574     0x0027 => 1, # '
1575     0x003D => 1, # =
1576 wakaba 1.26 0x003C => 1, # <
1577 wakaba 1.1 }->{$self->{nc}}) {
1578     !!!cp (115);
1579 wakaba 1.11 ## XML5: Not a parse error.
1580 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1581     } else {
1582     !!!cp (116);
1583     }
1584     $self->{ca}->{value} .= chr ($self->{nc});
1585     $self->{read_until}->($self->{ca}->{value},
1586 wakaba 1.25 qq["'=& \x09\x0C>],
1587 wakaba 1.1 length $self->{ca}->{value});
1588    
1589     ## Stay in the state
1590     !!!next-input-character;
1591     redo A;
1592     }
1593     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1594     if ($is_space->{$self->{nc}}) {
1595     !!!cp (118);
1596     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1597     !!!next-input-character;
1598     redo A;
1599     } elsif ($self->{nc} == 0x003E) { # >
1600     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601     !!!cp (119);
1602     $self->{last_stag_name} = $self->{ct}->{tag_name};
1603     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605     if ($self->{ct}->{attributes}) {
1606     !!!cp (120);
1607     !!!parse-error (type => 'end tag attribute');
1608     } else {
1609     ## NOTE: This state should never be reached.
1610     !!!cp (121);
1611     }
1612     } else {
1613     die "$0: $self->{ct}->{type}: Unknown token type";
1614     }
1615     $self->{state} = DATA_STATE;
1616 wakaba 1.5 $self->{s_kwd} = '';
1617 wakaba 1.1 !!!next-input-character;
1618    
1619     !!!emit ($self->{ct}); # start tag or end tag
1620    
1621     redo A;
1622     } elsif ($self->{nc} == 0x002F) { # /
1623     !!!cp (122);
1624     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625     !!!next-input-character;
1626     redo A;
1627     } elsif ($self->{nc} == -1) {
1628     !!!parse-error (type => 'unclosed tag');
1629     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1630     !!!cp (122.3);
1631     $self->{last_stag_name} = $self->{ct}->{tag_name};
1632     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1633     if ($self->{ct}->{attributes}) {
1634     !!!cp (122.1);
1635     !!!parse-error (type => 'end tag attribute');
1636     } else {
1637     ## NOTE: This state should never be reached.
1638     !!!cp (122.2);
1639     }
1640     } else {
1641     die "$0: $self->{ct}->{type}: Unknown token type";
1642     }
1643     $self->{state} = DATA_STATE;
1644 wakaba 1.5 $self->{s_kwd} = '';
1645 wakaba 1.1 ## Reconsume.
1646     !!!emit ($self->{ct}); # start tag or end tag
1647     redo A;
1648     } else {
1649     !!!cp ('124.1');
1650     !!!parse-error (type => 'no space between attributes');
1651     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1652     ## reconsume
1653     redo A;
1654     }
1655     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1656 wakaba 1.11 ## XML5: "Empty tag state".
1657    
1658 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1659     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     !!!cp ('124.2');
1661     !!!parse-error (type => 'nestc', token => $self->{ct});
1662     ## TODO: Different type than slash in start tag
1663     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664     if ($self->{ct}->{attributes}) {
1665     !!!cp ('124.4');
1666     !!!parse-error (type => 'end tag attribute');
1667     } else {
1668     !!!cp ('124.5');
1669     }
1670     ## TODO: Test |<title></title/>|
1671     } else {
1672     !!!cp ('124.3');
1673     $self->{self_closing} = 1;
1674     }
1675    
1676     $self->{state} = DATA_STATE;
1677 wakaba 1.5 $self->{s_kwd} = '';
1678 wakaba 1.1 !!!next-input-character;
1679    
1680     !!!emit ($self->{ct}); # start tag or end tag
1681    
1682     redo A;
1683     } elsif ($self->{nc} == -1) {
1684     !!!parse-error (type => 'unclosed tag');
1685     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1686     !!!cp (124.7);
1687     $self->{last_stag_name} = $self->{ct}->{tag_name};
1688     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1689     if ($self->{ct}->{attributes}) {
1690     !!!cp (124.5);
1691     !!!parse-error (type => 'end tag attribute');
1692     } else {
1693     ## NOTE: This state should never be reached.
1694     !!!cp (124.6);
1695     }
1696     } else {
1697     die "$0: $self->{ct}->{type}: Unknown token type";
1698     }
1699 wakaba 1.11 ## XML5: "Tag attribute name before state".
1700 wakaba 1.1 $self->{state} = DATA_STATE;
1701 wakaba 1.5 $self->{s_kwd} = '';
1702 wakaba 1.1 ## Reconsume.
1703     !!!emit ($self->{ct}); # start tag or end tag
1704     redo A;
1705     } else {
1706     !!!cp ('124.4');
1707     !!!parse-error (type => 'nestc');
1708     ## TODO: This error type is wrong.
1709     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1710     ## Reconsume.
1711     redo A;
1712     }
1713     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1714 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1715    
1716 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1717     ## consumes characters on a one-by-one basis.
1718    
1719     if ($self->{nc} == 0x003E) { # >
1720 wakaba 1.13 if ($self->{in_subset}) {
1721     !!!cp (123);
1722     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1723     } else {
1724     !!!cp (124);
1725     $self->{state} = DATA_STATE;
1726     $self->{s_kwd} = '';
1727     }
1728 wakaba 1.1 !!!next-input-character;
1729    
1730     !!!emit ($self->{ct}); # comment
1731     redo A;
1732     } elsif ($self->{nc} == -1) {
1733 wakaba 1.13 if ($self->{in_subset}) {
1734     !!!cp (125.1);
1735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1736     } else {
1737     !!!cp (125);
1738     $self->{state} = DATA_STATE;
1739     $self->{s_kwd} = '';
1740     }
1741 wakaba 1.1 ## reconsume
1742    
1743     !!!emit ($self->{ct}); # comment
1744     redo A;
1745     } else {
1746     !!!cp (126);
1747     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748     $self->{read_until}->($self->{ct}->{data},
1749     q[>],
1750     length $self->{ct}->{data});
1751    
1752     ## Stay in the state.
1753     !!!next-input-character;
1754     redo A;
1755     }
1756     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1757 wakaba 1.14 ## XML5: "Markup declaration state".
1758 wakaba 1.1
1759     if ($self->{nc} == 0x002D) { # -
1760     !!!cp (133);
1761     $self->{state} = MD_HYPHEN_STATE;
1762     !!!next-input-character;
1763     redo A;
1764     } elsif ($self->{nc} == 0x0044 or # D
1765     $self->{nc} == 0x0064) { # d
1766     ## ASCII case-insensitive.
1767     !!!cp (130);
1768     $self->{state} = MD_DOCTYPE_STATE;
1769 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1770 wakaba 1.1 !!!next-input-character;
1771     redo A;
1772 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1773     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1774     $self->{is_xml}) and
1775 wakaba 1.1 $self->{nc} == 0x005B) { # [
1776     !!!cp (135.4);
1777     $self->{state} = MD_CDATA_STATE;
1778 wakaba 1.12 $self->{kwd} = '[';
1779 wakaba 1.1 !!!next-input-character;
1780     redo A;
1781     } else {
1782     !!!cp (136);
1783     }
1784    
1785     !!!parse-error (type => 'bogus comment',
1786     line => $self->{line_prev},
1787     column => $self->{column_prev} - 1);
1788     ## Reconsume.
1789     $self->{state} = BOGUS_COMMENT_STATE;
1790     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1791     line => $self->{line_prev},
1792     column => $self->{column_prev} - 1,
1793     };
1794     redo A;
1795     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1796     if ($self->{nc} == 0x002D) { # -
1797     !!!cp (127);
1798     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799     line => $self->{line_prev},
1800     column => $self->{column_prev} - 2,
1801     };
1802 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1803 wakaba 1.1 !!!next-input-character;
1804     redo A;
1805     } else {
1806     !!!cp (128);
1807     !!!parse-error (type => 'bogus comment',
1808     line => $self->{line_prev},
1809     column => $self->{column_prev} - 2);
1810     $self->{state} = BOGUS_COMMENT_STATE;
1811     ## Reconsume.
1812     $self->{ct} = {type => COMMENT_TOKEN,
1813     data => '-',
1814     line => $self->{line_prev},
1815     column => $self->{column_prev} - 2,
1816     };
1817     redo A;
1818     }
1819     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1820     ## ASCII case-insensitive.
1821     if ($self->{nc} == [
1822     undef,
1823     0x004F, # O
1824     0x0043, # C
1825     0x0054, # T
1826     0x0059, # Y
1827     0x0050, # P
1828 wakaba 1.12 ]->[length $self->{kwd}] or
1829 wakaba 1.1 $self->{nc} == [
1830     undef,
1831     0x006F, # o
1832     0x0063, # c
1833     0x0074, # t
1834     0x0079, # y
1835     0x0070, # p
1836 wakaba 1.12 ]->[length $self->{kwd}]) {
1837 wakaba 1.1 !!!cp (131);
1838     ## Stay in the state.
1839 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1840 wakaba 1.1 !!!next-input-character;
1841     redo A;
1842 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1843 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1844     $self->{nc} == 0x0065)) { # e
1845 wakaba 1.12 if ($self->{is_xml} and
1846     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1847 wakaba 1.10 !!!cp (129);
1848     ## XML5: case-sensitive.
1849     !!!parse-error (type => 'lowercase keyword', ## TODO
1850     text => 'DOCTYPE',
1851     line => $self->{line_prev},
1852     column => $self->{column_prev} - 5);
1853     } else {
1854     !!!cp (129.1);
1855     }
1856 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1857     $self->{ct} = {type => DOCTYPE_TOKEN,
1858     quirks => 1,
1859     line => $self->{line_prev},
1860     column => $self->{column_prev} - 7,
1861     };
1862     !!!next-input-character;
1863     redo A;
1864     } else {
1865     !!!cp (132);
1866     !!!parse-error (type => 'bogus comment',
1867     line => $self->{line_prev},
1868 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1869 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1870     ## Reconsume.
1871     $self->{ct} = {type => COMMENT_TOKEN,
1872 wakaba 1.12 data => $self->{kwd},
1873 wakaba 1.1 line => $self->{line_prev},
1874 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1875 wakaba 1.1 };
1876     redo A;
1877     }
1878     } elsif ($self->{state} == MD_CDATA_STATE) {
1879     if ($self->{nc} == {
1880     '[' => 0x0043, # C
1881     '[C' => 0x0044, # D
1882     '[CD' => 0x0041, # A
1883     '[CDA' => 0x0054, # T
1884     '[CDAT' => 0x0041, # A
1885 wakaba 1.12 }->{$self->{kwd}}) {
1886 wakaba 1.1 !!!cp (135.1);
1887     ## Stay in the state.
1888 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1889 wakaba 1.1 !!!next-input-character;
1890     redo A;
1891 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1892 wakaba 1.1 $self->{nc} == 0x005B) { # [
1893 wakaba 1.6 if ($self->{is_xml} and
1894     not $self->{tainted} and
1895     @{$self->{open_elements} or []} == 0) {
1896 wakaba 1.8 !!!cp (135.2);
1897 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1898     line => $self->{line_prev},
1899     column => $self->{column_prev} - 7);
1900     $self->{tainted} = 1;
1901 wakaba 1.8 } else {
1902     !!!cp (135.21);
1903 wakaba 1.6 }
1904    
1905 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1906     data => '',
1907     line => $self->{line_prev},
1908     column => $self->{column_prev} - 7};
1909     $self->{state} = CDATA_SECTION_STATE;
1910     !!!next-input-character;
1911     redo A;
1912     } else {
1913     !!!cp (135.3);
1914     !!!parse-error (type => 'bogus comment',
1915     line => $self->{line_prev},
1916 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1917 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1918     ## Reconsume.
1919     $self->{ct} = {type => COMMENT_TOKEN,
1920 wakaba 1.12 data => $self->{kwd},
1921 wakaba 1.1 line => $self->{line_prev},
1922 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1923 wakaba 1.1 };
1924     redo A;
1925     }
1926     } elsif ($self->{state} == COMMENT_START_STATE) {
1927     if ($self->{nc} == 0x002D) { # -
1928     !!!cp (137);
1929     $self->{state} = COMMENT_START_DASH_STATE;
1930     !!!next-input-character;
1931     redo A;
1932     } elsif ($self->{nc} == 0x003E) { # >
1933     !!!parse-error (type => 'bogus comment');
1934 wakaba 1.13 if ($self->{in_subset}) {
1935     !!!cp (138.1);
1936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937     } else {
1938     !!!cp (138);
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     }
1942 wakaba 1.1 !!!next-input-character;
1943    
1944     !!!emit ($self->{ct}); # comment
1945    
1946     redo A;
1947     } elsif ($self->{nc} == -1) {
1948     !!!parse-error (type => 'unclosed comment');
1949 wakaba 1.13 if ($self->{in_subset}) {
1950     !!!cp (139.1);
1951     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1952     } else {
1953     !!!cp (139);
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     }
1957 wakaba 1.1 ## reconsume
1958    
1959     !!!emit ($self->{ct}); # comment
1960    
1961     redo A;
1962     } else {
1963     !!!cp (140);
1964     $self->{ct}->{data} # comment
1965     .= chr ($self->{nc});
1966     $self->{state} = COMMENT_STATE;
1967     !!!next-input-character;
1968     redo A;
1969     }
1970     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1971     if ($self->{nc} == 0x002D) { # -
1972     !!!cp (141);
1973     $self->{state} = COMMENT_END_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     } elsif ($self->{nc} == 0x003E) { # >
1977     !!!parse-error (type => 'bogus comment');
1978 wakaba 1.13 if ($self->{in_subset}) {
1979     !!!cp (142.1);
1980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981     } else {
1982     !!!cp (142);
1983     $self->{state} = DATA_STATE;
1984     $self->{s_kwd} = '';
1985     }
1986 wakaba 1.1 !!!next-input-character;
1987    
1988     !!!emit ($self->{ct}); # comment
1989    
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (143.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (143);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (144);
2008     $self->{ct}->{data} # comment
2009     .= '-' . chr ($self->{nc});
2010     $self->{state} = COMMENT_STATE;
2011     !!!next-input-character;
2012     redo A;
2013     }
2014     } elsif ($self->{state} == COMMENT_STATE) {
2015 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2016    
2017 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2018     !!!cp (145);
2019     $self->{state} = COMMENT_END_DASH_STATE;
2020     !!!next-input-character;
2021     redo A;
2022     } elsif ($self->{nc} == -1) {
2023     !!!parse-error (type => 'unclosed comment');
2024 wakaba 1.13 if ($self->{in_subset}) {
2025     !!!cp (146.1);
2026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2027     } else {
2028     !!!cp (146);
2029     $self->{state} = DATA_STATE;
2030     $self->{s_kwd} = '';
2031     }
2032 wakaba 1.1 ## reconsume
2033    
2034     !!!emit ($self->{ct}); # comment
2035    
2036     redo A;
2037     } else {
2038     !!!cp (147);
2039     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2040     $self->{read_until}->($self->{ct}->{data},
2041     q[-],
2042     length $self->{ct}->{data});
2043    
2044     ## Stay in the state
2045     !!!next-input-character;
2046     redo A;
2047     }
2048     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2049 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2050 wakaba 1.10
2051 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2052     !!!cp (148);
2053     $self->{state} = COMMENT_END_STATE;
2054     !!!next-input-character;
2055     redo A;
2056     } elsif ($self->{nc} == -1) {
2057     !!!parse-error (type => 'unclosed comment');
2058 wakaba 1.13 if ($self->{in_subset}) {
2059     !!!cp (149.1);
2060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061     } else {
2062     !!!cp (149);
2063     $self->{state} = DATA_STATE;
2064     $self->{s_kwd} = '';
2065     }
2066 wakaba 1.1 ## reconsume
2067    
2068     !!!emit ($self->{ct}); # comment
2069    
2070     redo A;
2071     } else {
2072     !!!cp (150);
2073     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2074     $self->{state} = COMMENT_STATE;
2075     !!!next-input-character;
2076     redo A;
2077     }
2078     } elsif ($self->{state} == COMMENT_END_STATE) {
2079 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2080    
2081 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2082 wakaba 1.13 if ($self->{in_subset}) {
2083     !!!cp (151.1);
2084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085     } else {
2086     !!!cp (151);
2087     $self->{state} = DATA_STATE;
2088     $self->{s_kwd} = '';
2089     }
2090 wakaba 1.1 !!!next-input-character;
2091    
2092     !!!emit ($self->{ct}); # comment
2093    
2094     redo A;
2095     } elsif ($self->{nc} == 0x002D) { # -
2096     !!!cp (152);
2097 wakaba 1.10 ## XML5: Not a parse error.
2098 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2099     line => $self->{line_prev},
2100     column => $self->{column_prev});
2101     $self->{ct}->{data} .= '-'; # comment
2102     ## Stay in the state
2103     !!!next-input-character;
2104     redo A;
2105     } elsif ($self->{nc} == -1) {
2106     !!!parse-error (type => 'unclosed comment');
2107 wakaba 1.13 if ($self->{in_subset}) {
2108     !!!cp (153.1);
2109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2110     } else {
2111     !!!cp (153);
2112     $self->{state} = DATA_STATE;
2113     $self->{s_kwd} = '';
2114     }
2115 wakaba 1.1 ## reconsume
2116    
2117     !!!emit ($self->{ct}); # comment
2118    
2119     redo A;
2120     } else {
2121     !!!cp (154);
2122 wakaba 1.10 ## XML5: Not a parse error.
2123 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2124     line => $self->{line_prev},
2125     column => $self->{column_prev});
2126     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2127     $self->{state} = COMMENT_STATE;
2128     !!!next-input-character;
2129     redo A;
2130     }
2131     } elsif ($self->{state} == DOCTYPE_STATE) {
2132     if ($is_space->{$self->{nc}}) {
2133     !!!cp (155);
2134     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2135     !!!next-input-character;
2136     redo A;
2137     } else {
2138     !!!cp (156);
2139 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2140 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2141     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2142     ## reconsume
2143     redo A;
2144     }
2145     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2146 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2147    
2148 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2149     !!!cp (157);
2150     ## Stay in the state
2151     !!!next-input-character;
2152     redo A;
2153     } elsif ($self->{nc} == 0x003E) { # >
2154     !!!cp (158);
2155 wakaba 1.12 ## XML5: No parse error.
2156 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2157     $self->{state} = DATA_STATE;
2158 wakaba 1.5 $self->{s_kwd} = '';
2159 wakaba 1.1 !!!next-input-character;
2160    
2161     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2162    
2163     redo A;
2164     } elsif ($self->{nc} == -1) {
2165     !!!cp (159);
2166     !!!parse-error (type => 'no DOCTYPE name');
2167     $self->{state} = DATA_STATE;
2168 wakaba 1.5 $self->{s_kwd} = '';
2169 wakaba 1.1 ## reconsume
2170    
2171     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2172    
2173     redo A;
2174 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2175     !!!cp (159.1);
2176     !!!parse-error (type => 'no DOCTYPE name');
2177     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2178 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2179     $self->{in_subset} = 1;
2180 wakaba 1.12 !!!next-input-character;
2181 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2182 wakaba 1.12 redo A;
2183 wakaba 1.1 } else {
2184     !!!cp (160);
2185     $self->{ct}->{name} = chr $self->{nc};
2186     delete $self->{ct}->{quirks};
2187     $self->{state} = DOCTYPE_NAME_STATE;
2188     !!!next-input-character;
2189     redo A;
2190     }
2191     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2192 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2193    
2194     ## ISSUE: Redundant "First," in the spec.
2195    
2196 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2197     !!!cp (161);
2198     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2199     !!!next-input-character;
2200     redo A;
2201     } elsif ($self->{nc} == 0x003E) { # >
2202     !!!cp (162);
2203     $self->{state} = DATA_STATE;
2204 wakaba 1.5 $self->{s_kwd} = '';
2205 wakaba 1.1 !!!next-input-character;
2206    
2207     !!!emit ($self->{ct}); # DOCTYPE
2208    
2209     redo A;
2210     } elsif ($self->{nc} == -1) {
2211     !!!cp (163);
2212     !!!parse-error (type => 'unclosed DOCTYPE');
2213     $self->{state} = DATA_STATE;
2214 wakaba 1.5 $self->{s_kwd} = '';
2215 wakaba 1.1 ## reconsume
2216    
2217     $self->{ct}->{quirks} = 1;
2218     !!!emit ($self->{ct}); # DOCTYPE
2219    
2220     redo A;
2221 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2222     !!!cp (163.1);
2223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2224 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2225     $self->{in_subset} = 1;
2226 wakaba 1.12 !!!next-input-character;
2227 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2228 wakaba 1.12 redo A;
2229 wakaba 1.1 } else {
2230     !!!cp (164);
2231     $self->{ct}->{name}
2232     .= chr ($self->{nc}); # DOCTYPE
2233     ## Stay in the state
2234     !!!next-input-character;
2235     redo A;
2236     }
2237     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2238 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2239     ## state", but implemented differently.
2240    
2241 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2242     !!!cp (165);
2243     ## Stay in the state
2244     !!!next-input-character;
2245     redo A;
2246     } elsif ($self->{nc} == 0x003E) { # >
2247 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2248     !!!cp (166);
2249     $self->{state} = DATA_STATE;
2250     $self->{s_kwd} = '';
2251     } else {
2252     !!!cp (166.1);
2253     !!!parse-error (type => 'no md def'); ## TODO: type
2254     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2255     }
2256    
2257 wakaba 1.1 !!!next-input-character;
2258 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2259 wakaba 1.1 redo A;
2260     } elsif ($self->{nc} == -1) {
2261 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2262     !!!cp (167);
2263     !!!parse-error (type => 'unclosed DOCTYPE');
2264     $self->{state} = DATA_STATE;
2265     $self->{s_kwd} = '';
2266     $self->{ct}->{quirks} = 1;
2267     } else {
2268     !!!cp (167.12);
2269     !!!parse-error (type => 'unclosed md'); ## TODO: type
2270     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2271     }
2272    
2273     ## Reconsume.
2274     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2275 wakaba 1.1 redo A;
2276     } elsif ($self->{nc} == 0x0050 or # P
2277     $self->{nc} == 0x0070) { # p
2278 wakaba 1.12 !!!cp (167.1);
2279 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2280 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2281 wakaba 1.1 !!!next-input-character;
2282     redo A;
2283     } elsif ($self->{nc} == 0x0053 or # S
2284     $self->{nc} == 0x0073) { # s
2285 wakaba 1.12 !!!cp (167.2);
2286 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2287 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2288     !!!next-input-character;
2289     redo A;
2290 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2291     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2292     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2293     !!!cp (167.21);
2294     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2295     $self->{ct}->{value} = ''; # ENTITY
2296     !!!next-input-character;
2297     redo A;
2298     } elsif ($self->{nc} == 0x0027 and # '
2299     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2300     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2301     !!!cp (167.22);
2302     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2303     $self->{ct}->{value} = ''; # ENTITY
2304     !!!next-input-character;
2305     redo A;
2306 wakaba 1.16 } elsif ($self->{is_xml} and
2307     $self->{ct}->{type} == DOCTYPE_TOKEN and
2308     $self->{nc} == 0x005B) { # [
2309 wakaba 1.12 !!!cp (167.3);
2310     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2311     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2312 wakaba 1.13 $self->{in_subset} = 1;
2313 wakaba 1.1 !!!next-input-character;
2314 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2315 wakaba 1.1 redo A;
2316     } else {
2317 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2318    
2319     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2320     !!!cp (180);
2321     $self->{ct}->{quirks} = 1;
2322     $self->{state} = BOGUS_DOCTYPE_STATE;
2323     } else {
2324     !!!cp (180.1);
2325     $self->{state} = BOGUS_MD_STATE;
2326     }
2327 wakaba 1.1
2328     !!!next-input-character;
2329     redo A;
2330     }
2331     } elsif ($self->{state} == PUBLIC_STATE) {
2332     ## ASCII case-insensitive
2333     if ($self->{nc} == [
2334     undef,
2335     0x0055, # U
2336     0x0042, # B
2337     0x004C, # L
2338     0x0049, # I
2339 wakaba 1.12 ]->[length $self->{kwd}] or
2340 wakaba 1.1 $self->{nc} == [
2341     undef,
2342     0x0075, # u
2343     0x0062, # b
2344     0x006C, # l
2345     0x0069, # i
2346 wakaba 1.12 ]->[length $self->{kwd}]) {
2347 wakaba 1.1 !!!cp (175);
2348     ## Stay in the state.
2349 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2350 wakaba 1.1 !!!next-input-character;
2351     redo A;
2352 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2353 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2354     $self->{nc} == 0x0063)) { # c
2355 wakaba 1.12 if ($self->{is_xml} and
2356     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2357     !!!cp (168.1);
2358     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2359     text => 'PUBLIC',
2360     line => $self->{line_prev},
2361     column => $self->{column_prev} - 4);
2362     } else {
2363     !!!cp (168);
2364     }
2365 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2366     !!!next-input-character;
2367     redo A;
2368     } else {
2369 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2370 wakaba 1.1 line => $self->{line_prev},
2371 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2372 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2373     !!!cp (169);
2374     $self->{ct}->{quirks} = 1;
2375     $self->{state} = BOGUS_DOCTYPE_STATE;
2376     } else {
2377     !!!cp (169.1);
2378     $self->{state} = BOGUS_MD_STATE;
2379     }
2380 wakaba 1.1 ## Reconsume.
2381     redo A;
2382     }
2383     } elsif ($self->{state} == SYSTEM_STATE) {
2384     ## ASCII case-insensitive
2385     if ($self->{nc} == [
2386     undef,
2387     0x0059, # Y
2388     0x0053, # S
2389     0x0054, # T
2390     0x0045, # E
2391 wakaba 1.12 ]->[length $self->{kwd}] or
2392 wakaba 1.1 $self->{nc} == [
2393     undef,
2394     0x0079, # y
2395     0x0073, # s
2396     0x0074, # t
2397     0x0065, # e
2398 wakaba 1.12 ]->[length $self->{kwd}]) {
2399 wakaba 1.1 !!!cp (170);
2400     ## Stay in the state.
2401 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2402 wakaba 1.1 !!!next-input-character;
2403     redo A;
2404 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2405 wakaba 1.1 ($self->{nc} == 0x004D or # M
2406     $self->{nc} == 0x006D)) { # m
2407 wakaba 1.12 if ($self->{is_xml} and
2408     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2409     !!!cp (171.1);
2410     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2411     text => 'SYSTEM',
2412     line => $self->{line_prev},
2413     column => $self->{column_prev} - 4);
2414     } else {
2415     !!!cp (171);
2416     }
2417 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2418     !!!next-input-character;
2419     redo A;
2420     } else {
2421 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2422 wakaba 1.1 line => $self->{line_prev},
2423 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2424 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2425     !!!cp (172);
2426     $self->{ct}->{quirks} = 1;
2427     $self->{state} = BOGUS_DOCTYPE_STATE;
2428     } else {
2429     !!!cp (172.1);
2430     $self->{state} = BOGUS_MD_STATE;
2431     }
2432 wakaba 1.1 ## Reconsume.
2433     redo A;
2434     }
2435     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2436     if ($is_space->{$self->{nc}}) {
2437     !!!cp (181);
2438     ## Stay in the state
2439     !!!next-input-character;
2440     redo A;
2441     } elsif ($self->{nc} eq 0x0022) { # "
2442     !!!cp (182);
2443     $self->{ct}->{pubid} = ''; # DOCTYPE
2444     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2445     !!!next-input-character;
2446     redo A;
2447     } elsif ($self->{nc} eq 0x0027) { # '
2448     !!!cp (183);
2449     $self->{ct}->{pubid} = ''; # DOCTYPE
2450     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2451     !!!next-input-character;
2452     redo A;
2453     } elsif ($self->{nc} eq 0x003E) { # >
2454     !!!parse-error (type => 'no PUBLIC literal');
2455 wakaba 1.16
2456     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2457     !!!cp (184);
2458     $self->{state} = DATA_STATE;
2459     $self->{s_kwd} = '';
2460     $self->{ct}->{quirks} = 1;
2461     } else {
2462     !!!cp (184.1);
2463     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2464     }
2465    
2466 wakaba 1.1 !!!next-input-character;
2467 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2468 wakaba 1.1 redo A;
2469     } elsif ($self->{nc} == -1) {
2470 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2471     !!!cp (185);
2472     !!!parse-error (type => 'unclosed DOCTYPE');
2473     $self->{state} = DATA_STATE;
2474     $self->{s_kwd} = '';
2475     $self->{ct}->{quirks} = 1;
2476     } else {
2477     !!!cp (185.1);
2478     !!!parse-error (type => 'unclosed md'); ## TODO: type
2479     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2480     }
2481    
2482 wakaba 1.1 ## reconsume
2483     !!!emit ($self->{ct}); # DOCTYPE
2484     redo A;
2485 wakaba 1.16 } elsif ($self->{is_xml} and
2486     $self->{ct}->{type} == DOCTYPE_TOKEN and
2487     $self->{nc} == 0x005B) { # [
2488 wakaba 1.12 !!!cp (186.1);
2489     !!!parse-error (type => 'no PUBLIC literal');
2490     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2491     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2492 wakaba 1.13 $self->{in_subset} = 1;
2493 wakaba 1.12 !!!next-input-character;
2494 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2495 wakaba 1.12 redo A;
2496 wakaba 1.1 } else {
2497     !!!parse-error (type => 'string after PUBLIC');
2498    
2499 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2500     !!!cp (186);
2501     $self->{ct}->{quirks} = 1;
2502     $self->{state} = BOGUS_DOCTYPE_STATE;
2503     } else {
2504     !!!cp (186.2);
2505     $self->{state} = BOGUS_MD_STATE;
2506     }
2507    
2508 wakaba 1.1 !!!next-input-character;
2509     redo A;
2510     }
2511     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2512     if ($self->{nc} == 0x0022) { # "
2513     !!!cp (187);
2514     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2515     !!!next-input-character;
2516     redo A;
2517     } elsif ($self->{nc} == 0x003E) { # >
2518     !!!parse-error (type => 'unclosed PUBLIC literal');
2519    
2520 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2521     !!!cp (188);
2522     $self->{state} = DATA_STATE;
2523     $self->{s_kwd} = '';
2524     $self->{ct}->{quirks} = 1;
2525     } else {
2526     !!!cp (188.1);
2527     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2528     }
2529    
2530 wakaba 1.1 !!!next-input-character;
2531 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2532 wakaba 1.1 redo A;
2533     } elsif ($self->{nc} == -1) {
2534     !!!parse-error (type => 'unclosed PUBLIC literal');
2535    
2536 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2537     !!!cp (189);
2538     $self->{state} = DATA_STATE;
2539     $self->{s_kwd} = '';
2540     $self->{ct}->{quirks} = 1;
2541     } else {
2542     !!!cp (189.1);
2543     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2544     }
2545    
2546     ## Reconsume.
2547 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2548     redo A;
2549     } else {
2550     !!!cp (190);
2551 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2552 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2553     length $self->{ct}->{pubid});
2554    
2555     ## Stay in the state
2556     !!!next-input-character;
2557     redo A;
2558     }
2559     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2560     if ($self->{nc} == 0x0027) { # '
2561     !!!cp (191);
2562     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2563     !!!next-input-character;
2564     redo A;
2565     } elsif ($self->{nc} == 0x003E) { # >
2566     !!!parse-error (type => 'unclosed PUBLIC literal');
2567    
2568 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2569     !!!cp (192);
2570     $self->{state} = DATA_STATE;
2571     $self->{s_kwd} = '';
2572     $self->{ct}->{quirks} = 1;
2573     } else {
2574     !!!cp (192.1);
2575     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2576     }
2577    
2578 wakaba 1.1 !!!next-input-character;
2579 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2580 wakaba 1.1 redo A;
2581     } elsif ($self->{nc} == -1) {
2582     !!!parse-error (type => 'unclosed PUBLIC literal');
2583    
2584 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2585     !!!cp (193);
2586     $self->{state} = DATA_STATE;
2587     $self->{s_kwd} = '';
2588     $self->{ct}->{quirks} = 1;
2589     } else {
2590     !!!cp (193.1);
2591     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2592     }
2593    
2594 wakaba 1.1 ## reconsume
2595 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2596 wakaba 1.1 redo A;
2597     } else {
2598     !!!cp (194);
2599 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2600 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2601     length $self->{ct}->{pubid});
2602    
2603     ## Stay in the state
2604     !!!next-input-character;
2605     redo A;
2606     }
2607     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2608     if ($is_space->{$self->{nc}}) {
2609     !!!cp (195);
2610     ## Stay in the state
2611     !!!next-input-character;
2612     redo A;
2613     } elsif ($self->{nc} == 0x0022) { # "
2614     !!!cp (196);
2615 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2616 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2617     !!!next-input-character;
2618     redo A;
2619     } elsif ($self->{nc} == 0x0027) { # '
2620     !!!cp (197);
2621 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2622 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2623     !!!next-input-character;
2624     redo A;
2625     } elsif ($self->{nc} == 0x003E) { # >
2626 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2627     if ($self->{is_xml}) {
2628     !!!cp (198.1);
2629     !!!parse-error (type => 'no SYSTEM literal');
2630     } else {
2631     !!!cp (198);
2632     }
2633     $self->{state} = DATA_STATE;
2634     $self->{s_kwd} = '';
2635 wakaba 1.12 } else {
2636 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2637     !!!cp (198.2);
2638     } else {
2639     !!!cp (198.3);
2640     !!!parse-error (type => 'no SYSTEM literal');
2641     }
2642     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2643 wakaba 1.12 }
2644 wakaba 1.16
2645 wakaba 1.1 !!!next-input-character;
2646 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2647 wakaba 1.1 redo A;
2648     } elsif ($self->{nc} == -1) {
2649 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2650     !!!cp (199);
2651     !!!parse-error (type => 'unclosed DOCTYPE');
2652    
2653     $self->{state} = DATA_STATE;
2654     $self->{s_kwd} = '';
2655     $self->{ct}->{quirks} = 1;
2656     } else {
2657     !!!parse-error (type => 'unclosed md'); ## TODO: type
2658     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2659     }
2660    
2661 wakaba 1.1 ## reconsume
2662 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2663 wakaba 1.1 redo A;
2664 wakaba 1.16 } elsif ($self->{is_xml} and
2665     $self->{ct}->{type} == DOCTYPE_TOKEN and
2666     $self->{nc} == 0x005B) { # [
2667 wakaba 1.12 !!!cp (200.1);
2668     !!!parse-error (type => 'no SYSTEM literal');
2669     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2670     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2671 wakaba 1.13 $self->{in_subset} = 1;
2672 wakaba 1.12 !!!next-input-character;
2673 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2674 wakaba 1.12 redo A;
2675 wakaba 1.1 } else {
2676     !!!parse-error (type => 'string after PUBLIC literal');
2677    
2678 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2679     !!!cp (200);
2680     $self->{ct}->{quirks} = 1;
2681     $self->{state} = BOGUS_DOCTYPE_STATE;
2682     } else {
2683     !!!cp (200.2);
2684     $self->{state} = BOGUS_MD_STATE;
2685     }
2686    
2687 wakaba 1.1 !!!next-input-character;
2688     redo A;
2689     }
2690     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2691     if ($is_space->{$self->{nc}}) {
2692     !!!cp (201);
2693     ## Stay in the state
2694     !!!next-input-character;
2695     redo A;
2696     } elsif ($self->{nc} == 0x0022) { # "
2697     !!!cp (202);
2698     $self->{ct}->{sysid} = ''; # DOCTYPE
2699     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2700     !!!next-input-character;
2701     redo A;
2702     } elsif ($self->{nc} == 0x0027) { # '
2703     !!!cp (203);
2704     $self->{ct}->{sysid} = ''; # DOCTYPE
2705     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2706     !!!next-input-character;
2707     redo A;
2708     } elsif ($self->{nc} == 0x003E) { # >
2709     !!!parse-error (type => 'no SYSTEM literal');
2710     !!!next-input-character;
2711    
2712 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2713     !!!cp (204);
2714     $self->{state} = DATA_STATE;
2715     $self->{s_kwd} = '';
2716     $self->{ct}->{quirks} = 1;
2717     } else {
2718     !!!cp (204.1);
2719     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2720     }
2721 wakaba 1.1
2722 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2723 wakaba 1.1 redo A;
2724     } elsif ($self->{nc} == -1) {
2725 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2726     !!!cp (205);
2727     !!!parse-error (type => 'unclosed DOCTYPE');
2728     $self->{state} = DATA_STATE;
2729     $self->{s_kwd} = '';
2730     $self->{ct}->{quirks} = 1;
2731     } else {
2732     !!!cp (205.1);
2733     !!!parse-error (type => 'unclosed md'); ## TODO: type
2734     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2735     }
2736    
2737 wakaba 1.1 ## reconsume
2738 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2739 wakaba 1.1 redo A;
2740 wakaba 1.16 } elsif ($self->{is_xml} and
2741     $self->{ct}->{type} == DOCTYPE_TOKEN and
2742     $self->{nc} == 0x005B) { # [
2743 wakaba 1.12 !!!cp (206.1);
2744     !!!parse-error (type => 'no SYSTEM literal');
2745    
2746     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2747     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2748 wakaba 1.13 $self->{in_subset} = 1;
2749 wakaba 1.12 !!!next-input-character;
2750 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2751 wakaba 1.12 redo A;
2752 wakaba 1.1 } else {
2753     !!!parse-error (type => 'string after SYSTEM');
2754    
2755 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2756     !!!cp (206);
2757     $self->{ct}->{quirks} = 1;
2758     $self->{state} = BOGUS_DOCTYPE_STATE;
2759     } else {
2760     !!!cp (206.2);
2761     $self->{state} = BOGUS_MD_STATE;
2762     }
2763    
2764 wakaba 1.1 !!!next-input-character;
2765     redo A;
2766     }
2767     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2768     if ($self->{nc} == 0x0022) { # "
2769     !!!cp (207);
2770     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2771     !!!next-input-character;
2772     redo A;
2773 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2774 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2775    
2776 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2777     !!!cp (208);
2778     $self->{state} = DATA_STATE;
2779     $self->{s_kwd} = '';
2780     $self->{ct}->{quirks} = 1;
2781     } else {
2782     !!!cp (208.1);
2783     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2784     }
2785    
2786 wakaba 1.1 !!!next-input-character;
2787 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2788 wakaba 1.1 redo A;
2789     } elsif ($self->{nc} == -1) {
2790     !!!parse-error (type => 'unclosed SYSTEM literal');
2791    
2792 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2793     !!!cp (209);
2794     $self->{state} = DATA_STATE;
2795     $self->{s_kwd} = '';
2796     $self->{ct}->{quirks} = 1;
2797     } else {
2798     !!!cp (209.1);
2799     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2800     }
2801    
2802 wakaba 1.1 ## reconsume
2803 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2804 wakaba 1.1 redo A;
2805     } else {
2806     !!!cp (210);
2807 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2808 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2809     length $self->{ct}->{sysid});
2810    
2811     ## Stay in the state
2812     !!!next-input-character;
2813     redo A;
2814     }
2815     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2816     if ($self->{nc} == 0x0027) { # '
2817     !!!cp (211);
2818     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2819     !!!next-input-character;
2820     redo A;
2821 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2822 wakaba 1.1 !!!cp (212);
2823     !!!parse-error (type => 'unclosed SYSTEM literal');
2824    
2825     $self->{state} = DATA_STATE;
2826 wakaba 1.5 $self->{s_kwd} = '';
2827 wakaba 1.1 !!!next-input-character;
2828    
2829     $self->{ct}->{quirks} = 1;
2830     !!!emit ($self->{ct}); # DOCTYPE
2831    
2832     redo A;
2833     } elsif ($self->{nc} == -1) {
2834     !!!parse-error (type => 'unclosed SYSTEM literal');
2835    
2836 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2837     !!!cp (213);
2838     $self->{state} = DATA_STATE;
2839     $self->{s_kwd} = '';
2840     $self->{ct}->{quirks} = 1;
2841     } else {
2842     !!!cp (213.1);
2843     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2844     }
2845    
2846 wakaba 1.1 ## reconsume
2847 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2848 wakaba 1.1 redo A;
2849     } else {
2850     !!!cp (214);
2851 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2852 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2853     length $self->{ct}->{sysid});
2854    
2855     ## Stay in the state
2856     !!!next-input-character;
2857     redo A;
2858     }
2859     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2860     if ($is_space->{$self->{nc}}) {
2861 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2862     !!!cp (215.1);
2863     $self->{state} = BEFORE_NDATA_STATE;
2864     } else {
2865     !!!cp (215);
2866     ## Stay in the state
2867     }
2868 wakaba 1.1 !!!next-input-character;
2869     redo A;
2870     } elsif ($self->{nc} == 0x003E) { # >
2871 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872     !!!cp (216);
2873     $self->{state} = DATA_STATE;
2874     $self->{s_kwd} = '';
2875     } else {
2876     !!!cp (216.1);
2877     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2878     }
2879    
2880 wakaba 1.1 !!!next-input-character;
2881 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2882 wakaba 1.1 redo A;
2883 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2884     ($self->{nc} == 0x004E or # N
2885     $self->{nc} == 0x006E)) { # n
2886     !!!cp (216.2);
2887     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2888     $self->{state} = NDATA_STATE;
2889     $self->{kwd} = chr $self->{nc};
2890     !!!next-input-character;
2891     redo A;
2892 wakaba 1.1 } elsif ($self->{nc} == -1) {
2893 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2894     !!!cp (217);
2895     !!!parse-error (type => 'unclosed DOCTYPE');
2896     $self->{state} = DATA_STATE;
2897     $self->{s_kwd} = '';
2898     $self->{ct}->{quirks} = 1;
2899     } else {
2900     !!!cp (217.1);
2901     !!!parse-error (type => 'unclosed md'); ## TODO: type
2902     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2903     }
2904    
2905 wakaba 1.1 ## reconsume
2906 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2907 wakaba 1.1 redo A;
2908 wakaba 1.16 } elsif ($self->{is_xml} and
2909     $self->{ct}->{type} == DOCTYPE_TOKEN and
2910     $self->{nc} == 0x005B) { # [
2911 wakaba 1.12 !!!cp (218.1);
2912     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2913     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2914 wakaba 1.13 $self->{in_subset} = 1;
2915 wakaba 1.12 !!!next-input-character;
2916 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2917 wakaba 1.12 redo A;
2918 wakaba 1.1 } else {
2919     !!!parse-error (type => 'string after SYSTEM literal');
2920    
2921 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2922     !!!cp (218);
2923     #$self->{ct}->{quirks} = 1;
2924     $self->{state} = BOGUS_DOCTYPE_STATE;
2925     } else {
2926     !!!cp (218.2);
2927     $self->{state} = BOGUS_MD_STATE;
2928     }
2929    
2930 wakaba 1.1 !!!next-input-character;
2931     redo A;
2932     }
2933 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2934     if ($is_space->{$self->{nc}}) {
2935     !!!cp (218.3);
2936     ## Stay in the state.
2937     !!!next-input-character;
2938     redo A;
2939     } elsif ($self->{nc} == 0x003E) { # >
2940     !!!cp (218.4);
2941     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2942     !!!next-input-character;
2943     !!!emit ($self->{ct}); # ENTITY
2944     redo A;
2945     } elsif ($self->{nc} == 0x004E or # N
2946     $self->{nc} == 0x006E) { # n
2947     !!!cp (218.5);
2948     $self->{state} = NDATA_STATE;
2949     $self->{kwd} = chr $self->{nc};
2950     !!!next-input-character;
2951     redo A;
2952     } elsif ($self->{nc} == -1) {
2953     !!!cp (218.6);
2954     !!!parse-error (type => 'unclosed md'); ## TODO: type
2955     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956     ## reconsume
2957     !!!emit ($self->{ct}); # ENTITY
2958     redo A;
2959     } else {
2960     !!!cp (218.7);
2961     !!!parse-error (type => 'string after SYSTEM literal');
2962     $self->{state} = BOGUS_MD_STATE;
2963     !!!next-input-character;
2964     redo A;
2965     }
2966 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2967     if ($self->{nc} == 0x003E) { # >
2968     !!!cp (219);
2969     $self->{state} = DATA_STATE;
2970 wakaba 1.5 $self->{s_kwd} = '';
2971 wakaba 1.1 !!!next-input-character;
2972    
2973     !!!emit ($self->{ct}); # DOCTYPE
2974    
2975     redo A;
2976 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2977 wakaba 1.13 !!!cp (220.1);
2978     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2979     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2980     $self->{in_subset} = 1;
2981     !!!next-input-character;
2982     !!!emit ($self->{ct}); # DOCTYPE
2983     redo A;
2984 wakaba 1.1 } elsif ($self->{nc} == -1) {
2985     !!!cp (220);
2986     $self->{state} = DATA_STATE;
2987 wakaba 1.5 $self->{s_kwd} = '';
2988 wakaba 1.1 ## reconsume
2989    
2990     !!!emit ($self->{ct}); # DOCTYPE
2991    
2992     redo A;
2993     } else {
2994     !!!cp (221);
2995     my $s = '';
2996 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2997 wakaba 1.1
2998     ## Stay in the state
2999     !!!next-input-character;
3000     redo A;
3001     }
3002     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3003     ## NOTE: "CDATA section state" in the spec is jointly implemented
3004     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3005     ## and |CDATA_SECTION_MSE2_STATE|.
3006 wakaba 1.10
3007     ## XML5: "CDATA state".
3008 wakaba 1.1
3009     if ($self->{nc} == 0x005D) { # ]
3010     !!!cp (221.1);
3011     $self->{state} = CDATA_SECTION_MSE1_STATE;
3012     !!!next-input-character;
3013     redo A;
3014     } elsif ($self->{nc} == -1) {
3015 wakaba 1.6 if ($self->{is_xml}) {
3016 wakaba 1.8 !!!cp (221.11);
3017 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3018 wakaba 1.8 } else {
3019     !!!cp (221.12);
3020 wakaba 1.6 }
3021    
3022 wakaba 1.1 $self->{state} = DATA_STATE;
3023 wakaba 1.5 $self->{s_kwd} = '';
3024 wakaba 1.10 ## Reconsume.
3025 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3026     !!!cp (221.2);
3027     !!!emit ($self->{ct}); # character
3028     } else {
3029     !!!cp (221.3);
3030     ## No token to emit. $self->{ct} is discarded.
3031     }
3032     redo A;
3033     } else {
3034     !!!cp (221.4);
3035     $self->{ct}->{data} .= chr $self->{nc};
3036     $self->{read_until}->($self->{ct}->{data},
3037     q<]>,
3038     length $self->{ct}->{data});
3039    
3040     ## Stay in the state.
3041     !!!next-input-character;
3042     redo A;
3043     }
3044    
3045     ## ISSUE: "text tokens" in spec.
3046     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3047 wakaba 1.10 ## XML5: "CDATA bracket state".
3048    
3049 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3050     !!!cp (221.5);
3051     $self->{state} = CDATA_SECTION_MSE2_STATE;
3052     !!!next-input-character;
3053     redo A;
3054     } else {
3055     !!!cp (221.6);
3056 wakaba 1.10 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
3057 wakaba 1.1 $self->{ct}->{data} .= ']';
3058 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3059 wakaba 1.1 ## Reconsume.
3060     redo A;
3061     }
3062     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3063 wakaba 1.10 ## XML5: "CDATA end state".
3064    
3065 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3066     $self->{state} = DATA_STATE;
3067 wakaba 1.5 $self->{s_kwd} = '';
3068 wakaba 1.1 !!!next-input-character;
3069     if (length $self->{ct}->{data}) { # character
3070     !!!cp (221.7);
3071     !!!emit ($self->{ct}); # character
3072     } else {
3073     !!!cp (221.8);
3074     ## No token to emit. $self->{ct} is discarded.
3075     }
3076     redo A;
3077     } elsif ($self->{nc} == 0x005D) { # ]
3078     !!!cp (221.9); # character
3079     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3080     ## Stay in the state.
3081     !!!next-input-character;
3082     redo A;
3083     } else {
3084     !!!cp (221.11);
3085     $self->{ct}->{data} .= ']]'; # character
3086     $self->{state} = CDATA_SECTION_STATE;
3087 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3088 wakaba 1.1 redo A;
3089     }
3090     } elsif ($self->{state} == ENTITY_STATE) {
3091     if ($is_space->{$self->{nc}} or
3092     {
3093     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3094     $self->{entity_add} => 1,
3095     }->{$self->{nc}}) {
3096 wakaba 1.22 if ($self->{is_xml}) {
3097     !!!cp (1001.1);
3098     !!!parse-error (type => 'bare ero',
3099     line => $self->{line_prev},
3100     column => $self->{column_prev}
3101     + ($self->{nc} == -1 ? 1 : 0));
3102     } else {
3103     !!!cp (1001);
3104     ## No error
3105     }
3106 wakaba 1.1 ## Don't consume
3107     ## Return nothing.
3108     #
3109     } elsif ($self->{nc} == 0x0023) { # #
3110     !!!cp (999);
3111     $self->{state} = ENTITY_HASH_STATE;
3112 wakaba 1.12 $self->{kwd} = '#';
3113 wakaba 1.1 !!!next-input-character;
3114     redo A;
3115 wakaba 1.22 } elsif ($self->{is_xml} or
3116     (0x0041 <= $self->{nc} and
3117 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3118     (0x0061 <= $self->{nc} and
3119     $self->{nc} <= 0x007A)) { # a..z
3120     !!!cp (998);
3121     require Whatpm::_NamedEntityList;
3122     $self->{state} = ENTITY_NAME_STATE;
3123 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3124     $self->{entity__value} = $self->{kwd};
3125 wakaba 1.1 $self->{entity__match} = 0;
3126     !!!next-input-character;
3127     redo A;
3128     } else {
3129     !!!cp (1027);
3130     !!!parse-error (type => 'bare ero');
3131     ## Return nothing.
3132     #
3133     }
3134    
3135     ## NOTE: No character is consumed by the "consume a character
3136     ## reference" algorithm. In other words, there is an "&" character
3137     ## that does not introduce a character reference, which would be
3138     ## appended to the parent element or the attribute value in later
3139     ## processing by the tokenizer.
3140    
3141     if ($self->{prev_state} == DATA_STATE) {
3142     !!!cp (997);
3143     $self->{state} = $self->{prev_state};
3144 wakaba 1.5 $self->{s_kwd} = '';
3145 wakaba 1.1 ## Reconsume.
3146     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3147     line => $self->{line_prev},
3148     column => $self->{column_prev},
3149     });
3150     redo A;
3151     } else {
3152     !!!cp (996);
3153     $self->{ca}->{value} .= '&';
3154     $self->{state} = $self->{prev_state};
3155 wakaba 1.5 $self->{s_kwd} = '';
3156 wakaba 1.1 ## Reconsume.
3157     redo A;
3158     }
3159     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3160 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3161 wakaba 1.1 !!!cp (995);
3162     $self->{state} = HEXREF_X_STATE;
3163 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3164 wakaba 1.1 !!!next-input-character;
3165     redo A;
3166 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3167     !!!cp (995.1);
3168     if ($self->{is_xml}) {
3169     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3170     }
3171     $self->{state} = HEXREF_X_STATE;
3172     $self->{kwd} .= chr $self->{nc};
3173     !!!next-input-character;
3174     redo A;
3175 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3176     $self->{nc} <= 0x0039) { # 0..9
3177     !!!cp (994);
3178     $self->{state} = NCR_NUM_STATE;
3179 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3180 wakaba 1.1 !!!next-input-character;
3181     redo A;
3182     } else {
3183     !!!parse-error (type => 'bare nero',
3184     line => $self->{line_prev},
3185     column => $self->{column_prev} - 1);
3186    
3187     ## NOTE: According to the spec algorithm, nothing is returned,
3188     ## and then "&#" is appended to the parent element or the attribute
3189     ## value in the later processing.
3190    
3191     if ($self->{prev_state} == DATA_STATE) {
3192     !!!cp (1019);
3193     $self->{state} = $self->{prev_state};
3194 wakaba 1.5 $self->{s_kwd} = '';
3195 wakaba 1.1 ## Reconsume.
3196     !!!emit ({type => CHARACTER_TOKEN,
3197     data => '&#',
3198     line => $self->{line_prev},
3199     column => $self->{column_prev} - 1,
3200     });
3201     redo A;
3202     } else {
3203     !!!cp (993);
3204     $self->{ca}->{value} .= '&#';
3205     $self->{state} = $self->{prev_state};
3206 wakaba 1.5 $self->{s_kwd} = '';
3207 wakaba 1.1 ## Reconsume.
3208     redo A;
3209     }
3210     }
3211     } elsif ($self->{state} == NCR_NUM_STATE) {
3212     if (0x0030 <= $self->{nc} and
3213     $self->{nc} <= 0x0039) { # 0..9
3214     !!!cp (1012);
3215 wakaba 1.12 $self->{kwd} *= 10;
3216     $self->{kwd} += $self->{nc} - 0x0030;
3217 wakaba 1.1
3218     ## Stay in the state.
3219     !!!next-input-character;
3220     redo A;
3221     } elsif ($self->{nc} == 0x003B) { # ;
3222     !!!cp (1013);
3223     !!!next-input-character;
3224     #
3225     } else {
3226     !!!cp (1014);
3227     !!!parse-error (type => 'no refc');
3228     ## Reconsume.
3229     #
3230     }
3231    
3232 wakaba 1.12 my $code = $self->{kwd};
3233 wakaba 1.1 my $l = $self->{line_prev};
3234     my $c = $self->{column_prev};
3235 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3236     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3237     ($self->{is_xml} and $code == 0x0000)) {
3238 wakaba 1.1 !!!cp (1015);
3239     !!!parse-error (type => 'invalid character reference',
3240     text => (sprintf 'U+%04X', $code),
3241     line => $l, column => $c);
3242     $code = $charref_map->{$code};
3243     } elsif ($code > 0x10FFFF) {
3244     !!!cp (1016);
3245     !!!parse-error (type => 'invalid character reference',
3246     text => (sprintf 'U-%08X', $code),
3247     line => $l, column => $c);
3248     $code = 0xFFFD;
3249     }
3250    
3251     if ($self->{prev_state} == DATA_STATE) {
3252     !!!cp (992);
3253     $self->{state} = $self->{prev_state};
3254 wakaba 1.5 $self->{s_kwd} = '';
3255 wakaba 1.1 ## Reconsume.
3256     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3257 wakaba 1.7 has_reference => 1,
3258 wakaba 1.1 line => $l, column => $c,
3259     });
3260     redo A;
3261     } else {
3262     !!!cp (991);
3263     $self->{ca}->{value} .= chr $code;
3264     $self->{ca}->{has_reference} = 1;
3265     $self->{state} = $self->{prev_state};
3266 wakaba 1.5 $self->{s_kwd} = '';
3267 wakaba 1.1 ## Reconsume.
3268     redo A;
3269     }
3270     } elsif ($self->{state} == HEXREF_X_STATE) {
3271     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3272     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3273     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3274     # 0..9, A..F, a..f
3275     !!!cp (990);
3276     $self->{state} = HEXREF_HEX_STATE;
3277 wakaba 1.12 $self->{kwd} = 0;
3278 wakaba 1.1 ## Reconsume.
3279     redo A;
3280     } else {
3281     !!!parse-error (type => 'bare hcro',
3282     line => $self->{line_prev},
3283     column => $self->{column_prev} - 2);
3284    
3285     ## NOTE: According to the spec algorithm, nothing is returned,
3286     ## and then "&#" followed by "X" or "x" is appended to the parent
3287     ## element or the attribute value in the later processing.
3288    
3289     if ($self->{prev_state} == DATA_STATE) {
3290     !!!cp (1005);
3291     $self->{state} = $self->{prev_state};
3292 wakaba 1.5 $self->{s_kwd} = '';
3293 wakaba 1.1 ## Reconsume.
3294     !!!emit ({type => CHARACTER_TOKEN,
3295 wakaba 1.12 data => '&' . $self->{kwd},
3296 wakaba 1.1 line => $self->{line_prev},
3297 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3298 wakaba 1.1 });
3299     redo A;
3300     } else {
3301     !!!cp (989);
3302 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3303 wakaba 1.1 $self->{state} = $self->{prev_state};
3304 wakaba 1.5 $self->{s_kwd} = '';
3305 wakaba 1.1 ## Reconsume.
3306     redo A;
3307     }
3308     }
3309     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3310     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3311     # 0..9
3312     !!!cp (1002);
3313 wakaba 1.12 $self->{kwd} *= 0x10;
3314     $self->{kwd} += $self->{nc} - 0x0030;
3315 wakaba 1.1 ## Stay in the state.
3316     !!!next-input-character;
3317     redo A;
3318     } elsif (0x0061 <= $self->{nc} and
3319     $self->{nc} <= 0x0066) { # a..f
3320     !!!cp (1003);
3321 wakaba 1.12 $self->{kwd} *= 0x10;
3322     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3323 wakaba 1.1 ## Stay in the state.
3324     !!!next-input-character;
3325     redo A;
3326     } elsif (0x0041 <= $self->{nc} and
3327     $self->{nc} <= 0x0046) { # A..F
3328     !!!cp (1004);
3329 wakaba 1.12 $self->{kwd} *= 0x10;
3330     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3331 wakaba 1.1 ## Stay in the state.
3332     !!!next-input-character;
3333     redo A;
3334     } elsif ($self->{nc} == 0x003B) { # ;
3335     !!!cp (1006);
3336     !!!next-input-character;
3337     #
3338     } else {
3339     !!!cp (1007);
3340     !!!parse-error (type => 'no refc',
3341     line => $self->{line},
3342     column => $self->{column});
3343     ## Reconsume.
3344     #
3345     }
3346    
3347 wakaba 1.12 my $code = $self->{kwd};
3348 wakaba 1.1 my $l = $self->{line_prev};
3349     my $c = $self->{column_prev};
3350 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3351     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3352     ($self->{is_xml} and $code == 0x0000)) {
3353 wakaba 1.1 !!!cp (1008);
3354     !!!parse-error (type => 'invalid character reference',
3355     text => (sprintf 'U+%04X', $code),
3356     line => $l, column => $c);
3357     $code = $charref_map->{$code};
3358     } elsif ($code > 0x10FFFF) {
3359     !!!cp (1009);
3360     !!!parse-error (type => 'invalid character reference',
3361     text => (sprintf 'U-%08X', $code),
3362     line => $l, column => $c);
3363     $code = 0xFFFD;
3364     }
3365    
3366     if ($self->{prev_state} == DATA_STATE) {
3367     !!!cp (988);
3368     $self->{state} = $self->{prev_state};
3369 wakaba 1.5 $self->{s_kwd} = '';
3370 wakaba 1.1 ## Reconsume.
3371     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3372 wakaba 1.7 has_reference => 1,
3373 wakaba 1.1 line => $l, column => $c,
3374     });
3375     redo A;
3376     } else {
3377     !!!cp (987);
3378     $self->{ca}->{value} .= chr $code;
3379     $self->{ca}->{has_reference} = 1;
3380     $self->{state} = $self->{prev_state};
3381 wakaba 1.5 $self->{s_kwd} = '';
3382 wakaba 1.1 ## Reconsume.
3383     redo A;
3384     }
3385     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3386 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3387     $self->{nc} <= 0x005A) or # Z
3388     (0x0061 <= $self->{nc} and # a
3389     $self->{nc} <= 0x007A) or # z
3390     (0x0030 <= $self->{nc} and # 0
3391     $self->{nc} <= 0x0039) or # 9
3392 wakaba 1.22 $self->{nc} == 0x003B or # ;
3393     ($self->{is_xml} and
3394     not ($is_space->{$self->{nc}} or
3395     {
3396     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3397     $self->{entity_add} => 1,
3398     }->{$self->{nc}}))) {
3399 wakaba 1.1 our $EntityChar;
3400 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3401 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3402     $self->{ge}->{$self->{kwd}}) {
3403 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3404 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3405     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3406     !!!cp (1020.1);
3407     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3408     } else {
3409     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3410     !!!cp (1020.2);
3411     !!!parse-error (type => 'unparsed entity', ## TODO: type
3412     value => $self->{kwd});
3413     } else {
3414     !!!cp (1020.3);
3415     }
3416     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3417     }
3418     } else {
3419     if ($self->{is_xml}) {
3420     !!!cp (1020.4);
3421     !!!parse-error (type => 'entity not declared', ## TODO: type
3422     value => $self->{kwd},
3423     level => {
3424     'amp;' => $self->{level}->{warn},
3425     'quot;' => $self->{level}->{warn},
3426     'lt;' => $self->{level}->{warn},
3427     'gt;' => $self->{level}->{warn},
3428     'apos;' => $self->{level}->{warn},
3429     }->{$self->{kwd}} ||
3430     $self->{level}->{must});
3431     } else {
3432     !!!cp (1020);
3433     }
3434     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3435     }
3436 wakaba 1.1 $self->{entity__match} = 1;
3437     !!!next-input-character;
3438     #
3439     } else {
3440     !!!cp (1021);
3441 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3442 wakaba 1.1 $self->{entity__match} = -1;
3443     ## Stay in the state.
3444     !!!next-input-character;
3445     redo A;
3446     }
3447     } else {
3448     !!!cp (1022);
3449     $self->{entity__value} .= chr $self->{nc};
3450     $self->{entity__match} *= 2;
3451     ## Stay in the state.
3452     !!!next-input-character;
3453     redo A;
3454     }
3455     }
3456    
3457     my $data;
3458     my $has_ref;
3459     if ($self->{entity__match} > 0) {
3460     !!!cp (1023);
3461     $data = $self->{entity__value};
3462     $has_ref = 1;
3463     #
3464     } elsif ($self->{entity__match} < 0) {
3465     !!!parse-error (type => 'no refc');
3466     if ($self->{prev_state} != DATA_STATE and # in attribute
3467     $self->{entity__match} < -1) {
3468     !!!cp (1024);
3469 wakaba 1.12 $data = '&' . $self->{kwd};
3470 wakaba 1.1 #
3471     } else {
3472     !!!cp (1025);
3473     $data = $self->{entity__value};
3474     $has_ref = 1;
3475     #
3476     }
3477     } else {
3478     !!!cp (1026);
3479     !!!parse-error (type => 'bare ero',
3480     line => $self->{line_prev},
3481 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3482     $data = '&' . $self->{kwd};
3483 wakaba 1.1 #
3484     }
3485    
3486     ## NOTE: In these cases, when a character reference is found,
3487     ## it is consumed and a character token is returned, or, otherwise,
3488     ## nothing is consumed and returned, according to the spec algorithm.
3489     ## In this implementation, anything that has been examined by the
3490     ## tokenizer is appended to the parent element or the attribute value
3491     ## as string, either literal string when no character reference or
3492     ## entity-replaced string otherwise, in this stage, since any characters
3493     ## that would not be consumed are appended in the data state or in an
3494     ## appropriate attribute value state anyway.
3495    
3496     if ($self->{prev_state} == DATA_STATE) {
3497     !!!cp (986);
3498     $self->{state} = $self->{prev_state};
3499 wakaba 1.5 $self->{s_kwd} = '';
3500 wakaba 1.1 ## Reconsume.
3501     !!!emit ({type => CHARACTER_TOKEN,
3502     data => $data,
3503 wakaba 1.7 has_reference => $has_ref,
3504 wakaba 1.1 line => $self->{line_prev},
3505 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3506 wakaba 1.1 });
3507     redo A;
3508     } else {
3509     !!!cp (985);
3510     $self->{ca}->{value} .= $data;
3511     $self->{ca}->{has_reference} = 1 if $has_ref;
3512     $self->{state} = $self->{prev_state};
3513 wakaba 1.5 $self->{s_kwd} = '';
3514 wakaba 1.1 ## Reconsume.
3515     redo A;
3516     }
3517 wakaba 1.8
3518     ## XML-only states
3519    
3520     } elsif ($self->{state} == PI_STATE) {
3521 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3522    
3523 wakaba 1.8 if ($is_space->{$self->{nc}} or
3524 wakaba 1.14 $self->{nc} == 0x003F or # ?
3525 wakaba 1.8 $self->{nc} == -1) {
3526 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3527     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3528     ## "DOCTYPE pi state": Parse error, switch to the "data
3529     ## state".
3530 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3531     line => $self->{line_prev},
3532     column => $self->{column_prev}
3533     - 1 * ($self->{nc} != -1));
3534     $self->{state} = BOGUS_COMMENT_STATE;
3535     ## Reconsume.
3536     $self->{ct} = {type => COMMENT_TOKEN,
3537     data => '?',
3538     line => $self->{line_prev},
3539     column => $self->{column_prev}
3540     - 1 * ($self->{nc} != -1),
3541     };
3542     redo A;
3543     } else {
3544 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3545 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3546     target => chr $self->{nc},
3547     data => '',
3548     line => $self->{line_prev},
3549     column => $self->{column_prev} - 1,
3550     };
3551     $self->{state} = PI_TARGET_STATE;
3552     !!!next-input-character;
3553     redo A;
3554     }
3555     } elsif ($self->{state} == PI_TARGET_STATE) {
3556     if ($is_space->{$self->{nc}}) {
3557     $self->{state} = PI_TARGET_AFTER_STATE;
3558     !!!next-input-character;
3559     redo A;
3560     } elsif ($self->{nc} == -1) {
3561     !!!parse-error (type => 'no pic'); ## TODO: type
3562 wakaba 1.13 if ($self->{in_subset}) {
3563     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3564     } else {
3565     $self->{state} = DATA_STATE;
3566     $self->{s_kwd} = '';
3567     }
3568 wakaba 1.8 ## Reconsume.
3569     !!!emit ($self->{ct}); # pi
3570     redo A;
3571     } elsif ($self->{nc} == 0x003F) { # ?
3572     $self->{state} = PI_AFTER_STATE;
3573     !!!next-input-character;
3574     redo A;
3575     } else {
3576     ## XML5: typo ("tag name" -> "target")
3577     $self->{ct}->{target} .= chr $self->{nc}; # pi
3578     !!!next-input-character;
3579     redo A;
3580     }
3581     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3582     if ($is_space->{$self->{nc}}) {
3583     ## Stay in the state.
3584     !!!next-input-character;
3585     redo A;
3586     } else {
3587     $self->{state} = PI_DATA_STATE;
3588     ## Reprocess.
3589     redo A;
3590     }
3591     } elsif ($self->{state} == PI_DATA_STATE) {
3592     if ($self->{nc} == 0x003F) { # ?
3593     $self->{state} = PI_DATA_AFTER_STATE;
3594     !!!next-input-character;
3595     redo A;
3596     } elsif ($self->{nc} == -1) {
3597     !!!parse-error (type => 'no pic'); ## TODO: type
3598 wakaba 1.13 if ($self->{in_subset}) {
3599 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3600 wakaba 1.13 } else {
3601     $self->{state} = DATA_STATE;
3602     $self->{s_kwd} = '';
3603     }
3604 wakaba 1.8 ## Reprocess.
3605     !!!emit ($self->{ct}); # pi
3606     redo A;
3607     } else {
3608     $self->{ct}->{data} .= chr $self->{nc}; # pi
3609     $self->{read_until}->($self->{ct}->{data}, q[?],
3610     length $self->{ct}->{data});
3611     ## Stay in the state.
3612     !!!next-input-character;
3613     ## Reprocess.
3614     redo A;
3615     }
3616     } elsif ($self->{state} == PI_AFTER_STATE) {
3617 wakaba 1.14 ## XML5: Part of "Pi after state".
3618    
3619 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3620 wakaba 1.13 if ($self->{in_subset}) {
3621     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3622     } else {
3623     $self->{state} = DATA_STATE;
3624     $self->{s_kwd} = '';
3625     }
3626 wakaba 1.8 !!!next-input-character;
3627     !!!emit ($self->{ct}); # pi
3628     redo A;
3629     } elsif ($self->{nc} == 0x003F) { # ?
3630     !!!parse-error (type => 'no s after target', ## TODO: type
3631     line => $self->{line_prev},
3632     column => $self->{column_prev}); ## XML5: no error
3633     $self->{ct}->{data} .= '?';
3634     $self->{state} = PI_DATA_AFTER_STATE;
3635     !!!next-input-character;
3636     redo A;
3637     } else {
3638     !!!parse-error (type => 'no s after target', ## TODO: type
3639     line => $self->{line_prev},
3640     column => $self->{column_prev}
3641     + 1 * ($self->{nc} == -1)); ## XML5: no error
3642     $self->{ct}->{data} .= '?'; ## XML5: not appended
3643     $self->{state} = PI_DATA_STATE;
3644     ## Reprocess.
3645     redo A;
3646     }
3647     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3648 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3649    
3650 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3651 wakaba 1.13 if ($self->{in_subset}) {
3652     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3653     } else {
3654     $self->{state} = DATA_STATE;
3655     $self->{s_kwd} = '';
3656     }
3657 wakaba 1.8 !!!next-input-character;
3658     !!!emit ($self->{ct}); # pi
3659     redo A;
3660     } elsif ($self->{nc} == 0x003F) { # ?
3661     $self->{ct}->{data} .= '?';
3662     ## Stay in the state.
3663     !!!next-input-character;
3664     redo A;
3665     } else {
3666     $self->{ct}->{data} .= '?'; ## XML5: not appended
3667     $self->{state} = PI_DATA_STATE;
3668     ## Reprocess.
3669     redo A;
3670     }
3671 wakaba 1.12
3672     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3673     if ($self->{nc} == 0x003C) { # <
3674 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3675 wakaba 1.12 !!!next-input-character;
3676     redo A;
3677     } elsif ($self->{nc} == 0x0025) { # %
3678     ## XML5: Not defined yet.
3679    
3680     ## TODO:
3681 wakaba 1.24
3682     if (not $self->{stop_processing} and
3683     not $self->{document}->xml_standalone) {
3684     !!!parse-error (type => 'stop processing', ## TODO: type
3685     level => $self->{level}->{info});
3686     $self->{stop_processing} = 1;
3687     }
3688    
3689 wakaba 1.12 !!!next-input-character;
3690     redo A;
3691     } elsif ($self->{nc} == 0x005D) { # ]
3692 wakaba 1.13 delete $self->{in_subset};
3693 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3694     !!!next-input-character;
3695     redo A;
3696     } elsif ($is_space->{$self->{nc}}) {
3697     ## Stay in the state.
3698     !!!next-input-character;
3699     redo A;
3700     } elsif ($self->{nc} == -1) {
3701     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3702 wakaba 1.13 delete $self->{in_subset};
3703 wakaba 1.12 $self->{state} = DATA_STATE;
3704     $self->{s_kwd} = '';
3705     ## Reconsume.
3706 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3707 wakaba 1.12 redo A;
3708     } else {
3709     unless ($self->{internal_subset_tainted}) {
3710     ## XML5: No parse error.
3711     !!!parse-error (type => 'string in internal subset');
3712     $self->{internal_subset_tainted} = 1;
3713     }
3714     ## Stay in the state.
3715     !!!next-input-character;
3716     redo A;
3717     }
3718     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3719     if ($self->{nc} == 0x003E) { # >
3720     $self->{state} = DATA_STATE;
3721     $self->{s_kwd} = '';
3722     !!!next-input-character;
3723 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3724 wakaba 1.12 redo A;
3725     } elsif ($self->{nc} == -1) {
3726     !!!parse-error (type => 'unclosed DOCTYPE');
3727     $self->{state} = DATA_STATE;
3728     $self->{s_kwd} = '';
3729     ## Reconsume.
3730 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3731 wakaba 1.12 redo A;
3732     } else {
3733     ## XML5: No parse error and stay in the state.
3734     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3735    
3736 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3737     !!!next-input-character;
3738     redo A;
3739     }
3740     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3741     if ($self->{nc} == 0x003E) { # >
3742     $self->{state} = DATA_STATE;
3743     $self->{s_kwd} = '';
3744     !!!next-input-character;
3745     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3746     redo A;
3747     } elsif ($self->{nc} == -1) {
3748     $self->{state} = DATA_STATE;
3749     $self->{s_kwd} = '';
3750     ## Reconsume.
3751     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3752     redo A;
3753     } else {
3754     ## Stay in the state.
3755     !!!next-input-character;
3756     redo A;
3757     }
3758     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3759     if ($self->{nc} == 0x0021) { # !
3760 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3761 wakaba 1.13 !!!next-input-character;
3762     redo A;
3763     } elsif ($self->{nc} == 0x003F) { # ?
3764     $self->{state} = PI_STATE;
3765     !!!next-input-character;
3766     redo A;
3767     } elsif ($self->{nc} == -1) {
3768     !!!parse-error (type => 'bare stago');
3769     $self->{state} = DATA_STATE;
3770     $self->{s_kwd} = '';
3771     ## Reconsume.
3772     redo A;
3773     } else {
3774     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3775     line => $self->{line_prev},
3776     column => $self->{column_prev});
3777     $self->{state} = BOGUS_COMMENT_STATE;
3778     $self->{ct} = {type => COMMENT_TOKEN,
3779     data => '',
3780     }; ## NOTE: Will be discarded.
3781 wakaba 1.12 !!!next-input-character;
3782     redo A;
3783     }
3784 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3785     ## XML5: "DOCTYPE markup declaration state".
3786    
3787     if ($self->{nc} == 0x002D) { # -
3788     $self->{state} = MD_HYPHEN_STATE;
3789     !!!next-input-character;
3790     redo A;
3791 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3792     $self->{nc} == 0x0065) { # e
3793 wakaba 1.14 $self->{state} = MD_E_STATE;
3794     $self->{kwd} = chr $self->{nc};
3795     !!!next-input-character;
3796     redo A;
3797 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3798     $self->{nc} == 0x0061) { # a
3799 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3800     $self->{kwd} = chr $self->{nc};
3801     !!!next-input-character;
3802     redo A;
3803 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3804     $self->{nc} == 0x006E) { # n
3805 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3806     $self->{kwd} = chr $self->{nc};
3807     !!!next-input-character;
3808     redo A;
3809     } else {
3810     #
3811     }
3812    
3813     ## XML5: No parse error.
3814     !!!parse-error (type => 'bogus comment',
3815     line => $self->{line_prev},
3816     column => $self->{column_prev} - 1);
3817     ## Reconsume.
3818     $self->{state} = BOGUS_COMMENT_STATE;
3819     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3820     redo A;
3821     } elsif ($self->{state} == MD_E_STATE) {
3822 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3823     $self->{nc} == 0x006E) { # n
3824 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3825     $self->{kwd} .= chr $self->{nc};
3826     !!!next-input-character;
3827     redo A;
3828 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3829     $self->{nc} == 0x006C) { # l
3830 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3831     $self->{state} = MD_ELEMENT_STATE;
3832     $self->{kwd} .= chr $self->{nc};
3833     !!!next-input-character;
3834     redo A;
3835     } else {
3836     ## XML5: No parse error.
3837     !!!parse-error (type => 'bogus comment',
3838     line => $self->{line_prev},
3839     column => $self->{column_prev} - 2
3840     + 1 * ($self->{nc} == -1));
3841     ## Reconsume.
3842     $self->{state} = BOGUS_COMMENT_STATE;
3843     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3844     redo A;
3845     }
3846     } elsif ($self->{state} == MD_ENTITY_STATE) {
3847 wakaba 1.17 if ($self->{nc} == [
3848     undef,
3849     undef,
3850     0x0054, # T
3851     0x0049, # I
3852     0x0054, # T
3853     ]->[length $self->{kwd}] or
3854     $self->{nc} == [
3855     undef,
3856     undef,
3857     0x0074, # t
3858     0x0069, # i
3859     0x0074, # t
3860     ]->[length $self->{kwd}]) {
3861 wakaba 1.14 ## Stay in the state.
3862     $self->{kwd} .= chr $self->{nc};
3863     !!!next-input-character;
3864     redo A;
3865 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3866     ($self->{nc} == 0x0059 or # Y
3867     $self->{nc} == 0x0079)) { # y
3868     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3869     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3870     text => 'ENTITY',
3871     line => $self->{line_prev},
3872     column => $self->{column_prev} - 4);
3873     }
3874     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3875 wakaba 1.14 line => $self->{line_prev},
3876     column => $self->{column_prev} - 6};
3877     $self->{state} = DOCTYPE_MD_STATE;
3878     !!!next-input-character;
3879     redo A;
3880     } else {
3881     !!!parse-error (type => 'bogus comment',
3882     line => $self->{line_prev},
3883     column => $self->{column_prev} - 1
3884     - (length $self->{kwd})
3885     + 1 * ($self->{nc} == -1));
3886     $self->{state} = BOGUS_COMMENT_STATE;
3887     ## Reconsume.
3888     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3889     redo A;
3890     }
3891     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3892 wakaba 1.17 if ($self->{nc} == [
3893     undef,
3894     undef,
3895     0x0045, # E
3896     0x004D, # M
3897     0x0045, # E
3898     0x004E, # N
3899     ]->[length $self->{kwd}] or
3900     $self->{nc} == [
3901     undef,
3902     undef,
3903     0x0065, # e
3904     0x006D, # m
3905     0x0065, # e
3906     0x006E, # n
3907     ]->[length $self->{kwd}]) {
3908 wakaba 1.14 ## Stay in the state.
3909     $self->{kwd} .= chr $self->{nc};
3910     !!!next-input-character;
3911     redo A;
3912 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3913     ($self->{nc} == 0x0054 or # T
3914     $self->{nc} == 0x0074)) { # t
3915     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3916     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3917     text => 'ELEMENT',
3918     line => $self->{line_prev},
3919     column => $self->{column_prev} - 5);
3920     }
3921 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3922     line => $self->{line_prev},
3923 wakaba 1.23 column => $self->{column_prev} - 7};
3924 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3925     !!!next-input-character;
3926     redo A;
3927     } else {
3928     !!!parse-error (type => 'bogus comment',
3929     line => $self->{line_prev},
3930     column => $self->{column_prev} - 1
3931     - (length $self->{kwd})
3932     + 1 * ($self->{nc} == -1));
3933     $self->{state} = BOGUS_COMMENT_STATE;
3934     ## Reconsume.
3935     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3936     redo A;
3937     }
3938     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3939 wakaba 1.17 if ($self->{nc} == [
3940     undef,
3941     0x0054, # T
3942     0x0054, # T
3943     0x004C, # L
3944     0x0049, # I
3945     0x0053, # S
3946     ]->[length $self->{kwd}] or
3947     $self->{nc} == [
3948     undef,
3949     0x0074, # t
3950     0x0074, # t
3951     0x006C, # l
3952     0x0069, # i
3953     0x0073, # s
3954     ]->[length $self->{kwd}]) {
3955 wakaba 1.14 ## Stay in the state.
3956     $self->{kwd} .= chr $self->{nc};
3957     !!!next-input-character;
3958     redo A;
3959 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3960     ($self->{nc} == 0x0054 or # T
3961     $self->{nc} == 0x0074)) { # t
3962     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3963     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3964     text => 'ATTLIST',
3965     line => $self->{line_prev},
3966     column => $self->{column_prev} - 5);
3967     }
3968 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3969 wakaba 1.15 attrdefs => [],
3970 wakaba 1.14 line => $self->{line_prev},
3971 wakaba 1.23 column => $self->{column_prev} - 7};
3972 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3973     !!!next-input-character;
3974     redo A;
3975     } else {
3976     !!!parse-error (type => 'bogus comment',
3977     line => $self->{line_prev},
3978     column => $self->{column_prev} - 1
3979     - (length $self->{kwd})
3980     + 1 * ($self->{nc} == -1));
3981     $self->{state} = BOGUS_COMMENT_STATE;
3982     ## Reconsume.
3983     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3984     redo A;
3985     }
3986     } elsif ($self->{state} == MD_NOTATION_STATE) {
3987 wakaba 1.17 if ($self->{nc} == [
3988     undef,
3989     0x004F, # O
3990     0x0054, # T
3991     0x0041, # A
3992     0x0054, # T
3993     0x0049, # I
3994     0x004F, # O
3995     ]->[length $self->{kwd}] or
3996     $self->{nc} == [
3997     undef,
3998     0x006F, # o
3999     0x0074, # t
4000     0x0061, # a
4001     0x0074, # t
4002     0x0069, # i
4003     0x006F, # o
4004     ]->[length $self->{kwd}]) {
4005 wakaba 1.14 ## Stay in the state.
4006     $self->{kwd} .= chr $self->{nc};
4007     !!!next-input-character;
4008     redo A;
4009 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4010     ($self->{nc} == 0x004E or # N
4011     $self->{nc} == 0x006E)) { # n
4012     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4013     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4014     text => 'NOTATION',
4015     line => $self->{line_prev},
4016     column => $self->{column_prev} - 6);
4017     }
4018 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4019     line => $self->{line_prev},
4020 wakaba 1.23 column => $self->{column_prev} - 8};
4021 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4022     !!!next-input-character;
4023     redo A;
4024     } else {
4025     !!!parse-error (type => 'bogus comment',
4026     line => $self->{line_prev},
4027     column => $self->{column_prev} - 1
4028     - (length $self->{kwd})
4029     + 1 * ($self->{nc} == -1));
4030     $self->{state} = BOGUS_COMMENT_STATE;
4031     ## Reconsume.
4032     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4033     redo A;
4034     }
4035     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4036     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4037     ## "DOCTYPE NOTATION state".
4038    
4039     if ($is_space->{$self->{nc}}) {
4040     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4041     $self->{state} = BEFORE_MD_NAME_STATE;
4042     !!!next-input-character;
4043     redo A;
4044     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4045     $self->{nc} == 0x0025) { # %
4046     ## XML5: Switch to the "DOCTYPE bogus comment state".
4047     !!!parse-error (type => 'no space before md name'); ## TODO: type
4048     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4049     !!!next-input-character;
4050     redo A;
4051     } elsif ($self->{nc} == -1) {
4052     !!!parse-error (type => 'unclosed md'); ## TODO: type
4053     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4054     ## Reconsume.
4055     redo A;
4056     } elsif ($self->{nc} == 0x003E) { # >
4057     ## XML5: Switch to the "DOCTYPE bogus comment state".
4058     !!!parse-error (type => 'no md name'); ## TODO: type
4059     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4060     !!!next-input-character;
4061     redo A;
4062     } else {
4063     ## XML5: Switch to the "DOCTYPE bogus comment state".
4064     !!!parse-error (type => 'no space before md name'); ## TODO: type
4065     $self->{state} = BEFORE_MD_NAME_STATE;
4066     redo A;
4067     }
4068     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4069     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4070     ## before state", "DOCTYPE ATTLIST name before state".
4071    
4072     if ($is_space->{$self->{nc}}) {
4073     ## Stay in the state.
4074     !!!next-input-character;
4075     redo A;
4076     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4077     $self->{nc} == 0x0025) { # %
4078     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4079     !!!next-input-character;
4080     redo A;
4081     } elsif ($self->{nc} == 0x003E) { # >
4082     ## XML5: Same as "Anything else".
4083     !!!parse-error (type => 'no md name'); ## TODO: type
4084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4085     !!!next-input-character;
4086     redo A;
4087     } elsif ($self->{nc} == -1) {
4088     !!!parse-error (type => 'unclosed md'); ## TODO: type
4089     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4090     ## Reconsume.
4091     redo A;
4092     } else {
4093     ## XML5: [ATTLIST] Not defined yet.
4094     $self->{ct}->{name} .= chr $self->{nc};
4095     $self->{state} = MD_NAME_STATE;
4096     !!!next-input-character;
4097     redo A;
4098     }
4099     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4100     if ($is_space->{$self->{nc}}) {
4101     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4102     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4103     $self->{state} = BEFORE_MD_NAME_STATE;
4104     !!!next-input-character;
4105     redo A;
4106     } elsif ($self->{nc} == 0x003E) { # >
4107     ## XML5: Same as "Anything else".
4108     !!!parse-error (type => 'no md name'); ## TODO: type
4109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4110     !!!next-input-character;
4111     redo A;
4112     } elsif ($self->{nc} == -1) {
4113     !!!parse-error (type => 'unclosed md');
4114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4115     ## Reconsume.
4116     redo A;
4117     } else {
4118     ## XML5: No parse error.
4119     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4120     $self->{state} = BOGUS_COMMENT_STATE;
4121     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4122     ## Reconsume.
4123     redo A;
4124     }
4125     } elsif ($self->{state} == MD_NAME_STATE) {
4126     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4127    
4128     if ($is_space->{$self->{nc}}) {
4129 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4130     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4131     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4132 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4133 wakaba 1.16 } else { # ENTITY/NOTATION
4134     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4135     }
4136 wakaba 1.14 !!!next-input-character;
4137     redo A;
4138     } elsif ($self->{nc} == 0x003E) { # >
4139     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4140     #
4141     } else {
4142 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4143 wakaba 1.14 }
4144     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4145     !!!next-input-character;
4146     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4147     redo A;
4148     } elsif ($self->{nc} == -1) {
4149     ## XML5: [ATTLIST] No parse error.
4150     !!!parse-error (type => 'unclosed md');
4151     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4152     ## Reconsume.
4153     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4154     redo A;
4155     } else {
4156     ## XML5: [ATTLIST] Not defined yet.
4157     $self->{ct}->{name} .= chr $self->{nc};
4158     ## Stay in the state.
4159     !!!next-input-character;
4160     redo A;
4161     }
4162     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4163     if ($is_space->{$self->{nc}}) {
4164     ## Stay in the state.
4165     !!!next-input-character;
4166     redo A;
4167     } elsif ($self->{nc} == 0x003E) { # >
4168     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4169     !!!next-input-character;
4170     !!!emit ($self->{ct}); # ATTLIST
4171     redo A;
4172     } elsif ($self->{nc} == -1) {
4173     ## XML5: No parse error.
4174     !!!parse-error (type => 'unclosed md'); ## TODO: type
4175     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4176 wakaba 1.15 !!!emit ($self->{ct});
4177     redo A;
4178     } else {
4179     ## XML5: Not defined yet.
4180     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4181     tokens => [],
4182     line => $self->{line}, column => $self->{column}};
4183     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4184     !!!next-input-character;
4185     redo A;
4186     }
4187     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4188     if ($is_space->{$self->{nc}}) {
4189     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4190     !!!next-input-character;
4191     redo A;
4192     } elsif ($self->{nc} == 0x003E) { # >
4193     ## XML5: Same as "anything else".
4194     !!!parse-error (type => 'no attr type'); ## TODO: type
4195     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4196     !!!next-input-character;
4197     !!!emit ($self->{ct}); # ATTLIST
4198     redo A;
4199     } elsif ($self->{nc} == 0x0028) { # (
4200     ## XML5: Same as "anything else".
4201     !!!parse-error (type => 'no space before paren'); ## TODO: type
4202     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4203     !!!next-input-character;
4204     redo A;
4205     } elsif ($self->{nc} == -1) {
4206     ## XML5: No parse error.
4207     !!!parse-error (type => 'unclosed md'); ## TODO: type
4208     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4209     !!!next-input-character;
4210     !!!emit ($self->{ct}); # ATTLIST
4211     redo A;
4212     } else {
4213     ## XML5: Not defined yet.
4214     $self->{ca}->{name} .= chr $self->{nc};
4215     ## Stay in the state.
4216     !!!next-input-character;
4217     redo A;
4218     }
4219     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4220     if ($is_space->{$self->{nc}}) {
4221     ## Stay in the state.
4222     !!!next-input-character;
4223     redo A;
4224     } elsif ($self->{nc} == 0x003E) { # >
4225     ## XML5: Same as "anything else".
4226     !!!parse-error (type => 'no attr type'); ## TODO: type
4227     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4228     !!!next-input-character;
4229     !!!emit ($self->{ct}); # ATTLIST
4230     redo A;
4231     } elsif ($self->{nc} == 0x0028) { # (
4232     ## XML5: Same as "anything else".
4233     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4234     !!!next-input-character;
4235     redo A;
4236     } elsif ($self->{nc} == -1) {
4237     ## XML5: No parse error.
4238     !!!parse-error (type => 'unclosed md'); ## TODO: type
4239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4240     !!!next-input-character;
4241     !!!emit ($self->{ct});
4242 wakaba 1.14 redo A;
4243     } else {
4244     ## XML5: Not defined yet.
4245 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4246     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4247     !!!next-input-character;
4248     redo A;
4249     }
4250     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4251     if ($is_space->{$self->{nc}}) {
4252     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4253     !!!next-input-character;
4254     redo A;
4255     } elsif ($self->{nc} == 0x0023) { # #
4256     ## XML5: Same as "anything else".
4257     !!!parse-error (type => 'no space before default value'); ## TODO: type
4258     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4259     !!!next-input-character;
4260     redo A;
4261     } elsif ($self->{nc} == 0x0022) { # "
4262     ## XML5: Same as "anything else".
4263     !!!parse-error (type => 'no space before default value'); ## TODO: type
4264     $self->{ca}->{value} = '';
4265     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4266     !!!next-input-character;
4267     redo A;
4268     } elsif ($self->{nc} == 0x0027) { # '
4269     ## XML5: Same as "anything else".
4270     !!!parse-error (type => 'no space before default value'); ## TODO: type
4271     $self->{ca}->{value} = '';
4272     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4273     !!!next-input-character;
4274     redo A;
4275     } elsif ($self->{nc} == 0x003E) { # >
4276     ## XML5: Same as "anything else".
4277     !!!parse-error (type => 'no attr default'); ## TODO: type
4278     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4279     !!!next-input-character;
4280     !!!emit ($self->{ct}); # ATTLIST
4281     redo A;
4282     } elsif ($self->{nc} == 0x0028) { # (
4283     ## XML5: Same as "anything else".
4284     !!!parse-error (type => 'no space before paren'); ## TODO: type
4285     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4286     !!!next-input-character;
4287     redo A;
4288     } elsif ($self->{nc} == -1) {
4289     ## XML5: No parse error.
4290     !!!parse-error (type => 'unclosed md'); ## TODO: type
4291     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4292     !!!next-input-character;
4293     !!!emit ($self->{ct});
4294     redo A;
4295     } else {
4296     ## XML5: Not defined yet.
4297     $self->{ca}->{type} .= chr $self->{nc};
4298     ## Stay in the state.
4299     !!!next-input-character;
4300     redo A;
4301     }
4302     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4303     if ($is_space->{$self->{nc}}) {
4304     ## Stay in the state.
4305     !!!next-input-character;
4306     redo A;
4307     } elsif ($self->{nc} == 0x0028) { # (
4308     ## XML5: Same as "anything else".
4309     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4310     !!!next-input-character;
4311     redo A;
4312     } elsif ($self->{nc} == 0x0023) { # #
4313     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4314     !!!next-input-character;
4315     redo A;
4316     } elsif ($self->{nc} == 0x0022) { # "
4317     ## XML5: Same as "anything else".
4318     $self->{ca}->{value} = '';
4319     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4320     !!!next-input-character;
4321     redo A;
4322     } elsif ($self->{nc} == 0x0027) { # '
4323     ## XML5: Same as "anything else".
4324     $self->{ca}->{value} = '';
4325     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4326     !!!next-input-character;
4327     redo A;
4328     } elsif ($self->{nc} == 0x003E) { # >
4329     ## XML5: Same as "anything else".
4330     !!!parse-error (type => 'no attr default'); ## TODO: type
4331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4332     !!!next-input-character;
4333     !!!emit ($self->{ct}); # ATTLIST
4334     redo A;
4335     } elsif ($self->{nc} == -1) {
4336     ## XML5: No parse error.
4337     !!!parse-error (type => 'unclosed md'); ## TODO: type
4338     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4339     !!!next-input-character;
4340     !!!emit ($self->{ct});
4341     redo A;
4342     } else {
4343     ## XML5: Switch to the "DOCTYPE bogus comment state".
4344     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4345     $self->{ca}->{value} = '';
4346     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4347     ## Reconsume.
4348     redo A;
4349     }
4350     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4351     if ($is_space->{$self->{nc}}) {
4352     ## Stay in the state.
4353     !!!next-input-character;
4354     redo A;
4355     } elsif ($self->{nc} == 0x007C) { # |
4356     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4357     ## Stay in the state.
4358     !!!next-input-character;
4359     redo A;
4360     } elsif ($self->{nc} == 0x0029) { # )
4361     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4362     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4363     !!!next-input-character;
4364     redo A;
4365     } elsif ($self->{nc} == 0x003E) { # >
4366     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4367     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4368     !!!next-input-character;
4369     !!!emit ($self->{ct}); # ATTLIST
4370     redo A;
4371     } elsif ($self->{nc} == -1) {
4372     ## XML5: No parse error.
4373     !!!parse-error (type => 'unclosed md'); ## TODO: type
4374     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4375     !!!next-input-character;
4376     !!!emit ($self->{ct});
4377     redo A;
4378     } else {
4379     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4380     $self->{state} = ALLOWED_TOKEN_STATE;
4381     !!!next-input-character;
4382     redo A;
4383     }
4384     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4385     if ($is_space->{$self->{nc}}) {
4386     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4387     !!!next-input-character;
4388     redo A;
4389     } elsif ($self->{nc} == 0x007C) { # |
4390     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4391     !!!next-input-character;
4392     redo A;
4393     } elsif ($self->{nc} == 0x0029) { # )
4394     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4395     !!!next-input-character;
4396     redo A;
4397     } elsif ($self->{nc} == 0x003E) { # >
4398     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4399     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4400     !!!next-input-character;
4401     !!!emit ($self->{ct}); # ATTLIST
4402     redo A;
4403     } elsif ($self->{nc} == -1) {
4404     ## XML5: No parse error.
4405     !!!parse-error (type => 'unclosed md'); ## TODO: type
4406     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4407     !!!next-input-character;
4408     !!!emit ($self->{ct});
4409     redo A;
4410     } else {
4411     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4412     ## Stay in the state.
4413     !!!next-input-character;
4414     redo A;
4415     }
4416     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4417     if ($is_space->{$self->{nc}}) {
4418     ## Stay in the state.
4419     !!!next-input-character;
4420     redo A;
4421     } elsif ($self->{nc} == 0x007C) { # |
4422     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4423     !!!next-input-character;
4424     redo A;
4425     } elsif ($self->{nc} == 0x0029) { # )
4426     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4427     !!!next-input-character;
4428     redo A;
4429     } elsif ($self->{nc} == 0x003E) { # >
4430     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4431     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4432     !!!next-input-character;
4433     !!!emit ($self->{ct}); # ATTLIST
4434     redo A;
4435     } elsif ($self->{nc} == -1) {
4436     ## XML5: No parse error.
4437     !!!parse-error (type => 'unclosed md'); ## TODO: type
4438     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4439     !!!next-input-character;
4440     !!!emit ($self->{ct});
4441     redo A;
4442     } else {
4443     !!!parse-error (type => 'space in allowed token', ## TODO: type
4444     line => $self->{line_prev},
4445     column => $self->{column_prev});
4446     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4447     $self->{state} = ALLOWED_TOKEN_STATE;
4448     !!!next-input-character;
4449     redo A;
4450     }
4451     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4452     if ($is_space->{$self->{nc}}) {
4453     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4454     !!!next-input-character;
4455     redo A;
4456     } elsif ($self->{nc} == 0x0023) { # #
4457     !!!parse-error (type => 'no space before default value'); ## TODO: type
4458     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4459     !!!next-input-character;
4460     redo A;
4461     } elsif ($self->{nc} == 0x0022) { # "
4462     !!!parse-error (type => 'no space before default value'); ## TODO: type
4463     $self->{ca}->{value} = '';
4464     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4465     !!!next-input-character;
4466     redo A;
4467     } elsif ($self->{nc} == 0x0027) { # '
4468     !!!parse-error (type => 'no space before default value'); ## TODO: type
4469     $self->{ca}->{value} = '';
4470     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4471     !!!next-input-character;
4472     redo A;
4473     } elsif ($self->{nc} == 0x003E) { # >
4474     !!!parse-error (type => 'no attr default'); ## TODO: type
4475     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4476     !!!next-input-character;
4477     !!!emit ($self->{ct}); # ATTLIST
4478     redo A;
4479     } elsif ($self->{nc} == -1) {
4480     !!!parse-error (type => 'unclosed md'); ## TODO: type
4481     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4482     !!!next-input-character;
4483     !!!emit ($self->{ct});
4484     redo A;
4485     } else {
4486     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4487     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4488     ## Reconsume.
4489     redo A;
4490     }
4491     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4492     if ($is_space->{$self->{nc}}) {
4493     ## Stay in the state.
4494     !!!next-input-character;
4495     redo A;
4496     } elsif ($self->{nc} == 0x0023) { # #
4497     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4498     !!!next-input-character;
4499     redo A;
4500     } elsif ($self->{nc} == 0x0022) { # "
4501     $self->{ca}->{value} = '';
4502     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4503     !!!next-input-character;
4504     redo A;
4505     } elsif ($self->{nc} == 0x0027) { # '
4506     $self->{ca}->{value} = '';
4507     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4508     !!!next-input-character;
4509     redo A;
4510     } elsif ($self->{nc} == 0x003E) { # >
4511     !!!parse-error (type => 'no attr default'); ## TODO: type
4512     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4513     !!!next-input-character;
4514     !!!emit ($self->{ct}); # ATTLIST
4515     redo A;
4516     } elsif ($self->{nc} == -1) {
4517     !!!parse-error (type => 'unclosed md'); ## TODO: type
4518     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4519     !!!next-input-character;
4520     !!!emit ($self->{ct});
4521     redo A;
4522     } else {
4523     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4524     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4525     ## Reconsume.
4526     redo A;
4527     }
4528     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4529     if ($is_space->{$self->{nc}}) {
4530     ## XML5: No parse error.
4531     !!!parse-error (type => 'no default type'); ## TODO: type
4532 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4533 wakaba 1.14 ## Reconsume.
4534     redo A;
4535 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4536     ## XML5: Same as "anything else".
4537     $self->{ca}->{value} = '';
4538     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4539     !!!next-input-character;
4540     redo A;
4541     } elsif ($self->{nc} == 0x0027) { # '
4542     ## XML5: Same as "anything else".
4543     $self->{ca}->{value} = '';
4544     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4545     !!!next-input-character;
4546     redo A;
4547     } elsif ($self->{nc} == 0x003E) { # >
4548     ## XML5: Same as "anything else".
4549     !!!parse-error (type => 'no attr default'); ## TODO: type
4550     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4551     !!!next-input-character;
4552     !!!emit ($self->{ct}); # ATTLIST
4553     redo A;
4554     } elsif ($self->{nc} == -1) {
4555     ## XML5: No parse error.
4556     !!!parse-error (type => 'unclosed md'); ## TODO: type
4557     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4558     !!!next-input-character;
4559     !!!emit ($self->{ct});
4560     redo A;
4561     } else {
4562     $self->{ca}->{default} = chr $self->{nc};
4563     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4564     !!!next-input-character;
4565     redo A;
4566 wakaba 1.14 }
4567 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4568     if ($is_space->{$self->{nc}}) {
4569     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4570     !!!next-input-character;
4571     redo A;
4572     } elsif ($self->{nc} == 0x0022) { # "
4573     ## XML5: Same as "anything else".
4574     !!!parse-error (type => 'no space before default value'); ## TODO: type
4575     $self->{ca}->{value} = '';
4576     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4577     !!!next-input-character;
4578     redo A;
4579     } elsif ($self->{nc} == 0x0027) { # '
4580     ## XML5: Same as "anything else".
4581     !!!parse-error (type => 'no space before default value'); ## TODO: type
4582     $self->{ca}->{value} = '';
4583     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4584     !!!next-input-character;
4585     redo A;
4586     } elsif ($self->{nc} == 0x003E) { # >
4587     ## XML5: Same as "anything else".
4588     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4589     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4590     !!!next-input-character;
4591     !!!emit ($self->{ct}); # ATTLIST
4592     redo A;
4593     } elsif ($self->{nc} == -1) {
4594     ## XML5: No parse error.
4595     !!!parse-error (type => 'unclosed md'); ## TODO: type
4596     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4597     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4598     !!!next-input-character;
4599     !!!emit ($self->{ct});
4600     redo A;
4601     } else {
4602     $self->{ca}->{default} .= chr $self->{nc};
4603     ## Stay in the state.
4604     !!!next-input-character;
4605     redo A;
4606     }
4607     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4608     if ($is_space->{$self->{nc}}) {
4609     ## Stay in the state.
4610     !!!next-input-character;
4611     redo A;
4612     } elsif ($self->{nc} == 0x0022) { # "
4613     $self->{ca}->{value} = '';
4614     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4615     !!!next-input-character;
4616     redo A;
4617     } elsif ($self->{nc} == 0x0027) { # '
4618     $self->{ca}->{value} = '';
4619     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4620     !!!next-input-character;
4621     redo A;
4622     } elsif ($self->{nc} == 0x003E) { # >
4623     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4624     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4625     !!!next-input-character;
4626     !!!emit ($self->{ct}); # ATTLIST
4627     redo A;
4628     } elsif ($self->{nc} == -1) {
4629     ## XML5: No parse error.
4630     !!!parse-error (type => 'unclosed md'); ## TODO: type
4631     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4632     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4633     !!!next-input-character;
4634     !!!emit ($self->{ct});
4635     redo A;
4636     } else {
4637     ## XML5: Not defined yet.
4638     if ($self->{ca}->{default} eq 'FIXED') {
4639     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4640     } else {
4641     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4642     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4643     }
4644     ## Reconsume.
4645     redo A;
4646     }
4647     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4648     if ($is_space->{$self->{nc}} or
4649     $self->{nc} == -1 or
4650     $self->{nc} == 0x003E) { # >
4651     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4652     ## Reconsume.
4653     redo A;
4654     } else {
4655     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4656     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4657     ## Reconsume.
4658     redo A;
4659 wakaba 1.16 }
4660 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4661     ## ASCII case-insensitive
4662     if ($self->{nc} == [
4663     undef,
4664     0x0044, # D
4665     0x0041, # A
4666     0x0054, # T
4667     ]->[length $self->{kwd}] or
4668     $self->{nc} == [
4669     undef,
4670     0x0064, # d
4671     0x0061, # a
4672     0x0074, # t
4673     ]->[length $self->{kwd}]) {
4674     !!!cp (172.2);
4675     ## Stay in the state.
4676     $self->{kwd} .= chr $self->{nc};
4677     !!!next-input-character;
4678     redo A;
4679     } elsif ((length $self->{kwd}) == 4 and
4680     ($self->{nc} == 0x0041 or # A
4681     $self->{nc} == 0x0061)) { # a
4682     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4683     !!!cp (172.3);
4684     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4685     text => 'NDATA',
4686     line => $self->{line_prev},
4687     column => $self->{column_prev} - 4);
4688     } else {
4689     !!!cp (172.4);
4690     }
4691     $self->{state} = AFTER_NDATA_STATE;
4692     !!!next-input-character;
4693     redo A;
4694     } else {
4695     !!!parse-error (type => 'string after literal', ## TODO: type
4696     line => $self->{line_prev},
4697     column => $self->{column_prev} + 1
4698     - length $self->{kwd});
4699     !!!cp (172.5);
4700     $self->{state} = BOGUS_MD_STATE;
4701     ## Reconsume.
4702     redo A;
4703     }
4704     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4705     if ($is_space->{$self->{nc}}) {
4706     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4707     !!!next-input-character;
4708     redo A;
4709     } elsif ($self->{nc} == 0x003E) { # >
4710     !!!parse-error (type => 'no notation name'); ## TODO: type
4711     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4712     !!!next-input-character;
4713     !!!emit ($self->{ct}); # ENTITY
4714     redo A;
4715     } elsif ($self->{nc} == -1) {
4716     !!!parse-error (type => 'unclosed md'); ## TODO: type
4717     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4718     !!!next-input-character;
4719     !!!emit ($self->{ct}); # ENTITY
4720     redo A;
4721     } else {
4722     !!!parse-error (type => 'string after literal', ## TODO: type
4723     line => $self->{line_prev},
4724     column => $self->{column_prev} + 1
4725     - length $self->{kwd});
4726     $self->{state} = BOGUS_MD_STATE;
4727     ## Reconsume.
4728     redo A;
4729     }
4730     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4731     if ($is_space->{$self->{nc}}) {
4732     ## Stay in the state.
4733     !!!next-input-character;
4734     redo A;
4735     } elsif ($self->{nc} == 0x003E) { # >
4736     !!!parse-error (type => 'no notation name'); ## TODO: type
4737     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4738     !!!next-input-character;
4739     !!!emit ($self->{ct}); # ENTITY
4740     redo A;
4741     } elsif ($self->{nc} == -1) {
4742     !!!parse-error (type => 'unclosed md'); ## TODO: type
4743     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4744     !!!next-input-character;
4745     !!!emit ($self->{ct}); # ENTITY
4746     redo A;
4747     } else {
4748     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4749     $self->{state} = NOTATION_NAME_STATE;
4750     !!!next-input-character;
4751     redo A;
4752     }
4753     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4754     if ($is_space->{$self->{nc}}) {
4755 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4756 wakaba 1.18 !!!next-input-character;
4757     redo A;
4758     } elsif ($self->{nc} == 0x003E) { # >
4759     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4760     !!!next-input-character;
4761     !!!emit ($self->{ct}); # ENTITY
4762     redo A;
4763     } elsif ($self->{nc} == -1) {
4764     !!!parse-error (type => 'unclosed md'); ## TODO: type
4765     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4766     !!!next-input-character;
4767     !!!emit ($self->{ct}); # ENTITY
4768     redo A;
4769     } else {
4770     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4771     ## Stay in the state.
4772     !!!next-input-character;
4773     redo A;
4774     }
4775 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4776     if ($self->{nc} == 0x0022) { # "
4777 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4778 wakaba 1.19 !!!next-input-character;
4779     redo A;
4780     } elsif ($self->{nc} == 0x0026) { # &
4781     $self->{prev_state} = $self->{state};
4782     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4783     $self->{entity_add} = 0x0022; # "
4784     !!!next-input-character;
4785     redo A;
4786     ## TODO: %
4787     } elsif ($self->{nc} == -1) {
4788     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4789     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4790     ## Reconsume.
4791     !!!emit ($self->{ct}); # ENTITY
4792     redo A;
4793     } else {
4794     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4795     !!!next-input-character;
4796     redo A;
4797     }
4798     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4799     if ($self->{nc} == 0x0027) { # '
4800 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4801 wakaba 1.19 !!!next-input-character;
4802     redo A;
4803     } elsif ($self->{nc} == 0x0026) { # &
4804     $self->{prev_state} = $self->{state};
4805     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4806     $self->{entity_add} = 0x0027; # '
4807     !!!next-input-character;
4808     redo A;
4809     ## TODO: %
4810     } elsif ($self->{nc} == -1) {
4811     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4812     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4813     ## Reconsume.
4814     !!!emit ($self->{ct}); # ENTITY
4815     redo A;
4816     } else {
4817     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4818     !!!next-input-character;
4819     redo A;
4820     }
4821     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4822     if ($is_space->{$self->{nc}} or
4823     {
4824     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4825     $self->{entity_add} => 1,
4826     }->{$self->{nc}}) {
4827 wakaba 1.22 !!!parse-error (type => 'bare ero',
4828     line => $self->{line_prev},
4829     column => $self->{column_prev}
4830     + ($self->{nc} == -1 ? 1 : 0));
4831 wakaba 1.19 ## Don't consume
4832     ## Return nothing.
4833     #
4834     } elsif ($self->{nc} == 0x0023) { # #
4835     $self->{ca} = $self->{ct};
4836     $self->{state} = ENTITY_HASH_STATE;
4837     $self->{kwd} = '#';
4838     !!!next-input-character;
4839     redo A;
4840     } else {
4841     #
4842     }
4843    
4844     $self->{ct}->{value} .= '&';
4845     $self->{state} = $self->{prev_state};
4846     ## Reconsume.
4847     redo A;
4848 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4849     if ($is_space->{$self->{nc}}) {
4850     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4851     !!!next-input-character;
4852     redo A;
4853     } elsif ($self->{nc} == 0x0028) { # (
4854     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4855     $self->{ct}->{content} = ['('];
4856     $self->{group_depth} = 1;
4857     !!!next-input-character;
4858     redo A;
4859     } elsif ($self->{nc} == 0x003E) { # >
4860     !!!parse-error (type => 'no md def'); ## TODO: type
4861     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4862     !!!next-input-character;
4863     !!!emit ($self->{ct}); # ELEMENT
4864     redo A;
4865     } elsif ($self->{nc} == -1) {
4866     !!!parse-error (type => 'unclosed md'); ## TODO: type
4867     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4868     !!!next-input-character;
4869     !!!emit ($self->{ct}); # ELEMENT
4870     redo A;
4871     } else {
4872     $self->{ct}->{content} = [chr $self->{nc}];
4873     $self->{state} = CONTENT_KEYWORD_STATE;
4874     !!!next-input-character;
4875     redo A;
4876     }
4877     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4878     if ($is_space->{$self->{nc}}) {
4879     $self->{state} = AFTER_MD_DEF_STATE;
4880     !!!next-input-character;
4881     redo A;
4882     } elsif ($self->{nc} == 0x003E) { # >
4883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4884     !!!next-input-character;
4885     !!!emit ($self->{ct}); # ELEMENT
4886     redo A;
4887     } elsif ($self->{nc} == -1) {
4888     !!!parse-error (type => 'unclosed md'); ## TODO: type
4889     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4890     !!!next-input-character;
4891     !!!emit ($self->{ct}); # ELEMENT
4892     redo A;
4893     } else {
4894     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4895     ## Stay in the state.
4896     !!!next-input-character;
4897     redo A;
4898     }
4899     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4900     if ($is_space->{$self->{nc}}) {
4901     ## Stay in the state.
4902     !!!next-input-character;
4903     redo A;
4904     } elsif ($self->{nc} == 0x0028) { # (
4905     $self->{group_depth}++;
4906     push @{$self->{ct}->{content}}, chr $self->{nc};
4907     ## Stay in the state.
4908     !!!next-input-character;
4909     redo A;
4910     } elsif ($self->{nc} == 0x007C or # |
4911     $self->{nc} == 0x002C) { # ,
4912     !!!parse-error (type => 'empty element name'); ## TODO: type
4913     ## Stay in the state.
4914     !!!next-input-character;
4915     redo A;
4916     } elsif ($self->{nc} == 0x0029) { # )
4917     !!!parse-error (type => 'empty element name'); ## TODO: type
4918     push @{$self->{ct}->{content}}, chr $self->{nc};
4919     $self->{group_depth}--;
4920     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4921     !!!next-input-character;
4922     redo A;
4923     } elsif ($self->{nc} == 0x003E) { # >
4924     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4925     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4926     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4927     !!!next-input-character;
4928     !!!emit ($self->{ct}); # ELEMENT
4929     redo A;
4930     } elsif ($self->{nc} == -1) {
4931     !!!parse-error (type => 'unclosed md'); ## TODO: type
4932     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4933     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4934     !!!next-input-character;
4935     !!!emit ($self->{ct}); # ELEMENT
4936     redo A;
4937     } else {
4938     push @{$self->{ct}->{content}}, chr $self->{nc};
4939     $self->{state} = CM_ELEMENT_NAME_STATE;
4940     !!!next-input-character;
4941     redo A;
4942     }
4943     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4944     if ($is_space->{$self->{nc}}) {
4945     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4946     !!!next-input-character;
4947     redo A;
4948     } elsif ($self->{nc} == 0x002A or # *
4949     $self->{nc} == 0x002B or # +
4950     $self->{nc} == 0x003F) { # ?
4951     push @{$self->{ct}->{content}}, chr $self->{nc};
4952     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4953     !!!next-input-character;
4954     redo A;
4955     } elsif ($self->{nc} == 0x007C or # |
4956     $self->{nc} == 0x002C) { # ,
4957     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4958     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4959     !!!next-input-character;
4960     redo A;
4961     } elsif ($self->{nc} == 0x0029) { # )
4962     $self->{group_depth}--;
4963     push @{$self->{ct}->{content}}, chr $self->{nc};
4964     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4965     !!!next-input-character;
4966     redo A;
4967     } elsif ($self->{nc} == 0x003E) { # >
4968     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4969     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4970     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4971     !!!next-input-character;
4972     !!!emit ($self->{ct}); # ELEMENT
4973     redo A;
4974     } elsif ($self->{nc} == -1) {
4975     !!!parse-error (type => 'unclosed md'); ## TODO: type
4976     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4977     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4978     !!!next-input-character;
4979     !!!emit ($self->{ct}); # ELEMENT
4980     redo A;
4981     } else {
4982     $self->{ct}->{content}->[-1] .= chr $self->{nc};
4983     ## Stay in the state.
4984     !!!next-input-character;
4985     redo A;
4986     }
4987     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4988     if ($is_space->{$self->{nc}}) {
4989     ## Stay in the state.
4990     !!!next-input-character;
4991     redo A;
4992     } elsif ($self->{nc} == 0x007C or # |
4993     $self->{nc} == 0x002C) { # ,
4994     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4995     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4996     !!!next-input-character;
4997     redo A;
4998     } elsif ($self->{nc} == 0x0029) { # )
4999     $self->{group_depth}--;
5000     push @{$self->{ct}->{content}}, chr $self->{nc};
5001     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5002     !!!next-input-character;
5003     redo A;
5004     } elsif ($self->{nc} == 0x003E) { # >
5005     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5006     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5007     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5008     !!!next-input-character;
5009     !!!emit ($self->{ct}); # ELEMENT
5010     redo A;
5011     } elsif ($self->{nc} == -1) {
5012     !!!parse-error (type => 'unclosed md'); ## TODO: type
5013     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5014     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5015     !!!next-input-character;
5016     !!!emit ($self->{ct}); # ELEMENT
5017     redo A;
5018     } else {
5019     !!!parse-error (type => 'after element name'); ## TODO: type
5020     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5021     $self->{state} = BOGUS_MD_STATE;
5022     !!!next-input-character;
5023     redo A;
5024     }
5025     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5026     if ($is_space->{$self->{nc}}) {
5027     if ($self->{group_depth}) {
5028     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5029     } else {
5030     $self->{state} = AFTER_MD_DEF_STATE;
5031     }
5032     !!!next-input-character;
5033     redo A;
5034     } elsif ($self->{nc} == 0x002A or # *
5035     $self->{nc} == 0x002B or # +
5036     $self->{nc} == 0x003F) { # ?
5037     push @{$self->{ct}->{content}}, chr $self->{nc};
5038     if ($self->{group_depth}) {
5039     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5040     } else {
5041     $self->{state} = AFTER_MD_DEF_STATE;
5042     }
5043     !!!next-input-character;
5044     redo A;
5045     } elsif ($self->{nc} == 0x0029) { # )
5046     if ($self->{group_depth}) {
5047     $self->{group_depth}--;
5048     push @{$self->{ct}->{content}}, chr $self->{nc};
5049     ## Stay in the state.
5050     !!!next-input-character;
5051     redo A;
5052     } else {
5053     !!!parse-error (type => 'string after md def'); ## TODO: type
5054     $self->{state} = BOGUS_MD_STATE;
5055     ## Reconsume.
5056     redo A;
5057     }
5058     } elsif ($self->{nc} == 0x003E) { # >
5059     if ($self->{group_depth}) {
5060     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5061     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5062     }
5063     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5064     !!!next-input-character;
5065     !!!emit ($self->{ct}); # ELEMENT
5066     redo A;
5067     } elsif ($self->{nc} == -1) {
5068     !!!parse-error (type => 'unclosed md'); ## TODO: type
5069     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5070     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5071     !!!next-input-character;
5072     !!!emit ($self->{ct}); # ELEMENT
5073     redo A;
5074     } else {
5075     if ($self->{group_depth}) {
5076     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5077     } else {
5078     !!!parse-error (type => 'string after md def'); ## TODO: type
5079     $self->{state} = BOGUS_MD_STATE;
5080     }
5081     ## Reconsume.
5082     redo A;
5083     }
5084     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5085 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5086     ## Stay in the state.
5087     !!!next-input-character;
5088     redo A;
5089     } elsif ($self->{nc} == 0x003E) { # >
5090     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5091     !!!next-input-character;
5092 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5093 wakaba 1.18 redo A;
5094     } elsif ($self->{nc} == -1) {
5095     !!!parse-error (type => 'unclosed md'); ## TODO: type
5096     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5097     !!!next-input-character;
5098 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5099 wakaba 1.18 redo A;
5100     } else {
5101 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5102 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5103     ## Reconsume.
5104     redo A;
5105     }
5106 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5107     if ($self->{nc} == 0x003E) { # >
5108     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5109     !!!next-input-character;
5110     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5111     redo A;
5112     } elsif ($self->{nc} == -1) {
5113     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5114     ## Reconsume.
5115     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5116     redo A;
5117     } else {
5118     ## Stay in the state.
5119     !!!next-input-character;
5120     redo A;
5121     }
5122 wakaba 1.1 } else {
5123     die "$0: $self->{state}: Unknown state";
5124     }
5125     } # A
5126    
5127     die "$0: _get_next_token: unexpected case";
5128     } # _get_next_token
5129    
5130     1;
5131 wakaba 1.26 ## $Date: 2008/10/19 15:17:01 $
5132 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24