/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.33 - (hide annotations) (download) (as text)
Sat Sep 5 10:41:07 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.32: +52 -17 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 10:40:03 -0000
	* tokenizer-test-1.test: Updated test results on unclosed start
	and end tags (HTML5 revision 2990).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	5 Sep 2009 10:40:48 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat, attrs-1.dat: Updated test results on unclosed
	tags and attlist declarations (cf. HTML5 revision 2990).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 10:39:09 -0000
	* Tokenizer.pm.src: Discard unclosed tags (HTML5 revision 2990).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.33 our $VERSION=do{my @r=(q$Revision: 1.32 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Export setup, performed at compile time.  The package is made a
## subclass of Exporter, every token-type constant is offered for import
## on request (@EXPORT_OK), and the same names are grouped under the
## ':token' export tag (%EXPORT_TAGS).
## NOTE: The two name lists below are identical; a token type added to one
## has to be added to the other as well.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter'; # inherit Exporter's |import| method
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Each token type is a constant sub with an empty prototype that returns
## a distinct integer (1..14).  A token records its kind by storing one of
## these values in its |->{type}| field.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 } ## NOTE: Also used for bogus comments.
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The three CM_* values are single-bit flags.  The four *_CONTENT_MODEL
## values are combinations of those bits, and the tokenizer tests them
## with bitwise "&" against |$self->{content_model}|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no bits set: neither & nor < is markup
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & and limited < markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & and any < markup
87    
88     ## Tokenizer states
89    
## Numeric identifiers stored in |$self->{state}|.  Each constant has a
## distinct value, but the values are not listed in numeric order:
## 1 and 12 belong to retired states that are kept below as commented-out
## lines, and 102/103 were given to the two comment-end states added most
## recently.  When adding a state, pick a value that is not yet in use.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108 wakaba 1.32 sub COMMENT_END_BANG_STATE () { 102 }
109     sub COMMENT_END_SPACE_STATE () { 103 } ## LAST (presumably marks the highest value assigned so far)
110 wakaba 1.1 sub COMMENT_END_DASH_STATE () { 18 }
111     sub BOGUS_COMMENT_STATE () { 19 }
112     sub DOCTYPE_STATE () { 20 }
113     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114     sub DOCTYPE_NAME_STATE () { 22 }
115     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124     sub BOGUS_DOCTYPE_STATE () { 32 }
125     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126     sub SELF_CLOSING_START_TAG_STATE () { 34 }
127     sub CDATA_SECTION_STATE () { 35 }
128     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136     ## NOTE: "Entity data state", "entity in attribute value state", and
137     ## "consume a character reference" algorithm are jointly implemented
138     ## using the following six states:
139     sub ENTITY_STATE () { 44 }
140     sub ENTITY_HASH_STATE () { 45 }
141     sub NCR_NUM_STATE () { 46 }
142     sub HEXREF_X_STATE () { 47 }
143     sub HEXREF_HEX_STATE () { 48 }
144     sub ENTITY_NAME_STATE () { 49 }
145     sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147 wakaba 1.12 ## XML-only states
148 wakaba 1.8 sub PI_STATE () { 51 }
149     sub PI_TARGET_STATE () { 52 }
150     sub PI_TARGET_AFTER_STATE () { 53 }
151     sub PI_DATA_STATE () { 54 }
152     sub PI_AFTER_STATE () { 55 }
153     sub PI_DATA_AFTER_STATE () { 56 }
154 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157     sub DOCTYPE_TAG_STATE () { 60 }
158     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159     sub MD_ATTLIST_STATE () { 62 }
160     sub MD_E_STATE () { 63 }
161     sub MD_ELEMENT_STATE () { 64 }
162     sub MD_ENTITY_STATE () { 65 }
163     sub MD_NOTATION_STATE () { 66 }
164     sub DOCTYPE_MD_STATE () { 67 }
165     sub BEFORE_MD_NAME_STATE () { 68 }
166     sub MD_NAME_STATE () { 69 }
167     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174     sub ALLOWED_TOKEN_STATE () { 77 }
175     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
183     sub NDATA_STATE () { 86 }
184     sub AFTER_NDATA_STATE () { 87 }
185     sub BEFORE_NOTATION_NAME_STATE () { 88 }
186     sub NOTATION_NAME_STATE () { 89 }
187 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190     sub AFTER_ELEMENT_NAME_STATE () { 93 }
191     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192     sub CONTENT_KEYWORD_STATE () { 95 }
193     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194     sub CM_ELEMENT_NAME_STATE () { 97 }
195     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197     sub AFTER_MD_DEF_STATE () { 100 }
198     sub BOGUS_MD_STATE () { 101 }
199 wakaba 1.8
200 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
201     ## list and descriptions)
202    
## NOTE: Both literals below evaluate to the same number, 2048 (a one bit
## followed by eleven zero bits); only the digit grouping differs.
## NOTE(review): these presumably have to agree with the definitions in
## Whatpm::HTML, which are not visible in this file -- confirm.
203     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
204     sub FOREIGN_EL () { 0b1_00000000000 }
205    
206     ## Character reference mappings
207    
## Replacement table keyed by code point: a code point that appears as a
## key is to be replaced by the associated value.  Presumably it is
## consulted when a numeric character reference is resolved; the lookup
## itself is not in this part of the file.
## The literal entries map CR to LF and the range 0x80..0x9F to other
## characters (these values look like the Windows-1252 assignments), with
## 0xFFFD (REPLACEMENT CHARACTER) for the five positions 0x81, 0x8D,
## 0x8F, 0x90 and 0x9D.
208     my $charref_map = {
209     0x0D => 0x000A, # CARRIAGE RETURN -> LINE FEED
210     0x80 => 0x20AC,
211     0x81 => 0xFFFD,
212     0x82 => 0x201A,
213     0x83 => 0x0192,
214     0x84 => 0x201E,
215     0x85 => 0x2026,
216     0x86 => 0x2020,
217     0x87 => 0x2021,
218     0x88 => 0x02C6,
219     0x89 => 0x2030,
220     0x8A => 0x0160,
221     0x8B => 0x2039,
222     0x8C => 0x0152,
223     0x8D => 0xFFFD,
224     0x8E => 0x017D,
225     0x8F => 0xFFFD,
226     0x90 => 0xFFFD,
227     0x91 => 0x2018,
228     0x92 => 0x2019,
229     0x93 => 0x201C,
230     0x94 => 0x201D,
231     0x95 => 0x2022,
232     0x96 => 0x2013,
233     0x97 => 0x2014,
234     0x98 => 0x02DC,
235     0x99 => 0x2122,
236     0x9A => 0x0161,
237     0x9B => 0x203A,
238     0x9C => 0x0153,
239     0x9D => 0xFFFD,
240     0x9E => 0x017E,
241     0x9F => 0x0178,
242     }; # $charref_map
## Every code point listed below is additionally mapped to 0xFFFD: C0
## controls other than HT, LF, FF and CR, then DELETE, the surrogate
## range, 0xFDD0..0xFDDF, and the last two code points of every plane.
243     $charref_map->{$_} = 0xFFFD
244     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
245     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run up to U+FDEF; only up to U+FDDF is mapped here)
246     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
247     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
248     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
249     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
250     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
251    
252     ## Implementations MUST act as if state machine in the spec
253    
## Resets the tokenizer-related fields of the parser object to their
## initial values.  Takes the object as its only argument.
## Lines beginning with "!!!" are macros, not Perl; they are presumably
## expanded when this .src file is preprocessed (the expansion is not
## visible here).
254     sub _initialize_tokenizer ($) {
255     my $self = shift;
256    
257     ## NOTE: Fields set by |new| constructor:
258     #$self->{level}
259     #$self->{set_nc}
260     #$self->{parse_error}
261 wakaba 1.3 #$self->{is_xml} (if XML)
262 wakaba 1.1
263     $self->{state} = DATA_STATE; # MUST start in the data state
264 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
265     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
266 wakaba 1.1 #$self->{entity__value}; # initialized when used
267     #$self->{entity__match}; # initialized when used
268     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST be PCDATA initially
269     undef $self->{ct}; # current token
270     undef $self->{ca}; # current attribute
271     undef $self->{last_stag_name}; # last emitted start tag name
272     #$self->{prev_state}; # initialized when used
273     delete $self->{self_closing}; # no self-closing flag pending
274     $self->{char_buffer} = '';
275     $self->{char_buffer_pos} = 0;
276     $self->{nc} = -1; # next input character (-1 is what the tokenizer tests for end of file)
277     #$self->{next_nc}
278     !!!next-input-character; # macro; presumably loads the first character into |$self->{nc}|
279     $self->{token} = []; # tokens queued here are returned first by |_get_next_token|
280     # $self->{escape} (NOTE(review): not reset here; it is set and deleted in the data state)
281     } # _initialize_tokenizer
282    
283     ## A token has:
284     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
285 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
286 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
287     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
288 wakaba 1.11 ## ->{target} (PI_TOKEN)
289 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
290     ## ->{sysid} (DOCTYPE_TOKEN)
291     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
292     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
293     ## ->{name}
294     ## ->{value}
295     ## ->{has_reference} == 1 or 0
296 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
297     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
298 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
299 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
300 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
301    
302 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
303     ## A token's |->{self_closing}| is used to save the value of |$self->{self_closing}|
304     ## while the token is pushed back to the stack.
305    
306     ## Emitted token MUST immediately be handled by the tree construction state.
307    
308     ## Before each step, UA MAY check to see if either one of the scripts in
309     ## "list of scripts that will execute as soon as possible" or the first
310     ## script in the "list of scripts that will execute asynchronously",
311     ## has completed loading. If one has, then it MUST be executed
312     ## and removed from the list.
313    
314     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
315     ## (This requirement was dropped from HTML5 spec, unfortunately.)
316    
## Lookup table of the code points treated as space characters, used as
## |$is_space->{$self->{nc}}|.  Only HT, LF, FF and SP have entries; the
## VT and CR lines are commented out, so a lookup for either of them
## yields undef (false).
317     my $is_space = {
318     0x0009 => 1, # CHARACTER TABULATION (HT)
319     0x000A => 1, # LINE FEED (LF)
320     #0x000B => 0, # LINE TABULATION (VT)
321 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
322 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
323     0x0020 => 1, # SPACE (SP)
324     };
325    
326     sub _get_next_token ($) {
327     my $self = shift;
328    
329     if ($self->{self_closing}) {
330     !!!parse-error (type => 'nestc', token => $self->{ct});
331     ## NOTE: The |self_closing| flag is only set by start tag token.
332     ## In addition, when a start tag token is emitted, it is always set to
333     ## |ct|.
334     delete $self->{self_closing};
335     }
336    
337     if (@{$self->{token}}) {
338     $self->{self_closing} = $self->{token}->[0]->{self_closing};
339     return shift @{$self->{token}};
340     }
341    
342     A: {
343     if ($self->{state} == PCDATA_STATE) {
344     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
345    
346     if ($self->{nc} == 0x0026) { # &
347     !!!cp (0.1);
348     ## NOTE: In the spec, the tokenizer is switched to the
349     ## "entity data state". In this implementation, the tokenizer
350     ## is switched to the |ENTITY_STATE|, which is an implementation
351     ## of the "consume a character reference" algorithm.
352     $self->{entity_add} = -1;
353     $self->{prev_state} = DATA_STATE;
354     $self->{state} = ENTITY_STATE;
355     !!!next-input-character;
356     redo A;
357     } elsif ($self->{nc} == 0x003C) { # <
358     !!!cp (0.2);
359     $self->{state} = TAG_OPEN_STATE;
360     !!!next-input-character;
361     redo A;
362     } elsif ($self->{nc} == -1) {
363     !!!cp (0.3);
364     !!!emit ({type => END_OF_FILE_TOKEN,
365     line => $self->{line}, column => $self->{column}});
366     last A; ## TODO: ok?
367     } else {
368     !!!cp (0.4);
369     #
370     }
371    
372     # Anything else
373     my $token = {type => CHARACTER_TOKEN,
374     data => chr $self->{nc},
375     line => $self->{line}, column => $self->{column},
376     };
377     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
378    
379     ## Stay in the state.
380     !!!next-input-character;
381     !!!emit ($token);
382     redo A;
383     } elsif ($self->{state} == DATA_STATE) {
384     $self->{s_kwd} = '' unless defined $self->{s_kwd};
385     if ($self->{nc} == 0x0026) { # &
386     $self->{s_kwd} = '';
387     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
388     not $self->{escape}) {
389     !!!cp (1);
390     ## NOTE: In the spec, the tokenizer is switched to the
391     ## "entity data state". In this implementation, the tokenizer
392     ## is switched to the |ENTITY_STATE|, which is an implementation
393     ## of the "consume a character reference" algorithm.
394     $self->{entity_add} = -1;
395     $self->{prev_state} = DATA_STATE;
396     $self->{state} = ENTITY_STATE;
397     !!!next-input-character;
398     redo A;
399     } else {
400     !!!cp (2);
401     #
402     }
403     } elsif ($self->{nc} == 0x002D) { # -
404     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
405 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
406 wakaba 1.1 !!!cp (3);
407     $self->{escape} = 1; # unless $self->{escape};
408     $self->{s_kwd} = '--';
409     #
410 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
411 wakaba 1.1 !!!cp (4);
412     $self->{s_kwd} = '--';
413     #
414 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
415     !!!cp (4.1);
416     $self->{s_kwd} .= '-';
417     #
418 wakaba 1.1 } else {
419     !!!cp (5);
420 wakaba 1.5 $self->{s_kwd} = '-';
421 wakaba 1.1 #
422     }
423     }
424    
425     #
426     } elsif ($self->{nc} == 0x0021) { # !
427     if (length $self->{s_kwd}) {
428     !!!cp (5.1);
429     $self->{s_kwd} .= '!';
430     #
431     } else {
432     !!!cp (5.2);
433     #$self->{s_kwd} = '';
434     #
435     }
436     #
437     } elsif ($self->{nc} == 0x003C) { # <
438     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
439     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
440     not $self->{escape})) {
441     !!!cp (6);
442     $self->{state} = TAG_OPEN_STATE;
443     !!!next-input-character;
444     redo A;
445     } else {
446     !!!cp (7);
447     $self->{s_kwd} = '';
448     #
449     }
450     } elsif ($self->{nc} == 0x003E) { # >
451     if ($self->{escape} and
452     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
453     if ($self->{s_kwd} eq '--') {
454     !!!cp (8);
455     delete $self->{escape};
456 wakaba 1.5 #
457 wakaba 1.1 } else {
458     !!!cp (9);
459 wakaba 1.5 #
460 wakaba 1.1 }
461 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
462     !!!cp (9.1);
463     !!!parse-error (type => 'unmatched mse', ## TODO: type
464     line => $self->{line_prev},
465     column => $self->{column_prev} - 1);
466     #
467 wakaba 1.1 } else {
468     !!!cp (10);
469 wakaba 1.5 #
470 wakaba 1.1 }
471    
472     $self->{s_kwd} = '';
473     #
474 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
475     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
476     !!!cp (10.1);
477     $self->{s_kwd} .= ']';
478     } elsif ($self->{s_kwd} eq ']]') {
479     !!!cp (10.2);
480     #
481     } else {
482     !!!cp (10.3);
483     $self->{s_kwd} = '';
484     }
485     #
486 wakaba 1.1 } elsif ($self->{nc} == -1) {
487     !!!cp (11);
488     $self->{s_kwd} = '';
489     !!!emit ({type => END_OF_FILE_TOKEN,
490     line => $self->{line}, column => $self->{column}});
491     last A; ## TODO: ok?
492     } else {
493     !!!cp (12);
494     $self->{s_kwd} = '';
495     #
496     }
497    
498     # Anything else
499     my $token = {type => CHARACTER_TOKEN,
500     data => chr $self->{nc},
501     line => $self->{line}, column => $self->{column},
502     };
503 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
504 wakaba 1.1 length $token->{data})) {
505     $self->{s_kwd} = '';
506     }
507    
508     ## Stay in the data state.
509 wakaba 1.5 if (not $self->{is_xml} and
510     $self->{content_model} == PCDATA_CONTENT_MODEL) {
511 wakaba 1.1 !!!cp (13);
512     $self->{state} = PCDATA_STATE;
513     } else {
514     !!!cp (14);
515     ## Stay in the state.
516     }
517     !!!next-input-character;
518     !!!emit ($token);
519     redo A;
520     } elsif ($self->{state} == TAG_OPEN_STATE) {
521 wakaba 1.10 ## XML5: "tag state".
522    
523 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
524     if ($self->{nc} == 0x002F) { # /
525     !!!cp (15);
526     !!!next-input-character;
527     $self->{state} = CLOSE_TAG_OPEN_STATE;
528     redo A;
529     } elsif ($self->{nc} == 0x0021) { # !
530     !!!cp (15.1);
531 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
532 wakaba 1.1 #
533     } else {
534     !!!cp (16);
535 wakaba 1.12 $self->{s_kwd} = '';
536 wakaba 1.1 #
537     }
538    
539     ## reconsume
540     $self->{state} = DATA_STATE;
541     !!!emit ({type => CHARACTER_TOKEN, data => '<',
542     line => $self->{line_prev},
543     column => $self->{column_prev},
544     });
545     redo A;
546     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
547     if ($self->{nc} == 0x0021) { # !
548     !!!cp (17);
549     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
550     !!!next-input-character;
551     redo A;
552     } elsif ($self->{nc} == 0x002F) { # /
553     !!!cp (18);
554     $self->{state} = CLOSE_TAG_OPEN_STATE;
555     !!!next-input-character;
556     redo A;
557     } elsif (0x0041 <= $self->{nc} and
558     $self->{nc} <= 0x005A) { # A..Z
559     !!!cp (19);
560     $self->{ct}
561     = {type => START_TAG_TOKEN,
562 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 wakaba 1.1 line => $self->{line_prev},
564     column => $self->{column_prev}};
565     $self->{state} = TAG_NAME_STATE;
566     !!!next-input-character;
567     redo A;
568     } elsif (0x0061 <= $self->{nc} and
569     $self->{nc} <= 0x007A) { # a..z
570     !!!cp (20);
571     $self->{ct} = {type => START_TAG_TOKEN,
572     tag_name => chr ($self->{nc}),
573     line => $self->{line_prev},
574     column => $self->{column_prev}};
575     $self->{state} = TAG_NAME_STATE;
576     !!!next-input-character;
577     redo A;
578     } elsif ($self->{nc} == 0x003E) { # >
579     !!!cp (21);
580     !!!parse-error (type => 'empty start tag',
581     line => $self->{line_prev},
582     column => $self->{column_prev});
583     $self->{state} = DATA_STATE;
584 wakaba 1.5 $self->{s_kwd} = '';
585 wakaba 1.1 !!!next-input-character;
586    
587     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
588     line => $self->{line_prev},
589     column => $self->{column_prev},
590     });
591    
592     redo A;
593     } elsif ($self->{nc} == 0x003F) { # ?
594 wakaba 1.8 if ($self->{is_xml}) {
595     !!!cp (22.1);
596     $self->{state} = PI_STATE;
597     !!!next-input-character;
598     redo A;
599     } else {
600     !!!cp (22);
601     !!!parse-error (type => 'pio',
602     line => $self->{line_prev},
603     column => $self->{column_prev});
604     $self->{state} = BOGUS_COMMENT_STATE;
605     $self->{ct} = {type => COMMENT_TOKEN, data => '',
606     line => $self->{line_prev},
607     column => $self->{column_prev},
608     };
609     ## $self->{nc} is intentionally left as is
610     redo A;
611     }
612 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
613 wakaba 1.1 !!!cp (23);
614     !!!parse-error (type => 'bare stago',
615     line => $self->{line_prev},
616     column => $self->{column_prev});
617     $self->{state} = DATA_STATE;
618 wakaba 1.5 $self->{s_kwd} = '';
619 wakaba 1.1 ## reconsume
620    
621     !!!emit ({type => CHARACTER_TOKEN, data => '<',
622     line => $self->{line_prev},
623     column => $self->{column_prev},
624     });
625    
626     redo A;
627 wakaba 1.9 } else {
628     ## XML5: "<:" is a parse error.
629     !!!cp (23.1);
630     $self->{ct} = {type => START_TAG_TOKEN,
631     tag_name => chr ($self->{nc}),
632     line => $self->{line_prev},
633     column => $self->{column_prev}};
634     $self->{state} = TAG_NAME_STATE;
635     !!!next-input-character;
636     redo A;
637 wakaba 1.1 }
638     } else {
639     die "$0: $self->{content_model} in tag open";
640     }
641     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
642     ## NOTE: The "close tag open state" in the spec is implemented as
643     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
644    
645 wakaba 1.10 ## XML5: "end tag state".
646    
647 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
648     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
649     if (defined $self->{last_stag_name}) {
650     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
651 wakaba 1.12 $self->{kwd} = '';
652 wakaba 1.1 ## Reconsume.
653     redo A;
654     } else {
655     ## No start tag token has ever been emitted
656     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
657     !!!cp (28);
658     $self->{state} = DATA_STATE;
659 wakaba 1.5 $self->{s_kwd} = '';
660 wakaba 1.1 ## Reconsume.
661     !!!emit ({type => CHARACTER_TOKEN, data => '</',
662     line => $l, column => $c,
663     });
664     redo A;
665     }
666     }
667    
668     if (0x0041 <= $self->{nc} and
669     $self->{nc} <= 0x005A) { # A..Z
670     !!!cp (29);
671     $self->{ct}
672     = {type => END_TAG_TOKEN,
673 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
674 wakaba 1.1 line => $l, column => $c};
675     $self->{state} = TAG_NAME_STATE;
676     !!!next-input-character;
677     redo A;
678     } elsif (0x0061 <= $self->{nc} and
679     $self->{nc} <= 0x007A) { # a..z
680     !!!cp (30);
681     $self->{ct} = {type => END_TAG_TOKEN,
682     tag_name => chr ($self->{nc}),
683     line => $l, column => $c};
684     $self->{state} = TAG_NAME_STATE;
685     !!!next-input-character;
686     redo A;
687     } elsif ($self->{nc} == 0x003E) { # >
688     !!!parse-error (type => 'empty end tag',
689     line => $self->{line_prev}, ## "<" in "</>"
690     column => $self->{column_prev} - 1);
691     $self->{state} = DATA_STATE;
692 wakaba 1.5 $self->{s_kwd} = '';
693 wakaba 1.10 if ($self->{is_xml}) {
694     !!!cp (31);
695     ## XML5: No parse error.
696    
697     ## NOTE: This parser raises a parse error, since it supports
698     ## XML1, not XML5.
699    
700     ## NOTE: A short end tag token.
701     my $ct = {type => END_TAG_TOKEN,
702     tag_name => '',
703     line => $self->{line_prev},
704     column => $self->{column_prev} - 1,
705     };
706     !!!next-input-character;
707     !!!emit ($ct);
708     } else {
709     !!!cp (31.1);
710     !!!next-input-character;
711     }
712 wakaba 1.1 redo A;
713     } elsif ($self->{nc} == -1) {
714     !!!cp (32);
715     !!!parse-error (type => 'bare etago');
716 wakaba 1.5 $self->{s_kwd} = '';
717 wakaba 1.1 $self->{state} = DATA_STATE;
718     # reconsume
719    
720     !!!emit ({type => CHARACTER_TOKEN, data => '</',
721     line => $l, column => $c,
722     });
723    
724     redo A;
725 wakaba 1.10 } elsif (not $self->{is_xml} or
726     $is_space->{$self->{nc}}) {
727 wakaba 1.1 !!!cp (33);
728 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
729     line => $self->{line_prev}, # "<" of "</"
730     column => $self->{column_prev} - 1);
731 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
732     $self->{ct} = {type => COMMENT_TOKEN, data => '',
733     line => $self->{line_prev}, # "<" of "</"
734     column => $self->{column_prev} - 1,
735     };
736     ## NOTE: $self->{nc} is intentionally left as is.
737     ## Although the "anything else" case of the spec does not explicitly
738     ## state that the next input character is to be reconsumed,
739     ## it will be included in the |data| of the comment token
740     ## generated from the bogus end tag, as defined in the
741     ## "bogus comment state" entry.
742     redo A;
743 wakaba 1.10 } else {
744     ## XML5: "</:" is a parse error.
745     !!!cp (30.1);
746     $self->{ct} = {type => END_TAG_TOKEN,
747     tag_name => chr ($self->{nc}),
748     line => $l, column => $c};
749     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
750     !!!next-input-character;
751     redo A;
752 wakaba 1.1 }
753     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
754 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
755 wakaba 1.1 if (length $ch) {
756     my $CH = $ch;
757     $ch =~ tr/a-z/A-Z/;
758     my $nch = chr $self->{nc};
759     if ($nch eq $ch or $nch eq $CH) {
760     !!!cp (24);
761     ## Stay in the state.
762 wakaba 1.12 $self->{kwd} .= $nch;
763 wakaba 1.1 !!!next-input-character;
764     redo A;
765     } else {
766     !!!cp (25);
767     $self->{state} = DATA_STATE;
768 wakaba 1.5 $self->{s_kwd} = '';
769 wakaba 1.1 ## Reconsume.
770     !!!emit ({type => CHARACTER_TOKEN,
771 wakaba 1.12 data => '</' . $self->{kwd},
772 wakaba 1.1 line => $self->{line_prev},
773 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
774 wakaba 1.1 });
775     redo A;
776     }
777     } else { # after "<{tag-name}"
778     unless ($is_space->{$self->{nc}} or
779     {
780     0x003E => 1, # >
781     0x002F => 1, # /
782     -1 => 1, # EOF
783     }->{$self->{nc}}) {
784     !!!cp (26);
785     ## Reconsume.
786     $self->{state} = DATA_STATE;
787 wakaba 1.5 $self->{s_kwd} = '';
788 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
789 wakaba 1.12 data => '</' . $self->{kwd},
790 wakaba 1.1 line => $self->{line_prev},
791 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
792 wakaba 1.1 });
793     redo A;
794     } else {
795     !!!cp (27);
796     $self->{ct}
797     = {type => END_TAG_TOKEN,
798     tag_name => $self->{last_stag_name},
799     line => $self->{line_prev},
800 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
801 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
802     ## Reconsume.
803     redo A;
804     }
805     }
806     } elsif ($self->{state} == TAG_NAME_STATE) {
807     if ($is_space->{$self->{nc}}) {
808     !!!cp (34);
809     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
810     !!!next-input-character;
811     redo A;
812     } elsif ($self->{nc} == 0x003E) { # >
813     if ($self->{ct}->{type} == START_TAG_TOKEN) {
814     !!!cp (35);
815     $self->{last_stag_name} = $self->{ct}->{tag_name};
816     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
817     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
818     #if ($self->{ct}->{attributes}) {
819     # ## NOTE: This should never be reached.
820     # !!! cp (36);
821     # !!! parse-error (type => 'end tag attribute');
822     #} else {
823     !!!cp (37);
824     #}
825     } else {
826     die "$0: $self->{ct}->{type}: Unknown token type";
827     }
828     $self->{state} = DATA_STATE;
829 wakaba 1.5 $self->{s_kwd} = '';
830 wakaba 1.1 !!!next-input-character;
831    
832     !!!emit ($self->{ct}); # start tag or end tag
833    
834     redo A;
835     } elsif (0x0041 <= $self->{nc} and
836     $self->{nc} <= 0x005A) { # A..Z
837     !!!cp (38);
838 wakaba 1.4 $self->{ct}->{tag_name}
839     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
840 wakaba 1.1 # start tag or end tag
841     ## Stay in this state
842     !!!next-input-character;
843     redo A;
844     } elsif ($self->{nc} == -1) {
845     !!!parse-error (type => 'unclosed tag');
846     if ($self->{ct}->{type} == START_TAG_TOKEN) {
847     !!!cp (39);
848     $self->{last_stag_name} = $self->{ct}->{tag_name};
849     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
850     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
851     #if ($self->{ct}->{attributes}) {
852     # ## NOTE: This state should never be reached.
853     # !!! cp (40);
854     # !!! parse-error (type => 'end tag attribute');
855     #} else {
856     !!!cp (41);
857     #}
858     } else {
859     die "$0: $self->{ct}->{type}: Unknown token type";
860     }
861     $self->{state} = DATA_STATE;
862 wakaba 1.5 $self->{s_kwd} = '';
863 wakaba 1.1 # reconsume
864    
865 wakaba 1.33 ## Discard the token.
866     #!!!emit ($self->{ct}); # start tag or end tag
867 wakaba 1.1
868     redo A;
869     } elsif ($self->{nc} == 0x002F) { # /
870     !!!cp (42);
871     $self->{state} = SELF_CLOSING_START_TAG_STATE;
872     !!!next-input-character;
873     redo A;
874     } else {
875     !!!cp (44);
876     $self->{ct}->{tag_name} .= chr $self->{nc};
877     # start tag or end tag
878     ## Stay in the state
879     !!!next-input-character;
880     redo A;
881     }
882     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
883 wakaba 1.11 ## XML5: "Tag attribute name before state".
884    
885 wakaba 1.1 if ($is_space->{$self->{nc}}) {
886     !!!cp (45);
887     ## Stay in the state
888     !!!next-input-character;
889     redo A;
890     } elsif ($self->{nc} == 0x003E) { # >
891     if ($self->{ct}->{type} == START_TAG_TOKEN) {
892     !!!cp (46);
893     $self->{last_stag_name} = $self->{ct}->{tag_name};
894     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
895     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
896     if ($self->{ct}->{attributes}) {
897     !!!cp (47);
898     !!!parse-error (type => 'end tag attribute');
899     } else {
900     !!!cp (48);
901     }
902     } else {
903     die "$0: $self->{ct}->{type}: Unknown token type";
904     }
905     $self->{state} = DATA_STATE;
906 wakaba 1.5 $self->{s_kwd} = '';
907 wakaba 1.1 !!!next-input-character;
908    
909     !!!emit ($self->{ct}); # start tag or end tag
910    
911     redo A;
912     } elsif (0x0041 <= $self->{nc} and
913     $self->{nc} <= 0x005A) { # A..Z
914     !!!cp (49);
915     $self->{ca}
916 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
917 wakaba 1.1 value => '',
918     line => $self->{line}, column => $self->{column}};
919     $self->{state} = ATTRIBUTE_NAME_STATE;
920     !!!next-input-character;
921     redo A;
922     } elsif ($self->{nc} == 0x002F) { # /
923     !!!cp (50);
924     $self->{state} = SELF_CLOSING_START_TAG_STATE;
925     !!!next-input-character;
926     redo A;
927     } elsif ($self->{nc} == -1) {
928     !!!parse-error (type => 'unclosed tag');
929     if ($self->{ct}->{type} == START_TAG_TOKEN) {
930     !!!cp (52);
931     $self->{last_stag_name} = $self->{ct}->{tag_name};
932     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
933     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
934     if ($self->{ct}->{attributes}) {
935     !!!cp (53);
936     !!!parse-error (type => 'end tag attribute');
937     } else {
938     !!!cp (54);
939     }
940     } else {
941     die "$0: $self->{ct}->{type}: Unknown token type";
942     }
943     $self->{state} = DATA_STATE;
944 wakaba 1.5 $self->{s_kwd} = '';
945 wakaba 1.1 # reconsume
946    
947 wakaba 1.33 ## Discard the token.
948     #!!!emit ($self->{ct}); # start tag or end tag
949 wakaba 1.1
950     redo A;
951     } else {
952     if ({
953     0x0022 => 1, # "
954     0x0027 => 1, # '
955 wakaba 1.30 0x003C => 1, # <
956 wakaba 1.1 0x003D => 1, # =
957     }->{$self->{nc}}) {
958     !!!cp (55);
959 wakaba 1.11 ## XML5: Not a parse error.
960 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
961     } else {
962     !!!cp (56);
963 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
964 wakaba 1.1 }
965     $self->{ca}
966     = {name => chr ($self->{nc}),
967     value => '',
968     line => $self->{line}, column => $self->{column}};
969     $self->{state} = ATTRIBUTE_NAME_STATE;
970     !!!next-input-character;
971     redo A;
972     }
973     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
974 wakaba 1.11 ## XML5: "Tag attribute name state".
975    
976 wakaba 1.1 my $before_leave = sub {
         ## Commit the attribute being built ($self->{ca}) to the current
         ## tag token ($self->{ct}).  Invoked by every branch of the
         ## attribute name state that finishes the name (space, "=", ">",
         ## "/" and EOF) before the state is switched.
         ##
         ## NOTE: The nested |exists| below autovivifies
         ## $self->{ct}->{attributes} as an empty hash when it is not yet
         ## present, so the |if ($self->{ct}->{attributes})| tests made
         ## after this closure has run are always true.
977     if (exists $self->{ct}->{attributes} # start tag or end tag
978     ->{$self->{ca}->{name}}) { # MUST
979     !!!cp (57);
         ## Same name seen earlier in this tag: report the error at the
         ## position recorded in the duplicate itself and keep the first
         ## occurrence.
980     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
981     ## Discard $self->{ca} # MUST
982     } else {
983     !!!cp (58);
         ## First occurrence: register it under its name and number it
         ## with the token's pre-incremented |last_index| counter.
984     $self->{ct}->{attributes}->{$self->{ca}->{name}}
985     = $self->{ca};
986 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
987 wakaba 1.1 }
988     }; # $before_leave
989    
990     if ($is_space->{$self->{nc}}) {
991     !!!cp (59);
992     $before_leave->();
993     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
994     !!!next-input-character;
995     redo A;
996     } elsif ($self->{nc} == 0x003D) { # =
997     !!!cp (60);
998     $before_leave->();
999     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1000     !!!next-input-character;
1001     redo A;
1002     } elsif ($self->{nc} == 0x003E) { # >
1003 wakaba 1.11 if ($self->{is_xml}) {
1004     !!!cp (60.1);
1005     ## XML5: Not a parse error.
1006     !!!parse-error (type => 'no attr value'); ## TODO: type
1007     } else {
1008     !!!cp (60.2);
1009     }
1010    
1011 wakaba 1.1 $before_leave->();
1012     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1013     !!!cp (61);
1014     $self->{last_stag_name} = $self->{ct}->{tag_name};
1015     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1016     !!!cp (62);
1017     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1018     if ($self->{ct}->{attributes}) {
1019     !!!parse-error (type => 'end tag attribute');
1020     }
1021     } else {
1022     die "$0: $self->{ct}->{type}: Unknown token type";
1023     }
1024     $self->{state} = DATA_STATE;
1025 wakaba 1.5 $self->{s_kwd} = '';
1026 wakaba 1.1 !!!next-input-character;
1027    
1028     !!!emit ($self->{ct}); # start tag or end tag
1029    
1030     redo A;
1031     } elsif (0x0041 <= $self->{nc} and
1032     $self->{nc} <= 0x005A) { # A..Z
1033     !!!cp (63);
1034 wakaba 1.4 $self->{ca}->{name}
1035     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1036 wakaba 1.1 ## Stay in the state
1037     !!!next-input-character;
1038     redo A;
1039     } elsif ($self->{nc} == 0x002F) { # /
1040 wakaba 1.11 if ($self->{is_xml}) {
1041     !!!cp (64);
1042     ## XML5: Not a parse error.
1043     !!!parse-error (type => 'no attr value'); ## TODO: type
1044     } else {
1045     !!!cp (64.1);
1046     }
1047    
1048 wakaba 1.1 $before_leave->();
1049     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1050     !!!next-input-character;
1051     redo A;
1052     } elsif ($self->{nc} == -1) {
1053     !!!parse-error (type => 'unclosed tag');
1054     $before_leave->();
1055     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1056     !!!cp (66);
1057     $self->{last_stag_name} = $self->{ct}->{tag_name};
1058     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1059     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1060     if ($self->{ct}->{attributes}) {
1061     !!!cp (67);
1062     !!!parse-error (type => 'end tag attribute');
1063     } else {
1064     ## NOTE: This state should never be reached.
1065     !!!cp (68);
1066     }
1067     } else {
1068     die "$0: $self->{ct}->{type}: Unknown token type";
1069     }
1070     $self->{state} = DATA_STATE;
1071 wakaba 1.5 $self->{s_kwd} = '';
1072 wakaba 1.1 # reconsume
1073    
1074 wakaba 1.33 ## Discard the token.
1075     #!!!emit ($self->{ct}); # start tag or end tag
1076 wakaba 1.1
1077     redo A;
1078     } else {
1079 wakaba 1.30 if ({
1080     0x0022 => 1, # "
1081     0x0027 => 1, # '
1082     0x003C => 1, # <
1083     }->{$self->{nc}}) {
1084 wakaba 1.1 !!!cp (69);
1085 wakaba 1.11 ## XML5: Not a parse error.
1086 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1087     } else {
1088     !!!cp (70);
1089     }
1090     $self->{ca}->{name} .= chr ($self->{nc});
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     }
1095     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1096 wakaba 1.11 ## XML5: "Tag attribute name after state".
1097    
1098 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1099     !!!cp (71);
1100     ## Stay in the state
1101     !!!next-input-character;
1102     redo A;
1103     } elsif ($self->{nc} == 0x003D) { # =
1104     !!!cp (72);
1105     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1106     !!!next-input-character;
1107     redo A;
1108     } elsif ($self->{nc} == 0x003E) { # >
1109 wakaba 1.11 if ($self->{is_xml}) {
1110     !!!cp (72.1);
1111     ## XML5: Not a parse error.
1112     !!!parse-error (type => 'no attr value'); ## TODO: type
1113     } else {
1114     !!!cp (72.2);
1115     }
1116    
1117 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1118     !!!cp (73);
1119     $self->{last_stag_name} = $self->{ct}->{tag_name};
1120     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1121     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1122     if ($self->{ct}->{attributes}) {
1123     !!!cp (74);
1124     !!!parse-error (type => 'end tag attribute');
1125     } else {
1126     ## NOTE: This state should never be reached.
1127     !!!cp (75);
1128     }
1129     } else {
1130     die "$0: $self->{ct}->{type}: Unknown token type";
1131     }
1132     $self->{state} = DATA_STATE;
1133 wakaba 1.5 $self->{s_kwd} = '';
1134 wakaba 1.1 !!!next-input-character;
1135    
1136     !!!emit ($self->{ct}); # start tag or end tag
1137    
1138     redo A;
1139     } elsif (0x0041 <= $self->{nc} and
1140     $self->{nc} <= 0x005A) { # A..Z
1141     !!!cp (76);
1142     $self->{ca}
1143 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1144 wakaba 1.1 value => '',
1145     line => $self->{line}, column => $self->{column}};
1146     $self->{state} = ATTRIBUTE_NAME_STATE;
1147     !!!next-input-character;
1148     redo A;
1149     } elsif ($self->{nc} == 0x002F) { # /
1150 wakaba 1.11 if ($self->{is_xml}) {
1151     !!!cp (77);
1152     ## XML5: Not a parse error.
1153     !!!parse-error (type => 'no attr value'); ## TODO: type
1154     } else {
1155     !!!cp (77.1);
1156     }
1157    
1158 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1159     !!!next-input-character;
1160     redo A;
1161     } elsif ($self->{nc} == -1) {
1162     !!!parse-error (type => 'unclosed tag');
1163     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1164     !!!cp (79);
1165     $self->{last_stag_name} = $self->{ct}->{tag_name};
1166     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1167     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1168     if ($self->{ct}->{attributes}) {
1169     !!!cp (80);
1170     !!!parse-error (type => 'end tag attribute');
1171     } else {
1172     ## NOTE: This state should never be reached.
1173     !!!cp (81);
1174     }
1175     } else {
1176     die "$0: $self->{ct}->{type}: Unknown token type";
1177     }
1178 wakaba 1.5 $self->{s_kwd} = '';
1179 wakaba 1.1 $self->{state} = DATA_STATE;
1180     # reconsume
1181    
1182 wakaba 1.33 ## Discard the token.
1183     #!!!emit ($self->{ct}); # start tag or end tag
1184 wakaba 1.1
1185     redo A;
1186     } else {
1187 wakaba 1.11 if ($self->{is_xml}) {
1188     !!!cp (78.1);
1189     ## XML5: Not a parse error.
1190     !!!parse-error (type => 'no attr value'); ## TODO: type
1191     } else {
1192     !!!cp (78.2);
1193     }
1194    
1195 wakaba 1.30 if ({
1196     0x0022 => 1, # "
1197     0x0027 => 1, # '
1198     0x003C => 1, # <
1199     }->{$self->{nc}}) {
1200 wakaba 1.1 !!!cp (78);
1201 wakaba 1.11 ## XML5: Not a parse error.
1202 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1203     } else {
1204     !!!cp (82);
1205     }
1206     $self->{ca}
1207     = {name => chr ($self->{nc}),
1208     value => '',
1209     line => $self->{line}, column => $self->{column}};
1210     $self->{state} = ATTRIBUTE_NAME_STATE;
1211     !!!next-input-character;
1212     redo A;
1213     }
1214     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1215 wakaba 1.11 ## XML5: "Tag attribute value before state".
1216    
1217 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1218     !!!cp (83);
1219     ## Stay in the state
1220     !!!next-input-character;
1221     redo A;
1222     } elsif ($self->{nc} == 0x0022) { # "
1223     !!!cp (84);
1224     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1225     !!!next-input-character;
1226     redo A;
1227     } elsif ($self->{nc} == 0x0026) { # &
1228     !!!cp (85);
1229     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1230     ## reconsume
1231     redo A;
1232     } elsif ($self->{nc} == 0x0027) { # '
1233     !!!cp (86);
1234     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1235     !!!next-input-character;
1236     redo A;
1237     } elsif ($self->{nc} == 0x003E) { # >
1238     !!!parse-error (type => 'empty unquoted attribute value');
1239     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1240     !!!cp (87);
1241     $self->{last_stag_name} = $self->{ct}->{tag_name};
1242     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1243     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1244     if ($self->{ct}->{attributes}) {
1245     !!!cp (88);
1246     !!!parse-error (type => 'end tag attribute');
1247     } else {
1248     ## NOTE: This state should never be reached.
1249     !!!cp (89);
1250     }
1251     } else {
1252     die "$0: $self->{ct}->{type}: Unknown token type";
1253     }
1254     $self->{state} = DATA_STATE;
1255 wakaba 1.5 $self->{s_kwd} = '';
1256 wakaba 1.1 !!!next-input-character;
1257    
1258     !!!emit ($self->{ct}); # start tag or end tag
1259    
1260     redo A;
1261     } elsif ($self->{nc} == -1) {
1262     !!!parse-error (type => 'unclosed tag');
1263     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1264     !!!cp (90);
1265     $self->{last_stag_name} = $self->{ct}->{tag_name};
1266     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1267     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1268     if ($self->{ct}->{attributes}) {
1269     !!!cp (91);
1270     !!!parse-error (type => 'end tag attribute');
1271     } else {
1272     ## NOTE: This state should never be reached.
1273     !!!cp (92);
1274     }
1275     } else {
1276     die "$0: $self->{ct}->{type}: Unknown token type";
1277     }
1278     $self->{state} = DATA_STATE;
1279 wakaba 1.5 $self->{s_kwd} = '';
1280 wakaba 1.1 ## reconsume
1281    
1282 wakaba 1.33 ## Discard the token.
1283     #!!!emit ($self->{ct}); # start tag or end tag
1284 wakaba 1.1
1285     redo A;
1286     } else {
1287 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1288 wakaba 1.1 !!!cp (93);
1289 wakaba 1.11 ## XML5: Not a parse error.
1290 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1291 wakaba 1.11 } elsif ($self->{is_xml}) {
1292     !!!cp (93.1);
1293     ## XML5: No parse error.
1294     !!!parse-error (type => 'unquoted attr value'); ## TODO
1295 wakaba 1.1 } else {
1296     !!!cp (94);
1297     }
1298     $self->{ca}->{value} .= chr ($self->{nc});
1299     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1300     !!!next-input-character;
1301     redo A;
1302     }
1303     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1304 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1305     ## ATTLIST attribute value double quoted state".
1306 wakaba 1.11
1307 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1308 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1309     !!!cp (95.1);
1310     ## XML5: "DOCTYPE ATTLIST name after state".
1311     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1312     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1313     } else {
1314     !!!cp (95);
1315     ## XML5: "Tag attribute name before state".
1316     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1317     }
1318 wakaba 1.1 !!!next-input-character;
1319     redo A;
1320     } elsif ($self->{nc} == 0x0026) { # &
1321     !!!cp (96);
1322 wakaba 1.11 ## XML5: Not defined yet.
1323    
1324 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1325     ## "entity in attribute value state". In this implementation, the
1326     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1327     ## implementation of the "consume a character reference" algorithm.
1328     $self->{prev_state} = $self->{state};
1329     $self->{entity_add} = 0x0022; # "
1330     $self->{state} = ENTITY_STATE;
1331     !!!next-input-character;
1332     redo A;
1333 wakaba 1.25 } elsif ($self->{is_xml} and
1334     $is_space->{$self->{nc}}) {
1335     !!!cp (97.1);
1336     $self->{ca}->{value} .= ' ';
1337     ## Stay in the state.
1338     !!!next-input-character;
1339     redo A;
1340 wakaba 1.1 } elsif ($self->{nc} == -1) {
1341     !!!parse-error (type => 'unclosed attribute value');
1342     if ($self->{ct}->{type} == START_TAG_TOKEN) {
         ## EOF inside a double-quoted attribute value.  The 'unclosed
         ## attribute value' error has already been reported; what remains
         ## depends on the kind of token under construction.  In every case
         ## the unfinished token is dropped and the EOF character is
         ## reconsumed in the state the tokenizer falls back to.
1343     !!!cp (97);
1344     $self->{last_stag_name} = $self->{ct}->{tag_name};
1345 wakaba 1.15
1346     $self->{state} = DATA_STATE;
1347     $self->{s_kwd} = '';
1348     ## reconsume
         ## Discard the token.  An unclosed start tag at EOF is not
         ## emitted; this matches the start tag handling of the
         ## single-quoted and unquoted attribute value states.
1349     #!!!emit ($self->{ct}); # start tag
1350     redo A;
1351 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1352     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1353     if ($self->{ct}->{attributes}) {
1354     !!!cp (98);
1355     !!!parse-error (type => 'end tag attribute');
1356     } else {
1357     ## NOTE: This state should never be reached.
1358     !!!cp (99);
1359     }
1360 wakaba 1.15
1361     $self->{state} = DATA_STATE;
1362     $self->{s_kwd} = '';
1363     ## reconsume
1364 wakaba 1.33
1365     ## Discard the token.
1366     #!!!emit ($self->{ct}); # end tag
1367    
1368 wakaba 1.15 redo A;
1369     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1370     ## XML5: No parse error above; not defined yet.
         ## The pending attribute definition is still appended to the
         ## token, but the token itself is dropped below; tokenization
         ## resumes in the internal subset.
1371     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1372     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1373     ## Reconsume.
1374 wakaba 1.33
1375     ## Discard the token.
1376     #!!!emit ($self->{ct}); # ATTLIST
1377    
1378 wakaba 1.15 redo A;
1379 wakaba 1.1 } else {
1380     die "$0: $self->{ct}->{type}: Unknown token type";
1381     }
1382     } else {
1383 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1384 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1385     !!!cp (100);
1386     ## XML5: Not a parse error.
1387     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1388     } else {
1389     !!!cp (100.1);
1390     }
1391 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1392     $self->{read_until}->($self->{ca}->{value},
1393 wakaba 1.25 qq["&<\x09\x0C\x20],
1394 wakaba 1.1 length $self->{ca}->{value});
1395    
1396     ## Stay in the state
1397     !!!next-input-character;
1398     redo A;
1399     }
1400     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1401 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1402     ## ATTLIST attribute value single quoted state".
1403 wakaba 1.11
1404 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1405 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1406     !!!cp (101.1);
1407     ## XML5: "DOCTYPE ATTLIST name after state".
1408     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1409     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1410     } else {
1411     !!!cp (101);
1412     ## XML5: "Before attribute name state" (sic).
1413     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1414     }
1415 wakaba 1.1 !!!next-input-character;
1416     redo A;
1417     } elsif ($self->{nc} == 0x0026) { # &
1418     !!!cp (102);
1419 wakaba 1.11 ## XML5: Not defined yet.
1420    
1421 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1422     ## "entity in attribute value state". In this implementation, the
1423     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1424     ## implementation of the "consume a character reference" algorithm.
1425     $self->{entity_add} = 0x0027; # '
1426     $self->{prev_state} = $self->{state};
1427     $self->{state} = ENTITY_STATE;
1428     !!!next-input-character;
1429     redo A;
1430 wakaba 1.25 } elsif ($self->{is_xml} and
1431     $is_space->{$self->{nc}}) {
1432     !!!cp (103.1);
1433     $self->{ca}->{value} .= ' ';
1434     ## Stay in the state.
1435     !!!next-input-character;
1436     redo A;
1437 wakaba 1.1 } elsif ($self->{nc} == -1) {
1438     !!!parse-error (type => 'unclosed attribute value');
1439     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1440     !!!cp (103);
1441     $self->{last_stag_name} = $self->{ct}->{tag_name};
1442 wakaba 1.15
1443     $self->{state} = DATA_STATE;
1444     $self->{s_kwd} = '';
1445     ## reconsume
1446 wakaba 1.33
1447     ## Discard the token.
1448     #!!!emit ($self->{ct}); # start tag
1449    
1450 wakaba 1.15 redo A;
1451 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1452     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1453     if ($self->{ct}->{attributes}) {
1454     !!!cp (104);
1455     !!!parse-error (type => 'end tag attribute');
1456     } else {
1457     ## NOTE: This state should never be reached.
1458     !!!cp (105);
1459     }
1460 wakaba 1.15
1461     $self->{state} = DATA_STATE;
1462     $self->{s_kwd} = '';
1463     ## reconsume
1464 wakaba 1.33
1465     ## Discard the token.
1466     #!!!emit ($self->{ct}); # end tag
1467    
1468 wakaba 1.15 redo A;
1469     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1470     ## XML5: No parse error above; not defined yet.
1471     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1472     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1473     ## Reconsume.
1474 wakaba 1.33
1475     ## Discard the token.
1476     #!!!emit ($self->{ct}); # ATTLIST
1477    
1478 wakaba 1.15 redo A;
1479 wakaba 1.1 } else {
1480     die "$0: $self->{ct}->{type}: Unknown token type";
1481     }
1482     } else {
1483 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1484 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1485     !!!cp (106);
1486     ## XML5: Not a parse error.
1487     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1488     } else {
1489     !!!cp (106.1);
1490     }
1491 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1492     $self->{read_until}->($self->{ca}->{value},
1493 wakaba 1.25 qq['&<\x09\x0C\x20],
1494 wakaba 1.1 length $self->{ca}->{value});
1495    
1496     ## Stay in the state
1497     !!!next-input-character;
1498     redo A;
1499     }
1500     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1501 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1502    
1503 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1504 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1505     !!!cp (107.1);
1506     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1507     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1508     } else {
1509     !!!cp (107);
1510     ## XML5: "Tag attribute name before state".
1511     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1512     }
1513 wakaba 1.1 !!!next-input-character;
1514     redo A;
1515     } elsif ($self->{nc} == 0x0026) { # &
1516     !!!cp (108);
1517 wakaba 1.11
1518     ## XML5: Not defined yet.
1519    
1520 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1521     ## "entity in attribute value state". In this implementation, the
1522     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1523     ## implementation of the "consume a character reference" algorithm.
1524     $self->{entity_add} = -1;
1525     $self->{prev_state} = $self->{state};
1526     $self->{state} = ENTITY_STATE;
1527     !!!next-input-character;
1528     redo A;
1529     } elsif ($self->{nc} == 0x003E) { # >
1530     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1531     !!!cp (109);
1532     $self->{last_stag_name} = $self->{ct}->{tag_name};
1533 wakaba 1.15
1534     $self->{state} = DATA_STATE;
1535     $self->{s_kwd} = '';
1536     !!!next-input-character;
1537     !!!emit ($self->{ct}); # start tag
1538     redo A;
1539 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1540     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1541     if ($self->{ct}->{attributes}) {
1542     !!!cp (110);
1543     !!!parse-error (type => 'end tag attribute');
1544     } else {
1545     ## NOTE: This state should never be reached.
1546     !!!cp (111);
1547     }
1548 wakaba 1.15
1549     $self->{state} = DATA_STATE;
1550     $self->{s_kwd} = '';
1551     !!!next-input-character;
1552     !!!emit ($self->{ct}); # end tag
1553     redo A;
1554     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1555     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1556     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1557     !!!next-input-character;
1558     !!!emit ($self->{ct}); # ATTLIST
1559     redo A;
1560 wakaba 1.1 } else {
1561     die "$0: $self->{ct}->{type}: Unknown token type";
1562     }
1563     } elsif ($self->{nc} == -1) {
1564     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1565     !!!cp (112);
1566 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1567 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1568 wakaba 1.15
1569     $self->{state} = DATA_STATE;
1570     $self->{s_kwd} = '';
1571     ## reconsume
1572 wakaba 1.33
1573     ## Discard the token.
1574     #!!!emit ($self->{ct}); # start tag
1575    
1576 wakaba 1.15 redo A;
1577 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1579 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1580     if ($self->{ct}->{attributes}) {
1581     !!!cp (113);
1582     !!!parse-error (type => 'end tag attribute');
1583     } else {
1584     ## NOTE: This state should never be reached.
1585     !!!cp (114);
1586     }
1587 wakaba 1.15
1588     $self->{state} = DATA_STATE;
1589     $self->{s_kwd} = '';
1590     ## reconsume
1591 wakaba 1.33
1592     ## Discard the token.
1593     #!!!emit ($self->{ct}); # end tag
1594    
1595 wakaba 1.15 redo A;
1596     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1597     !!!parse-error (type => 'unclosed md'); ## TODO: type
1598     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1599     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1600     ## Reconsume.
1601 wakaba 1.33
1602     ## Discard the token.
1603     #!!!emit ($self->{ct}); # ATTLIST
1604    
1605 wakaba 1.15 redo A;
1606 wakaba 1.1 } else {
1607     die "$0: $self->{ct}->{type}: Unknown token type";
1608     }
1609     } else {
1610     if ({
         ## "Anything else" in the unquoted attribute value state: the
         ## character is appended to the value; the four characters
         ## listed here additionally raise a parse error.
1611     0x0022 => 1, # "
1612     0x0027 => 1, # '
1613     0x003D => 1, # =
1614 wakaba 1.26 0x003C => 1, # <
1615 wakaba 1.1 }->{$self->{nc}}) {
1616     !!!cp (115);
1617 wakaba 1.11 ## XML5: Not a parse error.
1618 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1619     } else {
1620     !!!cp (116);
1621     }
1622     $self->{ca}->{value} .= chr ($self->{nc});
         ## Bulk-append the following run of ordinary characters.  The
         ## second argument is the set of characters that must stop the
         ## run so that they are handled one at a time by this state.
         ## NOTE(review): assumes |read_until| treats it as such a stop
         ## set, as the other call sites suggest -- confirm.  "<" has to
         ## be in the set: it is in the error list above, and without it
         ## a "<" after the first character of the value would be
         ## consumed here without the 'bad attribute value' error.
1623     $self->{read_until}->($self->{ca}->{value},
1624 wakaba 1.25 qq["'=&< \x09\x0C>],
1625 wakaba 1.1 length $self->{ca}->{value});
1626    
1627     ## Stay in the state
1628     !!!next-input-character;
1629     redo A;
1630     }
1631     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1632     if ($is_space->{$self->{nc}}) {
1633     !!!cp (118);
1634     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1635     !!!next-input-character;
1636     redo A;
1637     } elsif ($self->{nc} == 0x003E) { # >
1638     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1639     !!!cp (119);
1640     $self->{last_stag_name} = $self->{ct}->{tag_name};
1641     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1642     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1643     if ($self->{ct}->{attributes}) {
1644     !!!cp (120);
1645     !!!parse-error (type => 'end tag attribute');
1646     } else {
1647     ## NOTE: This state should never be reached.
1648     !!!cp (121);
1649     }
1650     } else {
1651     die "$0: $self->{ct}->{type}: Unknown token type";
1652     }
1653     $self->{state} = DATA_STATE;
1654 wakaba 1.5 $self->{s_kwd} = '';
1655 wakaba 1.1 !!!next-input-character;
1656    
1657     !!!emit ($self->{ct}); # start tag or end tag
1658    
1659     redo A;
1660     } elsif ($self->{nc} == 0x002F) { # /
1661     !!!cp (122);
1662     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1663     !!!next-input-character;
1664     redo A;
1665     } elsif ($self->{nc} == -1) {
1666     !!!parse-error (type => 'unclosed tag');
1667     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1668     !!!cp (122.3);
1669     $self->{last_stag_name} = $self->{ct}->{tag_name};
1670     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1671     if ($self->{ct}->{attributes}) {
1672     !!!cp (122.1);
1673     !!!parse-error (type => 'end tag attribute');
1674     } else {
1675     ## NOTE: This state should never be reached.
1676     !!!cp (122.2);
1677     }
1678     } else {
1679     die "$0: $self->{ct}->{type}: Unknown token type";
1680     }
1681     $self->{state} = DATA_STATE;
1682 wakaba 1.5 $self->{s_kwd} = '';
1683 wakaba 1.1 ## Reconsume.
1684 wakaba 1.33
1685     ## Discard the token.
1686     #!!!emit ($self->{ct}); # start tag or end tag
1687    
1688 wakaba 1.1 redo A;
1689     } else {
1690     !!!cp ('124.1');
1691     !!!parse-error (type => 'no space between attributes');
1692     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1693     ## reconsume
1694     redo A;
1695     }
1696     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1697 wakaba 1.11 ## XML5: "Empty tag state".
1698    
1699 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1700     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1701     !!!cp ('124.2');
1702     !!!parse-error (type => 'nestc', token => $self->{ct});
1703     ## TODO: Different type than slash in start tag
1704     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1705     if ($self->{ct}->{attributes}) {
1706     !!!cp ('124.4');
1707     !!!parse-error (type => 'end tag attribute');
1708     } else {
1709     !!!cp ('124.5');
1710     }
1711     ## TODO: Test |<title></title/>|
1712     } else {
1713     !!!cp ('124.3');
1714     $self->{self_closing} = 1;
1715     }
1716    
1717     $self->{state} = DATA_STATE;
1718 wakaba 1.5 $self->{s_kwd} = '';
1719 wakaba 1.1 !!!next-input-character;
1720    
1721     !!!emit ($self->{ct}); # start tag or end tag
1722    
1723     redo A;
1724     } elsif ($self->{nc} == -1) {
1725     !!!parse-error (type => 'unclosed tag');
1726     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1727     !!!cp (124.7);
1728     $self->{last_stag_name} = $self->{ct}->{tag_name};
1729     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1730     if ($self->{ct}->{attributes}) {
1731     !!!cp (124.5);
1732     !!!parse-error (type => 'end tag attribute');
1733     } else {
1734     ## NOTE: This state should never be reached.
1735     !!!cp (124.6);
1736     }
1737     } else {
1738     die "$0: $self->{ct}->{type}: Unknown token type";
1739     }
1740 wakaba 1.11 ## XML5: "Tag attribute name before state".
1741 wakaba 1.1 $self->{state} = DATA_STATE;
1742 wakaba 1.5 $self->{s_kwd} = '';
1743 wakaba 1.1 ## Reconsume.
1744 wakaba 1.33
1745     ## Discard the token.
1746     #!!!emit ($self->{ct}); # start tag or end tag
1747    
1748 wakaba 1.1 redo A;
1749     } else {
1750     !!!cp ('124.4');
1751     !!!parse-error (type => 'nestc');
1752     ## TODO: This error type is wrong.
1753     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1754     ## Reconsume.
1755     redo A;
1756     }
1757     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1758 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1759    
1760 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1761     ## consumes characters on a one-by-one basis.
1762    
1763     if ($self->{nc} == 0x003E) { # >
1764 wakaba 1.13 if ($self->{in_subset}) {
1765     !!!cp (123);
1766     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1767     } else {
1768     !!!cp (124);
1769     $self->{state} = DATA_STATE;
1770     $self->{s_kwd} = '';
1771     }
1772 wakaba 1.1 !!!next-input-character;
1773    
1774     !!!emit ($self->{ct}); # comment
1775     redo A;
1776     } elsif ($self->{nc} == -1) {
1777 wakaba 1.13 if ($self->{in_subset}) {
1778     !!!cp (125.1);
1779     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1780     } else {
1781     !!!cp (125);
1782     $self->{state} = DATA_STATE;
1783     $self->{s_kwd} = '';
1784     }
1785 wakaba 1.1 ## reconsume
1786    
1787     !!!emit ($self->{ct}); # comment
1788     redo A;
1789     } else {
1790     !!!cp (126);
1791     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1792     $self->{read_until}->($self->{ct}->{data},
1793     q[>],
1794     length $self->{ct}->{data});
1795    
1796     ## Stay in the state.
1797     !!!next-input-character;
1798     redo A;
1799     }
1800     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1801 wakaba 1.14 ## XML5: "Markup declaration state".
1802 wakaba 1.1
1803     if ($self->{nc} == 0x002D) { # -
1804     !!!cp (133);
1805     $self->{state} = MD_HYPHEN_STATE;
1806     !!!next-input-character;
1807     redo A;
1808     } elsif ($self->{nc} == 0x0044 or # D
1809     $self->{nc} == 0x0064) { # d
1810     ## ASCII case-insensitive.
1811     !!!cp (130);
1812     $self->{state} = MD_DOCTYPE_STATE;
1813 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1814 wakaba 1.1 !!!next-input-character;
1815     redo A;
1816 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1817     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1818     $self->{is_xml}) and
1819 wakaba 1.1 $self->{nc} == 0x005B) { # [
1820     !!!cp (135.4);
1821     $self->{state} = MD_CDATA_STATE;
1822 wakaba 1.12 $self->{kwd} = '[';
1823 wakaba 1.1 !!!next-input-character;
1824     redo A;
1825     } else {
1826     !!!cp (136);
1827     }
1828    
1829     !!!parse-error (type => 'bogus comment',
1830     line => $self->{line_prev},
1831     column => $self->{column_prev} - 1);
1832     ## Reconsume.
1833     $self->{state} = BOGUS_COMMENT_STATE;
1834     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1835     line => $self->{line_prev},
1836     column => $self->{column_prev} - 1,
1837     };
1838     redo A;
1839     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1840     if ($self->{nc} == 0x002D) { # -
1841     !!!cp (127);
1842     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1843     line => $self->{line_prev},
1844     column => $self->{column_prev} - 2,
1845     };
1846 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1847 wakaba 1.1 !!!next-input-character;
1848     redo A;
1849     } else {
1850     !!!cp (128);
1851     !!!parse-error (type => 'bogus comment',
1852     line => $self->{line_prev},
1853     column => $self->{column_prev} - 2);
1854     $self->{state} = BOGUS_COMMENT_STATE;
1855     ## Reconsume.
1856     $self->{ct} = {type => COMMENT_TOKEN,
1857     data => '-',
1858     line => $self->{line_prev},
1859     column => $self->{column_prev} - 2,
1860     };
1861     redo A;
1862     }
1863     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1864     ## ASCII case-insensitive.
1865     if ($self->{nc} == [
1866     undef,
1867     0x004F, # O
1868     0x0043, # C
1869     0x0054, # T
1870     0x0059, # Y
1871     0x0050, # P
1872 wakaba 1.12 ]->[length $self->{kwd}] or
1873 wakaba 1.1 $self->{nc} == [
1874     undef,
1875     0x006F, # o
1876     0x0063, # c
1877     0x0074, # t
1878     0x0079, # y
1879     0x0070, # p
1880 wakaba 1.12 ]->[length $self->{kwd}]) {
1881 wakaba 1.1 !!!cp (131);
1882     ## Stay in the state.
1883 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1884 wakaba 1.1 !!!next-input-character;
1885     redo A;
1886 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1887 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1888     $self->{nc} == 0x0065)) { # e
1889 wakaba 1.12 if ($self->{is_xml} and
1890     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1891 wakaba 1.10 !!!cp (129);
1892     ## XML5: case-sensitive.
1893     !!!parse-error (type => 'lowercase keyword', ## TODO
1894     text => 'DOCTYPE',
1895     line => $self->{line_prev},
1896     column => $self->{column_prev} - 5);
1897     } else {
1898     !!!cp (129.1);
1899     }
1900 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1901     $self->{ct} = {type => DOCTYPE_TOKEN,
1902     quirks => 1,
1903     line => $self->{line_prev},
1904     column => $self->{column_prev} - 7,
1905     };
1906     !!!next-input-character;
1907     redo A;
1908     } else {
1909     !!!cp (132);
1910     !!!parse-error (type => 'bogus comment',
1911     line => $self->{line_prev},
1912 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1913 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1914     ## Reconsume.
1915     $self->{ct} = {type => COMMENT_TOKEN,
1916 wakaba 1.12 data => $self->{kwd},
1917 wakaba 1.1 line => $self->{line_prev},
1918 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1919 wakaba 1.1 };
1920     redo A;
1921     }
1922     } elsif ($self->{state} == MD_CDATA_STATE) {
1923     if ($self->{nc} == {
1924     '[' => 0x0043, # C
1925     '[C' => 0x0044, # D
1926     '[CD' => 0x0041, # A
1927     '[CDA' => 0x0054, # T
1928     '[CDAT' => 0x0041, # A
1929 wakaba 1.12 }->{$self->{kwd}}) {
1930 wakaba 1.1 !!!cp (135.1);
1931     ## Stay in the state.
1932 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1933 wakaba 1.1 !!!next-input-character;
1934     redo A;
1935 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1936 wakaba 1.1 $self->{nc} == 0x005B) { # [
1937 wakaba 1.6 if ($self->{is_xml} and
1938     not $self->{tainted} and
1939     @{$self->{open_elements} or []} == 0) {
1940 wakaba 1.8 !!!cp (135.2);
1941 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1942     line => $self->{line_prev},
1943     column => $self->{column_prev} - 7);
1944     $self->{tainted} = 1;
1945 wakaba 1.8 } else {
1946     !!!cp (135.21);
1947 wakaba 1.6 }
1948    
1949 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1950     data => '',
1951     line => $self->{line_prev},
1952     column => $self->{column_prev} - 7};
1953     $self->{state} = CDATA_SECTION_STATE;
1954     !!!next-input-character;
1955     redo A;
1956     } else {
1957     !!!cp (135.3);
1958     !!!parse-error (type => 'bogus comment',
1959     line => $self->{line_prev},
1960 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1961 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1962     ## Reconsume.
1963     $self->{ct} = {type => COMMENT_TOKEN,
1964 wakaba 1.12 data => $self->{kwd},
1965 wakaba 1.1 line => $self->{line_prev},
1966 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1967 wakaba 1.1 };
1968     redo A;
1969     }
1970     } elsif ($self->{state} == COMMENT_START_STATE) {
1971     if ($self->{nc} == 0x002D) { # -
1972     !!!cp (137);
1973     $self->{state} = COMMENT_START_DASH_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     } elsif ($self->{nc} == 0x003E) { # >
1977     !!!parse-error (type => 'bogus comment');
1978 wakaba 1.13 if ($self->{in_subset}) {
1979     !!!cp (138.1);
1980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981     } else {
1982     !!!cp (138);
1983     $self->{state} = DATA_STATE;
1984     $self->{s_kwd} = '';
1985     }
1986 wakaba 1.1 !!!next-input-character;
1987    
1988     !!!emit ($self->{ct}); # comment
1989    
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (139.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (139);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (140);
2008     $self->{ct}->{data} # comment
2009     .= chr ($self->{nc});
2010     $self->{state} = COMMENT_STATE;
2011     !!!next-input-character;
2012     redo A;
2013     }
2014     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2015     if ($self->{nc} == 0x002D) { # -
2016     !!!cp (141);
2017     $self->{state} = COMMENT_END_STATE;
2018     !!!next-input-character;
2019     redo A;
2020     } elsif ($self->{nc} == 0x003E) { # >
2021     !!!parse-error (type => 'bogus comment');
2022 wakaba 1.13 if ($self->{in_subset}) {
2023     !!!cp (142.1);
2024     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2025     } else {
2026     !!!cp (142);
2027     $self->{state} = DATA_STATE;
2028     $self->{s_kwd} = '';
2029     }
2030 wakaba 1.1 !!!next-input-character;
2031    
2032     !!!emit ($self->{ct}); # comment
2033    
2034     redo A;
2035     } elsif ($self->{nc} == -1) {
2036     !!!parse-error (type => 'unclosed comment');
2037 wakaba 1.13 if ($self->{in_subset}) {
2038     !!!cp (143.1);
2039     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2040     } else {
2041     !!!cp (143);
2042     $self->{state} = DATA_STATE;
2043     $self->{s_kwd} = '';
2044     }
2045 wakaba 1.1 ## reconsume
2046    
2047     !!!emit ($self->{ct}); # comment
2048    
2049     redo A;
2050     } else {
2051     !!!cp (144);
2052     $self->{ct}->{data} # comment
2053     .= '-' . chr ($self->{nc});
2054     $self->{state} = COMMENT_STATE;
2055     !!!next-input-character;
2056     redo A;
2057     }
2058     } elsif ($self->{state} == COMMENT_STATE) {
2059 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2060    
2061 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2062     !!!cp (145);
2063     $self->{state} = COMMENT_END_DASH_STATE;
2064     !!!next-input-character;
2065     redo A;
2066     } elsif ($self->{nc} == -1) {
2067     !!!parse-error (type => 'unclosed comment');
2068 wakaba 1.13 if ($self->{in_subset}) {
2069     !!!cp (146.1);
2070     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2071     } else {
2072     !!!cp (146);
2073     $self->{state} = DATA_STATE;
2074     $self->{s_kwd} = '';
2075     }
2076 wakaba 1.1 ## reconsume
2077    
2078     !!!emit ($self->{ct}); # comment
2079    
2080     redo A;
2081     } else {
2082     !!!cp (147);
2083     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2084     $self->{read_until}->($self->{ct}->{data},
2085     q[-],
2086     length $self->{ct}->{data});
2087    
2088     ## Stay in the state
2089     !!!next-input-character;
2090     redo A;
2091     }
2092     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2093 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2094 wakaba 1.10
2095 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2096     !!!cp (148);
2097     $self->{state} = COMMENT_END_STATE;
2098     !!!next-input-character;
2099     redo A;
2100     } elsif ($self->{nc} == -1) {
2101     !!!parse-error (type => 'unclosed comment');
2102 wakaba 1.13 if ($self->{in_subset}) {
2103     !!!cp (149.1);
2104     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2105     } else {
2106     !!!cp (149);
2107     $self->{state} = DATA_STATE;
2108     $self->{s_kwd} = '';
2109     }
2110 wakaba 1.1 ## reconsume
2111    
2112     !!!emit ($self->{ct}); # comment
2113    
2114     redo A;
2115     } else {
2116     !!!cp (150);
2117     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2118     $self->{state} = COMMENT_STATE;
2119     !!!next-input-character;
2120     redo A;
2121     }
2122 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2123     $self->{state} == COMMENT_END_BANG_STATE) {
2124 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2125 wakaba 1.31 ## (No comment end bang state.)
2126 wakaba 1.14
2127 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2128 wakaba 1.13 if ($self->{in_subset}) {
2129     !!!cp (151.1);
2130     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2131     } else {
2132     !!!cp (151);
2133     $self->{state} = DATA_STATE;
2134     $self->{s_kwd} = '';
2135     }
2136 wakaba 1.1 !!!next-input-character;
2137    
2138     !!!emit ($self->{ct}); # comment
2139    
2140     redo A;
2141     } elsif ($self->{nc} == 0x002D) { # -
2142 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2143     !!!cp (154.3);
2144     $self->{ct}->{data} .= '--!'; # comment
2145     $self->{state} = COMMENT_END_DASH_STATE;
2146     } else {
2147     !!!cp (152);
2148     ## XML5: Not a parse error.
2149     !!!parse-error (type => 'dash in comment',
2150     line => $self->{line_prev},
2151     column => $self->{column_prev});
2152     $self->{ct}->{data} .= '-'; # comment
2153     ## Stay in the state
2154     }
2155     !!!next-input-character;
2156     redo A;
2157 wakaba 1.32 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2158     $is_space->{$self->{nc}}) {
2159     !!!cp (152.1);
2160     !!!parse-error (type => 'comment end space'); # XXX error type
2161     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2162     $self->{state} = COMMENT_END_SPACE_STATE;
2163     !!!next-input-character;
2164     redo A;
2165     } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2166     $self->{nc} == 0x0021) { # !
2167     !!!cp (152.2);
2168 wakaba 1.31 !!!parse-error (type => 'comment end bang'); # XXX error type
2169     $self->{state} = COMMENT_END_BANG_STATE;
2170 wakaba 1.1 !!!next-input-character;
2171     redo A;
2172     } elsif ($self->{nc} == -1) {
2173     !!!parse-error (type => 'unclosed comment');
2174 wakaba 1.13 if ($self->{in_subset}) {
2175     !!!cp (153.1);
2176     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2177     } else {
2178     !!!cp (153);
2179     $self->{state} = DATA_STATE;
2180     $self->{s_kwd} = '';
2181     }
2182 wakaba 1.31 ## Reconsume.
2183 wakaba 1.1
2184     !!!emit ($self->{ct}); # comment
2185    
2186     redo A;
2187     } else {
2188     !!!cp (154);
2189 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2190     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2191     } else {
2192     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2193     }
2194 wakaba 1.1 $self->{state} = COMMENT_STATE;
2195     !!!next-input-character;
2196     redo A;
2197     }
2198 wakaba 1.32 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2199     ## XML5: Does not exist.
2200    
2201     if ($self->{nc} == 0x003E) { # >
2202     if ($self->{in_subset}) {
2203     !!!cp (154.4);
2204     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2205     } else {
2206     !!!cp (154.5);
2207     $self->{state} = DATA_STATE;
2208     $self->{s_kwd} = '';
2209     }
2210     !!!next-input-character;
2211    
2212     !!!emit ($self->{ct}); # comment
2213    
2214     redo A;
2215     } elsif ($is_space->{$self->{nc}}) {
2216     !!!cp (154.6);
2217     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2218     ## Stay in the state.
2219     !!!next-input-character;
2220     redo A;
2221     } elsif ($self->{nc} == -1) {
2222     !!!parse-error (type => 'unclosed comment');
2223     if ($self->{in_subset}) {
2224     !!!cp (154.7);
2225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2226     } else {
2227     !!!cp (154.8);
2228     $self->{state} = DATA_STATE;
2229     $self->{s_kwd} = '';
2230     }
2231     ## Reconsume.
2232    
2233     !!!emit ($self->{ct}); # comment
2234    
2235     redo A;
2236     } else {
2237     !!!cp (154.9);
2238     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2239     $self->{state} = COMMENT_STATE;
2240     !!!next-input-character;
2241     redo A;
2242     }
2243 wakaba 1.1 } elsif ($self->{state} == DOCTYPE_STATE) {
2244     if ($is_space->{$self->{nc}}) {
2245     !!!cp (155);
2246     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2247     !!!next-input-character;
2248     redo A;
2249 wakaba 1.28 } elsif ($self->{nc} == -1) {
2250     !!!cp (155.1);
2251     !!!parse-error (type => 'unclosed DOCTYPE');
2252     $self->{ct}->{quirks} = 1;
2253    
2254     $self->{state} = DATA_STATE;
2255     ## Reconsume.
2256     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2257    
2258     redo A;
2259 wakaba 1.1 } else {
2260     !!!cp (156);
2261 wakaba 1.28 ## XML5: Switch to the bogus comment state.
2262 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2263     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2264     ## reconsume
2265     redo A;
2266     }
2267     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2268 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2269    
2270 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2271     !!!cp (157);
2272     ## Stay in the state
2273     !!!next-input-character;
2274     redo A;
2275     } elsif ($self->{nc} == 0x003E) { # >
2276     !!!cp (158);
2277 wakaba 1.12 ## XML5: No parse error.
2278 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2279     $self->{state} = DATA_STATE;
2280 wakaba 1.5 $self->{s_kwd} = '';
2281 wakaba 1.1 !!!next-input-character;
2282    
2283     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2284    
2285     redo A;
2286 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2287     !!!cp (158.1);
2288     $self->{ct}->{name} # DOCTYPE
2289     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2290     delete $self->{ct}->{quirks};
2291     $self->{state} = DOCTYPE_NAME_STATE;
2292     !!!next-input-character;
2293     redo A;
2294 wakaba 1.1 } elsif ($self->{nc} == -1) {
2295     !!!cp (159);
2296     !!!parse-error (type => 'no DOCTYPE name');
2297     $self->{state} = DATA_STATE;
2298 wakaba 1.5 $self->{s_kwd} = '';
2299 wakaba 1.1 ## reconsume
2300    
2301     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2302    
2303     redo A;
2304 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2305     !!!cp (159.1);
2306     !!!parse-error (type => 'no DOCTYPE name');
2307     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2308 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2309     $self->{in_subset} = 1;
2310 wakaba 1.12 !!!next-input-character;
2311 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2312 wakaba 1.12 redo A;
2313 wakaba 1.1 } else {
2314     !!!cp (160);
2315     $self->{ct}->{name} = chr $self->{nc};
2316     delete $self->{ct}->{quirks};
2317     $self->{state} = DOCTYPE_NAME_STATE;
2318     !!!next-input-character;
2319     redo A;
2320     }
2321     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2322 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2323    
2324     ## ISSUE: Redundant "First," in the spec.
2325    
2326 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2327     !!!cp (161);
2328     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2329     !!!next-input-character;
2330     redo A;
2331     } elsif ($self->{nc} == 0x003E) { # >
2332     !!!cp (162);
2333     $self->{state} = DATA_STATE;
2334 wakaba 1.5 $self->{s_kwd} = '';
2335 wakaba 1.1 !!!next-input-character;
2336    
2337     !!!emit ($self->{ct}); # DOCTYPE
2338    
2339     redo A;
2340 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2341     !!!cp (162.1);
2342     $self->{ct}->{name} # DOCTYPE
2343     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2344     delete $self->{ct}->{quirks};
2345     ## Stay in the state.
2346     !!!next-input-character;
2347     redo A;
2348 wakaba 1.1 } elsif ($self->{nc} == -1) {
2349     !!!cp (163);
2350     !!!parse-error (type => 'unclosed DOCTYPE');
2351     $self->{state} = DATA_STATE;
2352 wakaba 1.5 $self->{s_kwd} = '';
2353 wakaba 1.1 ## reconsume
2354    
2355     $self->{ct}->{quirks} = 1;
2356     !!!emit ($self->{ct}); # DOCTYPE
2357    
2358     redo A;
2359 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2360     !!!cp (163.1);
2361     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2362 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2363     $self->{in_subset} = 1;
2364 wakaba 1.12 !!!next-input-character;
2365 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2366 wakaba 1.12 redo A;
2367 wakaba 1.1 } else {
2368     !!!cp (164);
2369 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2370     ## Stay in the state.
2371 wakaba 1.1 !!!next-input-character;
2372     redo A;
2373     }
2374     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2375 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2376     ## state", but implemented differently.
2377    
2378 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2379     !!!cp (165);
2380     ## Stay in the state
2381     !!!next-input-character;
2382     redo A;
2383     } elsif ($self->{nc} == 0x003E) { # >
2384 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2385     !!!cp (166);
2386     $self->{state} = DATA_STATE;
2387     $self->{s_kwd} = '';
2388     } else {
2389     !!!cp (166.1);
2390     !!!parse-error (type => 'no md def'); ## TODO: type
2391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2392     }
2393    
2394 wakaba 1.1 !!!next-input-character;
2395 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2396 wakaba 1.1 redo A;
2397     } elsif ($self->{nc} == -1) {
2398 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2399     !!!cp (167);
2400     !!!parse-error (type => 'unclosed DOCTYPE');
2401     $self->{state} = DATA_STATE;
2402     $self->{s_kwd} = '';
2403     $self->{ct}->{quirks} = 1;
2404     } else {
2405     !!!cp (167.12);
2406     !!!parse-error (type => 'unclosed md'); ## TODO: type
2407     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2408     }
2409    
2410     ## Reconsume.
2411     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2412 wakaba 1.1 redo A;
2413     } elsif ($self->{nc} == 0x0050 or # P
2414     $self->{nc} == 0x0070) { # p
2415 wakaba 1.12 !!!cp (167.1);
2416 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2417 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2418 wakaba 1.1 !!!next-input-character;
2419     redo A;
2420     } elsif ($self->{nc} == 0x0053 or # S
2421     $self->{nc} == 0x0073) { # s
2422 wakaba 1.12 !!!cp (167.2);
2423 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2424 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2425     !!!next-input-character;
2426     redo A;
2427 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2428     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2429     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2430     !!!cp (167.21);
2431     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2432     $self->{ct}->{value} = ''; # ENTITY
2433     !!!next-input-character;
2434     redo A;
2435     } elsif ($self->{nc} == 0x0027 and # '
2436     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2437     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2438     !!!cp (167.22);
2439     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2440     $self->{ct}->{value} = ''; # ENTITY
2441     !!!next-input-character;
2442     redo A;
2443 wakaba 1.16 } elsif ($self->{is_xml} and
2444     $self->{ct}->{type} == DOCTYPE_TOKEN and
2445     $self->{nc} == 0x005B) { # [
2446 wakaba 1.12 !!!cp (167.3);
2447     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2448     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2449 wakaba 1.13 $self->{in_subset} = 1;
2450 wakaba 1.1 !!!next-input-character;
2451 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2452 wakaba 1.1 redo A;
2453     } else {
2454 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2455    
2456     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2457     !!!cp (180);
2458     $self->{ct}->{quirks} = 1;
2459     $self->{state} = BOGUS_DOCTYPE_STATE;
2460     } else {
2461     !!!cp (180.1);
2462     $self->{state} = BOGUS_MD_STATE;
2463     }
2464 wakaba 1.1
2465     !!!next-input-character;
2466     redo A;
2467     }
2468     } elsif ($self->{state} == PUBLIC_STATE) {
2469     ## ASCII case-insensitive
2470     if ($self->{nc} == [
2471     undef,
2472     0x0055, # U
2473     0x0042, # B
2474     0x004C, # L
2475     0x0049, # I
2476 wakaba 1.12 ]->[length $self->{kwd}] or
2477 wakaba 1.1 $self->{nc} == [
2478     undef,
2479     0x0075, # u
2480     0x0062, # b
2481     0x006C, # l
2482     0x0069, # i
2483 wakaba 1.12 ]->[length $self->{kwd}]) {
2484 wakaba 1.1 !!!cp (175);
2485     ## Stay in the state.
2486 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2487 wakaba 1.1 !!!next-input-character;
2488     redo A;
2489 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2490 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2491     $self->{nc} == 0x0063)) { # c
2492 wakaba 1.12 if ($self->{is_xml} and
2493     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2494     !!!cp (168.1);
2495     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2496     text => 'PUBLIC',
2497     line => $self->{line_prev},
2498     column => $self->{column_prev} - 4);
2499     } else {
2500     !!!cp (168);
2501     }
2502 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2503     !!!next-input-character;
2504     redo A;
2505     } else {
2506 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2507 wakaba 1.1 line => $self->{line_prev},
2508 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2509 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2510     !!!cp (169);
2511     $self->{ct}->{quirks} = 1;
2512     $self->{state} = BOGUS_DOCTYPE_STATE;
2513     } else {
2514     !!!cp (169.1);
2515     $self->{state} = BOGUS_MD_STATE;
2516     }
2517 wakaba 1.1 ## Reconsume.
2518     redo A;
2519     }
2520     } elsif ($self->{state} == SYSTEM_STATE) {
2521     ## ASCII case-insensitive
2522     if ($self->{nc} == [
2523     undef,
2524     0x0059, # Y
2525     0x0053, # S
2526     0x0054, # T
2527     0x0045, # E
2528 wakaba 1.12 ]->[length $self->{kwd}] or
2529 wakaba 1.1 $self->{nc} == [
2530     undef,
2531     0x0079, # y
2532     0x0073, # s
2533     0x0074, # t
2534     0x0065, # e
2535 wakaba 1.12 ]->[length $self->{kwd}]) {
2536 wakaba 1.1 !!!cp (170);
2537     ## Stay in the state.
2538 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2539 wakaba 1.1 !!!next-input-character;
2540     redo A;
2541 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2542 wakaba 1.1 ($self->{nc} == 0x004D or # M
2543     $self->{nc} == 0x006D)) { # m
2544 wakaba 1.12 if ($self->{is_xml} and
2545     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2546     !!!cp (171.1);
2547     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2548     text => 'SYSTEM',
2549     line => $self->{line_prev},
2550     column => $self->{column_prev} - 4);
2551     } else {
2552     !!!cp (171);
2553     }
2554 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2555     !!!next-input-character;
2556     redo A;
2557     } else {
2558 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2559 wakaba 1.1 line => $self->{line_prev},
2560 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2561 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2562     !!!cp (172);
2563     $self->{ct}->{quirks} = 1;
2564     $self->{state} = BOGUS_DOCTYPE_STATE;
2565     } else {
2566     !!!cp (172.1);
2567     $self->{state} = BOGUS_MD_STATE;
2568     }
2569 wakaba 1.1 ## Reconsume.
2570     redo A;
2571     }
2572     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2573     if ($is_space->{$self->{nc}}) {
2574     !!!cp (181);
2575     ## Stay in the state
2576     !!!next-input-character;
2577     redo A;
2578     } elsif ($self->{nc} eq 0x0022) { # "
2579     !!!cp (182);
2580     $self->{ct}->{pubid} = ''; # DOCTYPE
2581     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2582     !!!next-input-character;
2583     redo A;
2584     } elsif ($self->{nc} eq 0x0027) { # '
2585     !!!cp (183);
2586     $self->{ct}->{pubid} = ''; # DOCTYPE
2587     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2588     !!!next-input-character;
2589     redo A;
2590     } elsif ($self->{nc} eq 0x003E) { # >
2591     !!!parse-error (type => 'no PUBLIC literal');
2592 wakaba 1.16
2593     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2594     !!!cp (184);
2595     $self->{state} = DATA_STATE;
2596     $self->{s_kwd} = '';
2597     $self->{ct}->{quirks} = 1;
2598     } else {
2599     !!!cp (184.1);
2600     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2601     }
2602    
2603 wakaba 1.1 !!!next-input-character;
2604 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2605 wakaba 1.1 redo A;
2606     } elsif ($self->{nc} == -1) {
2607 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2608     !!!cp (185);
2609     !!!parse-error (type => 'unclosed DOCTYPE');
2610     $self->{state} = DATA_STATE;
2611     $self->{s_kwd} = '';
2612     $self->{ct}->{quirks} = 1;
2613     } else {
2614     !!!cp (185.1);
2615     !!!parse-error (type => 'unclosed md'); ## TODO: type
2616     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2617     }
2618    
2619 wakaba 1.1 ## reconsume
2620     !!!emit ($self->{ct}); # DOCTYPE
2621     redo A;
2622 wakaba 1.16 } elsif ($self->{is_xml} and
2623     $self->{ct}->{type} == DOCTYPE_TOKEN and
2624     $self->{nc} == 0x005B) { # [
2625 wakaba 1.12 !!!cp (186.1);
2626     !!!parse-error (type => 'no PUBLIC literal');
2627     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2628     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2629 wakaba 1.13 $self->{in_subset} = 1;
2630 wakaba 1.12 !!!next-input-character;
2631 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2632 wakaba 1.12 redo A;
2633 wakaba 1.1 } else {
2634     !!!parse-error (type => 'string after PUBLIC');
2635    
2636 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2637     !!!cp (186);
2638     $self->{ct}->{quirks} = 1;
2639     $self->{state} = BOGUS_DOCTYPE_STATE;
2640     } else {
2641     !!!cp (186.2);
2642     $self->{state} = BOGUS_MD_STATE;
2643     }
2644    
2645 wakaba 1.1 !!!next-input-character;
2646     redo A;
2647     }
2648     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2649     if ($self->{nc} == 0x0022) { # "
2650     !!!cp (187);
2651     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2652     !!!next-input-character;
2653     redo A;
2654     } elsif ($self->{nc} == 0x003E) { # >
2655     !!!parse-error (type => 'unclosed PUBLIC literal');
2656    
2657 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2658     !!!cp (188);
2659     $self->{state} = DATA_STATE;
2660     $self->{s_kwd} = '';
2661     $self->{ct}->{quirks} = 1;
2662     } else {
2663     !!!cp (188.1);
2664     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2665     }
2666    
2667 wakaba 1.1 !!!next-input-character;
2668 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2669 wakaba 1.1 redo A;
2670     } elsif ($self->{nc} == -1) {
2671     !!!parse-error (type => 'unclosed PUBLIC literal');
2672    
2673 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2674     !!!cp (189);
2675     $self->{state} = DATA_STATE;
2676     $self->{s_kwd} = '';
2677     $self->{ct}->{quirks} = 1;
2678     } else {
2679     !!!cp (189.1);
2680     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2681     }
2682    
2683     ## Reconsume.
2684 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2685     redo A;
2686     } else {
2687     !!!cp (190);
2688 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2689 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2690     length $self->{ct}->{pubid});
2691    
2692     ## Stay in the state
2693     !!!next-input-character;
2694     redo A;
2695     }
2696     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2697     if ($self->{nc} == 0x0027) { # '
2698     !!!cp (191);
2699     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2700     !!!next-input-character;
2701     redo A;
2702     } elsif ($self->{nc} == 0x003E) { # >
2703     !!!parse-error (type => 'unclosed PUBLIC literal');
2704    
2705 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2706     !!!cp (192);
2707     $self->{state} = DATA_STATE;
2708     $self->{s_kwd} = '';
2709     $self->{ct}->{quirks} = 1;
2710     } else {
2711     !!!cp (192.1);
2712     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2713     }
2714    
2715 wakaba 1.1 !!!next-input-character;
2716 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2717 wakaba 1.1 redo A;
2718     } elsif ($self->{nc} == -1) {
2719     !!!parse-error (type => 'unclosed PUBLIC literal');
2720    
2721 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2722     !!!cp (193);
2723     $self->{state} = DATA_STATE;
2724     $self->{s_kwd} = '';
2725     $self->{ct}->{quirks} = 1;
2726     } else {
2727     !!!cp (193.1);
2728     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2729     }
2730    
2731 wakaba 1.1 ## reconsume
2732 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2733 wakaba 1.1 redo A;
2734     } else {
2735     !!!cp (194);
2736 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2737 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2738     length $self->{ct}->{pubid});
2739    
2740     ## Stay in the state
2741     !!!next-input-character;
2742     redo A;
2743     }
2744     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2745     if ($is_space->{$self->{nc}}) {
2746     !!!cp (195);
2747     ## Stay in the state
2748     !!!next-input-character;
2749     redo A;
2750     } elsif ($self->{nc} == 0x0022) { # "
2751     !!!cp (196);
2752 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2753 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2754     !!!next-input-character;
2755     redo A;
2756     } elsif ($self->{nc} == 0x0027) { # '
2757     !!!cp (197);
2758 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2759 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2760     !!!next-input-character;
2761     redo A;
2762     } elsif ($self->{nc} == 0x003E) { # >
2763 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2764     if ($self->{is_xml}) {
2765     !!!cp (198.1);
2766     !!!parse-error (type => 'no SYSTEM literal');
2767     } else {
2768     !!!cp (198);
2769     }
2770     $self->{state} = DATA_STATE;
2771     $self->{s_kwd} = '';
2772 wakaba 1.12 } else {
2773 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2774     !!!cp (198.2);
2775     } else {
2776     !!!cp (198.3);
2777     !!!parse-error (type => 'no SYSTEM literal');
2778     }
2779     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2780 wakaba 1.12 }
2781 wakaba 1.16
2782 wakaba 1.1 !!!next-input-character;
2783 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2784 wakaba 1.1 redo A;
2785     } elsif ($self->{nc} == -1) {
2786 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2787     !!!cp (199);
2788     !!!parse-error (type => 'unclosed DOCTYPE');
2789    
2790     $self->{state} = DATA_STATE;
2791     $self->{s_kwd} = '';
2792     $self->{ct}->{quirks} = 1;
2793     } else {
2794     !!!parse-error (type => 'unclosed md'); ## TODO: type
2795     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2796     }
2797    
2798 wakaba 1.1 ## reconsume
2799 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2800 wakaba 1.1 redo A;
2801 wakaba 1.16 } elsif ($self->{is_xml} and
2802     $self->{ct}->{type} == DOCTYPE_TOKEN and
2803     $self->{nc} == 0x005B) { # [
2804 wakaba 1.12 !!!cp (200.1);
2805     !!!parse-error (type => 'no SYSTEM literal');
2806     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2807     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2808 wakaba 1.13 $self->{in_subset} = 1;
2809 wakaba 1.12 !!!next-input-character;
2810 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2811 wakaba 1.12 redo A;
2812 wakaba 1.1 } else {
2813     !!!parse-error (type => 'string after PUBLIC literal');
2814    
2815 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2816     !!!cp (200);
2817     $self->{ct}->{quirks} = 1;
2818     $self->{state} = BOGUS_DOCTYPE_STATE;
2819     } else {
2820     !!!cp (200.2);
2821     $self->{state} = BOGUS_MD_STATE;
2822     }
2823    
2824 wakaba 1.1 !!!next-input-character;
2825     redo A;
2826     }
2827     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2828     if ($is_space->{$self->{nc}}) {
2829     !!!cp (201);
2830     ## Stay in the state
2831     !!!next-input-character;
2832     redo A;
2833     } elsif ($self->{nc} == 0x0022) { # "
2834     !!!cp (202);
2835     $self->{ct}->{sysid} = ''; # DOCTYPE
2836     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2837     !!!next-input-character;
2838     redo A;
2839     } elsif ($self->{nc} == 0x0027) { # '
2840     !!!cp (203);
2841     $self->{ct}->{sysid} = ''; # DOCTYPE
2842     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2843     !!!next-input-character;
2844     redo A;
2845     } elsif ($self->{nc} == 0x003E) { # >
2846     !!!parse-error (type => 'no SYSTEM literal');
2847     !!!next-input-character;
2848    
2849 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2850     !!!cp (204);
2851     $self->{state} = DATA_STATE;
2852     $self->{s_kwd} = '';
2853     $self->{ct}->{quirks} = 1;
2854     } else {
2855     !!!cp (204.1);
2856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2857     }
2858 wakaba 1.1
2859 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860 wakaba 1.1 redo A;
2861     } elsif ($self->{nc} == -1) {
2862 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2863     !!!cp (205);
2864     !!!parse-error (type => 'unclosed DOCTYPE');
2865     $self->{state} = DATA_STATE;
2866     $self->{s_kwd} = '';
2867     $self->{ct}->{quirks} = 1;
2868     } else {
2869     !!!cp (205.1);
2870     !!!parse-error (type => 'unclosed md'); ## TODO: type
2871     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2872     }
2873    
2874 wakaba 1.1 ## reconsume
2875 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2876 wakaba 1.1 redo A;
2877 wakaba 1.16 } elsif ($self->{is_xml} and
2878     $self->{ct}->{type} == DOCTYPE_TOKEN and
2879     $self->{nc} == 0x005B) { # [
2880 wakaba 1.12 !!!cp (206.1);
2881     !!!parse-error (type => 'no SYSTEM literal');
2882    
2883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2884     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2885 wakaba 1.13 $self->{in_subset} = 1;
2886 wakaba 1.12 !!!next-input-character;
2887 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2888 wakaba 1.12 redo A;
2889 wakaba 1.1 } else {
2890     !!!parse-error (type => 'string after SYSTEM');
2891    
2892 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2893     !!!cp (206);
2894     $self->{ct}->{quirks} = 1;
2895     $self->{state} = BOGUS_DOCTYPE_STATE;
2896     } else {
2897     !!!cp (206.2);
2898     $self->{state} = BOGUS_MD_STATE;
2899     }
2900    
2901 wakaba 1.1 !!!next-input-character;
2902     redo A;
2903     }
2904     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2905     if ($self->{nc} == 0x0022) { # "
2906     !!!cp (207);
2907     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2908     !!!next-input-character;
2909     redo A;
2910 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2911 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2912    
2913 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2914     !!!cp (208);
2915     $self->{state} = DATA_STATE;
2916     $self->{s_kwd} = '';
2917     $self->{ct}->{quirks} = 1;
2918     } else {
2919     !!!cp (208.1);
2920     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2921     }
2922    
2923 wakaba 1.1 !!!next-input-character;
2924 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2925 wakaba 1.1 redo A;
2926     } elsif ($self->{nc} == -1) {
2927     !!!parse-error (type => 'unclosed SYSTEM literal');
2928    
2929 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2930     !!!cp (209);
2931     $self->{state} = DATA_STATE;
2932     $self->{s_kwd} = '';
2933     $self->{ct}->{quirks} = 1;
2934     } else {
2935     !!!cp (209.1);
2936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2937     }
2938    
2939 wakaba 1.1 ## reconsume
2940 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2941 wakaba 1.1 redo A;
2942     } else {
2943     !!!cp (210);
2944 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2945 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2946     length $self->{ct}->{sysid});
2947    
2948     ## Stay in the state
2949     !!!next-input-character;
2950     redo A;
2951     }
2952     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2953     if ($self->{nc} == 0x0027) { # '
2954     !!!cp (211);
2955     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2956     !!!next-input-character;
2957     redo A;
2958 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2959 wakaba 1.1 !!!cp (212);
2960     !!!parse-error (type => 'unclosed SYSTEM literal');
2961    
2962     $self->{state} = DATA_STATE;
2963 wakaba 1.5 $self->{s_kwd} = '';
2964 wakaba 1.1 !!!next-input-character;
2965    
2966     $self->{ct}->{quirks} = 1;
2967     !!!emit ($self->{ct}); # DOCTYPE
2968    
2969     redo A;
2970     } elsif ($self->{nc} == -1) {
2971     !!!parse-error (type => 'unclosed SYSTEM literal');
2972    
2973 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2974     !!!cp (213);
2975     $self->{state} = DATA_STATE;
2976     $self->{s_kwd} = '';
2977     $self->{ct}->{quirks} = 1;
2978     } else {
2979     !!!cp (213.1);
2980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2981     }
2982    
2983 wakaba 1.1 ## reconsume
2984 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2985 wakaba 1.1 redo A;
2986     } else {
2987     !!!cp (214);
2988 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2989 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2990     length $self->{ct}->{sysid});
2991    
2992     ## Stay in the state
2993     !!!next-input-character;
2994     redo A;
2995     }
2996     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2997     if ($is_space->{$self->{nc}}) {
2998 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2999     !!!cp (215.1);
3000     $self->{state} = BEFORE_NDATA_STATE;
3001     } else {
3002     !!!cp (215);
3003     ## Stay in the state
3004     }
3005 wakaba 1.1 !!!next-input-character;
3006     redo A;
3007     } elsif ($self->{nc} == 0x003E) { # >
3008 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3009     !!!cp (216);
3010     $self->{state} = DATA_STATE;
3011     $self->{s_kwd} = '';
3012     } else {
3013     !!!cp (216.1);
3014     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3015     }
3016    
3017 wakaba 1.1 !!!next-input-character;
3018 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3019 wakaba 1.1 redo A;
3020 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3021     ($self->{nc} == 0x004E or # N
3022     $self->{nc} == 0x006E)) { # n
3023     !!!cp (216.2);
3024     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
3025     $self->{state} = NDATA_STATE;
3026     $self->{kwd} = chr $self->{nc};
3027     !!!next-input-character;
3028     redo A;
3029 wakaba 1.1 } elsif ($self->{nc} == -1) {
3030 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3031     !!!cp (217);
3032     !!!parse-error (type => 'unclosed DOCTYPE');
3033     $self->{state} = DATA_STATE;
3034     $self->{s_kwd} = '';
3035     $self->{ct}->{quirks} = 1;
3036     } else {
3037     !!!cp (217.1);
3038     !!!parse-error (type => 'unclosed md'); ## TODO: type
3039     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3040     }
3041    
3042 wakaba 1.1 ## reconsume
3043 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3044 wakaba 1.1 redo A;
3045 wakaba 1.16 } elsif ($self->{is_xml} and
3046     $self->{ct}->{type} == DOCTYPE_TOKEN and
3047     $self->{nc} == 0x005B) { # [
3048 wakaba 1.12 !!!cp (218.1);
3049     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3050     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3051 wakaba 1.13 $self->{in_subset} = 1;
3052 wakaba 1.12 !!!next-input-character;
3053 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
3054 wakaba 1.12 redo A;
3055 wakaba 1.1 } else {
3056     !!!parse-error (type => 'string after SYSTEM literal');
3057    
3058 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3059     !!!cp (218);
3060     #$self->{ct}->{quirks} = 1;
3061     $self->{state} = BOGUS_DOCTYPE_STATE;
3062     } else {
3063     !!!cp (218.2);
3064     $self->{state} = BOGUS_MD_STATE;
3065     }
3066    
3067 wakaba 1.1 !!!next-input-character;
3068     redo A;
3069     }
3070 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
3071     if ($is_space->{$self->{nc}}) {
3072     !!!cp (218.3);
3073     ## Stay in the state.
3074     !!!next-input-character;
3075     redo A;
3076     } elsif ($self->{nc} == 0x003E) { # >
3077     !!!cp (218.4);
3078     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3079     !!!next-input-character;
3080     !!!emit ($self->{ct}); # ENTITY
3081     redo A;
3082     } elsif ($self->{nc} == 0x004E or # N
3083     $self->{nc} == 0x006E) { # n
3084     !!!cp (218.5);
3085     $self->{state} = NDATA_STATE;
3086     $self->{kwd} = chr $self->{nc};
3087     !!!next-input-character;
3088     redo A;
3089     } elsif ($self->{nc} == -1) {
3090     !!!cp (218.6);
3091     !!!parse-error (type => 'unclosed md'); ## TODO: type
3092     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3093     ## reconsume
3094     !!!emit ($self->{ct}); # ENTITY
3095     redo A;
3096     } else {
3097     !!!cp (218.7);
3098     !!!parse-error (type => 'string after SYSTEM literal');
3099     $self->{state} = BOGUS_MD_STATE;
3100     !!!next-input-character;
3101     redo A;
3102     }
3103 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3104     if ($self->{nc} == 0x003E) { # >
3105     !!!cp (219);
3106     $self->{state} = DATA_STATE;
3107 wakaba 1.5 $self->{s_kwd} = '';
3108 wakaba 1.1 !!!next-input-character;
3109    
3110     !!!emit ($self->{ct}); # DOCTYPE
3111    
3112     redo A;
3113 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3114 wakaba 1.13 !!!cp (220.1);
3115     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3116     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3117     $self->{in_subset} = 1;
3118     !!!next-input-character;
3119     !!!emit ($self->{ct}); # DOCTYPE
3120     redo A;
3121 wakaba 1.1 } elsif ($self->{nc} == -1) {
3122     !!!cp (220);
3123     $self->{state} = DATA_STATE;
3124 wakaba 1.5 $self->{s_kwd} = '';
3125 wakaba 1.1 ## reconsume
3126    
3127     !!!emit ($self->{ct}); # DOCTYPE
3128    
3129     redo A;
3130     } else {
3131     !!!cp (221);
3132     my $s = '';
3133 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3134 wakaba 1.1
3135     ## Stay in the state
3136     !!!next-input-character;
3137     redo A;
3138     }
3139     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3140     ## NOTE: "CDATA section state" in the spec is jointly implemented
3141     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3142     ## and |CDATA_SECTION_MSE2_STATE|.
3143 wakaba 1.10
3144     ## XML5: "CDATA state".
3145 wakaba 1.1
3146     if ($self->{nc} == 0x005D) { # ]
3147     !!!cp (221.1);
3148     $self->{state} = CDATA_SECTION_MSE1_STATE;
3149     !!!next-input-character;
3150     redo A;
3151     } elsif ($self->{nc} == -1) {
3152 wakaba 1.6 if ($self->{is_xml}) {
3153 wakaba 1.8 !!!cp (221.11);
3154 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3155 wakaba 1.8 } else {
3156     !!!cp (221.12);
3157 wakaba 1.6 }
3158    
3159 wakaba 1.1 $self->{state} = DATA_STATE;
3160 wakaba 1.5 $self->{s_kwd} = '';
3161 wakaba 1.10 ## Reconsume.
3162 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3163     !!!cp (221.2);
3164     !!!emit ($self->{ct}); # character
3165     } else {
3166     !!!cp (221.3);
3167     ## No token to emit. $self->{ct} is discarded.
3168     }
3169     redo A;
3170     } else {
3171     !!!cp (221.4);
3172     $self->{ct}->{data} .= chr $self->{nc};
3173     $self->{read_until}->($self->{ct}->{data},
3174     q<]>,
3175     length $self->{ct}->{data});
3176    
3177     ## Stay in the state.
3178     !!!next-input-character;
3179     redo A;
3180     }
3181    
3182     ## ISSUE: "text tokens" in spec.
3183     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3184 wakaba 1.10 ## XML5: "CDATA bracket state".
3185    
3186 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3187     !!!cp (221.5);
3188     $self->{state} = CDATA_SECTION_MSE2_STATE;
3189     !!!next-input-character;
3190     redo A;
3191     } else {
3192     !!!cp (221.6);
3193 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3194 wakaba 1.1 $self->{ct}->{data} .= ']';
3195 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3196 wakaba 1.1 ## Reconsume.
3197     redo A;
3198     }
3199     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3200 wakaba 1.10 ## XML5: "CDATA end state".
3201    
3202 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3203     $self->{state} = DATA_STATE;
3204 wakaba 1.5 $self->{s_kwd} = '';
3205 wakaba 1.1 !!!next-input-character;
3206     if (length $self->{ct}->{data}) { # character
3207     !!!cp (221.7);
3208     !!!emit ($self->{ct}); # character
3209     } else {
3210     !!!cp (221.8);
3211     ## No token to emit. $self->{ct} is discarded.
3212     }
3213     redo A;
3214     } elsif ($self->{nc} == 0x005D) { # ]
3215     !!!cp (221.9); # character
3216     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3217     ## Stay in the state.
3218     !!!next-input-character;
3219     redo A;
3220     } else {
3221     !!!cp (221.11);
3222     $self->{ct}->{data} .= ']]'; # character
3223     $self->{state} = CDATA_SECTION_STATE;
3224 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3225 wakaba 1.1 redo A;
3226     }
3227     } elsif ($self->{state} == ENTITY_STATE) {
3228     if ($is_space->{$self->{nc}} or
3229     {
3230     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3231     $self->{entity_add} => 1,
3232     }->{$self->{nc}}) {
3233 wakaba 1.22 if ($self->{is_xml}) {
3234     !!!cp (1001.1);
3235     !!!parse-error (type => 'bare ero',
3236     line => $self->{line_prev},
3237     column => $self->{column_prev}
3238     + ($self->{nc} == -1 ? 1 : 0));
3239     } else {
3240     !!!cp (1001);
3241     ## No error
3242     }
3243 wakaba 1.1 ## Don't consume
3244     ## Return nothing.
3245     #
3246     } elsif ($self->{nc} == 0x0023) { # #
3247     !!!cp (999);
3248     $self->{state} = ENTITY_HASH_STATE;
3249 wakaba 1.12 $self->{kwd} = '#';
3250 wakaba 1.1 !!!next-input-character;
3251     redo A;
3252 wakaba 1.22 } elsif ($self->{is_xml} or
3253     (0x0041 <= $self->{nc} and
3254 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3255     (0x0061 <= $self->{nc} and
3256     $self->{nc} <= 0x007A)) { # a..z
3257     !!!cp (998);
3258     require Whatpm::_NamedEntityList;
3259     $self->{state} = ENTITY_NAME_STATE;
3260 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3261     $self->{entity__value} = $self->{kwd};
3262 wakaba 1.1 $self->{entity__match} = 0;
3263     !!!next-input-character;
3264     redo A;
3265     } else {
3266     !!!cp (1027);
3267     !!!parse-error (type => 'bare ero');
3268     ## Return nothing.
3269     #
3270     }
3271    
3272     ## NOTE: No character is consumed by the "consume a character
3273     ## reference" algorithm. In other words, there is an "&" character
3274     ## that does not introduce a character reference, which would be
3275     ## appended to the parent element or the attribute value in later
3276     ## process of the tokenizer.
3277    
3278     if ($self->{prev_state} == DATA_STATE) {
3279     !!!cp (997);
3280     $self->{state} = $self->{prev_state};
3281 wakaba 1.5 $self->{s_kwd} = '';
3282 wakaba 1.1 ## Reconsume.
3283     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3284     line => $self->{line_prev},
3285     column => $self->{column_prev},
3286     });
3287     redo A;
3288     } else {
3289     !!!cp (996);
3290     $self->{ca}->{value} .= '&';
3291     $self->{state} = $self->{prev_state};
3292 wakaba 1.5 $self->{s_kwd} = '';
3293 wakaba 1.1 ## Reconsume.
3294     redo A;
3295     }
3296     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3297 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3298 wakaba 1.1 !!!cp (995);
3299     $self->{state} = HEXREF_X_STATE;
3300 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3301 wakaba 1.1 !!!next-input-character;
3302     redo A;
3303 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3304     !!!cp (995.1);
3305     if ($self->{is_xml}) {
3306     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3307     }
3308     $self->{state} = HEXREF_X_STATE;
3309     $self->{kwd} .= chr $self->{nc};
3310     !!!next-input-character;
3311     redo A;
3312 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3313     $self->{nc} <= 0x0039) { # 0..9
3314     !!!cp (994);
3315     $self->{state} = NCR_NUM_STATE;
3316 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3317 wakaba 1.1 !!!next-input-character;
3318     redo A;
3319     } else {
3320     !!!parse-error (type => 'bare nero',
3321     line => $self->{line_prev},
3322     column => $self->{column_prev} - 1);
3323    
3324     ## NOTE: According to the spec algorithm, nothing is returned,
3325     ## and then "&#" is appended to the parent element or the attribute
3326     ## value in the later processing.
3327    
3328     if ($self->{prev_state} == DATA_STATE) {
3329     !!!cp (1019);
3330     $self->{state} = $self->{prev_state};
3331 wakaba 1.5 $self->{s_kwd} = '';
3332 wakaba 1.1 ## Reconsume.
3333     !!!emit ({type => CHARACTER_TOKEN,
3334     data => '&#',
3335     line => $self->{line_prev},
3336     column => $self->{column_prev} - 1,
3337     });
3338     redo A;
3339     } else {
3340     !!!cp (993);
3341     $self->{ca}->{value} .= '&#';
3342     $self->{state} = $self->{prev_state};
3343 wakaba 1.5 $self->{s_kwd} = '';
3344 wakaba 1.1 ## Reconsume.
3345     redo A;
3346     }
3347     }
3348     } elsif ($self->{state} == NCR_NUM_STATE) {
3349     if (0x0030 <= $self->{nc} and
3350     $self->{nc} <= 0x0039) { # 0..9
3351     !!!cp (1012);
3352 wakaba 1.12 $self->{kwd} *= 10;
3353     $self->{kwd} += $self->{nc} - 0x0030;
3354 wakaba 1.1
3355     ## Stay in the state.
3356     !!!next-input-character;
3357     redo A;
3358     } elsif ($self->{nc} == 0x003B) { # ;
3359     !!!cp (1013);
3360     !!!next-input-character;
3361     #
3362     } else {
3363     !!!cp (1014);
3364     !!!parse-error (type => 'no refc');
3365     ## Reconsume.
3366     #
3367     }
3368    
3369 wakaba 1.12 my $code = $self->{kwd};
3370 wakaba 1.1 my $l = $self->{line_prev};
3371     my $c = $self->{column_prev};
3372 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3373     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3374     ($self->{is_xml} and $code == 0x0000)) {
3375 wakaba 1.1 !!!cp (1015);
3376     !!!parse-error (type => 'invalid character reference',
3377     text => (sprintf 'U+%04X', $code),
3378     line => $l, column => $c);
3379     $code = $charref_map->{$code};
3380     } elsif ($code > 0x10FFFF) {
3381     !!!cp (1016);
3382     !!!parse-error (type => 'invalid character reference',
3383     text => (sprintf 'U-%08X', $code),
3384     line => $l, column => $c);
3385     $code = 0xFFFD;
3386     }
3387    
3388     if ($self->{prev_state} == DATA_STATE) {
3389     !!!cp (992);
3390     $self->{state} = $self->{prev_state};
3391 wakaba 1.5 $self->{s_kwd} = '';
3392 wakaba 1.1 ## Reconsume.
3393     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3394 wakaba 1.7 has_reference => 1,
3395 wakaba 1.1 line => $l, column => $c,
3396     });
3397     redo A;
3398     } else {
3399     !!!cp (991);
3400     $self->{ca}->{value} .= chr $code;
3401     $self->{ca}->{has_reference} = 1;
3402     $self->{state} = $self->{prev_state};
3403 wakaba 1.5 $self->{s_kwd} = '';
3404 wakaba 1.1 ## Reconsume.
3405     redo A;
3406     }
3407     } elsif ($self->{state} == HEXREF_X_STATE) {
3408     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3409     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3410     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3411     # 0..9, A..F, a..f
3412     !!!cp (990);
3413     $self->{state} = HEXREF_HEX_STATE;
3414 wakaba 1.12 $self->{kwd} = 0;
3415 wakaba 1.1 ## Reconsume.
3416     redo A;
3417     } else {
3418     !!!parse-error (type => 'bare hcro',
3419     line => $self->{line_prev},
3420     column => $self->{column_prev} - 2);
3421    
3422     ## NOTE: According to the spec algorithm, nothing is returned,
3423     ## and then "&#" followed by "X" or "x" is appended to the parent
3424     ## element or the attribute value in the later processing.
3425    
3426     if ($self->{prev_state} == DATA_STATE) {
3427     !!!cp (1005);
3428     $self->{state} = $self->{prev_state};
3429 wakaba 1.5 $self->{s_kwd} = '';
3430 wakaba 1.1 ## Reconsume.
3431     !!!emit ({type => CHARACTER_TOKEN,
3432 wakaba 1.12 data => '&' . $self->{kwd},
3433 wakaba 1.1 line => $self->{line_prev},
3434 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3435 wakaba 1.1 });
3436     redo A;
3437     } else {
3438     !!!cp (989);
3439 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3440 wakaba 1.1 $self->{state} = $self->{prev_state};
3441 wakaba 1.5 $self->{s_kwd} = '';
3442 wakaba 1.1 ## Reconsume.
3443     redo A;
3444     }
3445     }
3446     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3447     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3448     # 0..9
3449     !!!cp (1002);
3450 wakaba 1.12 $self->{kwd} *= 0x10;
3451     $self->{kwd} += $self->{nc} - 0x0030;
3452 wakaba 1.1 ## Stay in the state.
3453     !!!next-input-character;
3454     redo A;
3455     } elsif (0x0061 <= $self->{nc} and
3456     $self->{nc} <= 0x0066) { # a..f
3457     !!!cp (1003);
3458 wakaba 1.12 $self->{kwd} *= 0x10;
3459     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3460 wakaba 1.1 ## Stay in the state.
3461     !!!next-input-character;
3462     redo A;
3463     } elsif (0x0041 <= $self->{nc} and
3464     $self->{nc} <= 0x0046) { # A..F
3465     !!!cp (1004);
3466 wakaba 1.12 $self->{kwd} *= 0x10;
3467     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3468 wakaba 1.1 ## Stay in the state.
3469     !!!next-input-character;
3470     redo A;
3471     } elsif ($self->{nc} == 0x003B) { # ;
3472     !!!cp (1006);
3473     !!!next-input-character;
3474     #
3475     } else {
3476     !!!cp (1007);
3477     !!!parse-error (type => 'no refc',
3478     line => $self->{line},
3479     column => $self->{column});
3480     ## Reconsume.
3481     #
3482     }
3483    
3484 wakaba 1.12 my $code = $self->{kwd};
3485 wakaba 1.1 my $l = $self->{line_prev};
3486     my $c = $self->{column_prev};
3487 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3488     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3489     ($self->{is_xml} and $code == 0x0000)) {
3490 wakaba 1.1 !!!cp (1008);
3491     !!!parse-error (type => 'invalid character reference',
3492     text => (sprintf 'U+%04X', $code),
3493     line => $l, column => $c);
3494     $code = $charref_map->{$code};
3495     } elsif ($code > 0x10FFFF) {
3496     !!!cp (1009);
3497     !!!parse-error (type => 'invalid character reference',
3498     text => (sprintf 'U-%08X', $code),
3499     line => $l, column => $c);
3500     $code = 0xFFFD;
3501     }
3502    
3503     if ($self->{prev_state} == DATA_STATE) {
3504     !!!cp (988);
3505     $self->{state} = $self->{prev_state};
3506 wakaba 1.5 $self->{s_kwd} = '';
3507 wakaba 1.1 ## Reconsume.
3508     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3509 wakaba 1.7 has_reference => 1,
3510 wakaba 1.1 line => $l, column => $c,
3511     });
3512     redo A;
3513     } else {
3514     !!!cp (987);
3515     $self->{ca}->{value} .= chr $code;
3516     $self->{ca}->{has_reference} = 1;
3517     $self->{state} = $self->{prev_state};
3518 wakaba 1.5 $self->{s_kwd} = '';
3519 wakaba 1.1 ## Reconsume.
3520     redo A;
3521     }
3522     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3523 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3524     $self->{nc} <= 0x005A) or # Z
3525     (0x0061 <= $self->{nc} and # a
3526     $self->{nc} <= 0x007A) or # z
3527     (0x0030 <= $self->{nc} and # 0
3528     $self->{nc} <= 0x0039) or # 9
3529 wakaba 1.22 $self->{nc} == 0x003B or # ;
3530     ($self->{is_xml} and
3531     not ($is_space->{$self->{nc}} or
3532     {
3533     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3534     $self->{entity_add} => 1,
3535     }->{$self->{nc}}))) {
3536 wakaba 1.1 our $EntityChar;
3537 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3538 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3539     $self->{ge}->{$self->{kwd}}) {
3540 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3541 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3542     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3543     !!!cp (1020.1);
3544     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3545     } else {
3546     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3547     !!!cp (1020.2);
3548     !!!parse-error (type => 'unparsed entity', ## TODO: type
3549     value => $self->{kwd});
3550     } else {
3551     !!!cp (1020.3);
3552     }
3553     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3554     }
3555     } else {
3556     if ($self->{is_xml}) {
3557     !!!cp (1020.4);
3558     !!!parse-error (type => 'entity not declared', ## TODO: type
3559     value => $self->{kwd},
3560     level => {
3561     'amp;' => $self->{level}->{warn},
3562     'quot;' => $self->{level}->{warn},
3563     'lt;' => $self->{level}->{warn},
3564     'gt;' => $self->{level}->{warn},
3565     'apos;' => $self->{level}->{warn},
3566     }->{$self->{kwd}} ||
3567     $self->{level}->{must});
3568     } else {
3569     !!!cp (1020);
3570     }
3571     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3572     }
3573 wakaba 1.1 $self->{entity__match} = 1;
3574     !!!next-input-character;
3575     #
3576     } else {
3577     !!!cp (1021);
3578 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3579 wakaba 1.1 $self->{entity__match} = -1;
3580     ## Stay in the state.
3581     !!!next-input-character;
3582     redo A;
3583     }
3584     } else {
3585     !!!cp (1022);
3586     $self->{entity__value} .= chr $self->{nc};
3587     $self->{entity__match} *= 2;
3588     ## Stay in the state.
3589     !!!next-input-character;
3590     redo A;
3591     }
3592     }
3593    
3594     my $data;
3595     my $has_ref;
3596     if ($self->{entity__match} > 0) {
3597     !!!cp (1023);
3598     $data = $self->{entity__value};
3599     $has_ref = 1;
3600     #
3601     } elsif ($self->{entity__match} < 0) {
3602     !!!parse-error (type => 'no refc');
3603     if ($self->{prev_state} != DATA_STATE and # in attribute
3604     $self->{entity__match} < -1) {
3605     !!!cp (1024);
3606 wakaba 1.12 $data = '&' . $self->{kwd};
3607 wakaba 1.1 #
3608     } else {
3609     !!!cp (1025);
3610     $data = $self->{entity__value};
3611     $has_ref = 1;
3612     #
3613     }
3614     } else {
3615     !!!cp (1026);
3616     !!!parse-error (type => 'bare ero',
3617     line => $self->{line_prev},
3618 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3619     $data = '&' . $self->{kwd};
3620 wakaba 1.1 #
3621     }
3622    
3623     ## NOTE: In these cases, when a character reference is found,
3624     ## it is consumed and a character token is returned, or, otherwise,
3625     ## nothing is consumed and returned, according to the spec algorithm.
3626     ## In this implementation, anything that has been examined by the
3627     ## tokenizer is appended to the parent element or the attribute value
3628     ## as string, either literal string when no character reference or
3629     ## entity-replaced string otherwise, in this stage, since any characters
3630     ## that would not be consumed are appended in the data state or in an
3631     ## appropriate attribute value state anyway.
3632    
3633     if ($self->{prev_state} == DATA_STATE) {
3634     !!!cp (986);
3635     $self->{state} = $self->{prev_state};
3636 wakaba 1.5 $self->{s_kwd} = '';
3637 wakaba 1.1 ## Reconsume.
3638     !!!emit ({type => CHARACTER_TOKEN,
3639     data => $data,
3640 wakaba 1.7 has_reference => $has_ref,
3641 wakaba 1.1 line => $self->{line_prev},
3642 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3643 wakaba 1.1 });
3644     redo A;
3645     } else {
3646     !!!cp (985);
3647     $self->{ca}->{value} .= $data;
3648     $self->{ca}->{has_reference} = 1 if $has_ref;
3649     $self->{state} = $self->{prev_state};
3650 wakaba 1.5 $self->{s_kwd} = '';
3651 wakaba 1.1 ## Reconsume.
3652     redo A;
3653     }
3654 wakaba 1.8
3655     ## XML-only states
3656    
3657     } elsif ($self->{state} == PI_STATE) {
         ## Looks at the first character of a would-be processing
         ## instruction.  In this excerpt the state is entered from
         ## DOCTYPE_TAG_STATE on "?"; other entry points, if any, are outside
         ## this excerpt.  The |!!!...| lines are preprocessor macros whose
         ## expansions are not visible here.
3658 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3659    
3660 wakaba 1.8 if ($is_space->{$self->{nc}} or
3661 wakaba 1.14 $self->{nc} == 0x003F or # ?
3662 wakaba 1.8 $self->{nc} == -1) {
         ## No usable target (space, "?" or EOF, where |nc| == -1): report
         ## 'bare pio' and continue as a bogus comment whose data starts
         ## with "?".  The reported column is |column_prev - 1| unless at
         ## EOF, where it is |column_prev|.
3663 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3664     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3665     ## "DOCTYPE pi state": Parse error, switch to the "data
3666     ## state".
3667 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3668     line => $self->{line_prev},
3669     column => $self->{column_prev}
3670     - 1 * ($self->{nc} != -1));
3671     $self->{state} = BOGUS_COMMENT_STATE;
3672     ## Reconsume.
3673     $self->{ct} = {type => COMMENT_TOKEN,
3674     data => '?',
3675     line => $self->{line_prev},
3676     column => $self->{column_prev}
3677     - 1 * ($self->{nc} != -1),
3678     };
3679     redo A;
3680     } else {
         ## Any other character becomes the first character of the target of
         ## a new PI token kept in |$self->{ct}|.
3681 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3682 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3683     target => chr $self->{nc},
3684     data => '',
3685     line => $self->{line_prev},
3686     column => $self->{column_prev} - 1,
3687     };
3688     $self->{state} = PI_TARGET_STATE;
3689     !!!next-input-character;
3690     redo A;
3691     }
3692     } elsif ($self->{state} == PI_TARGET_STATE) {
         ## Accumulates the PI target in |$self->{ct}->{target}| until a space,
         ## a "?" or EOF (|nc| == -1) is seen.
3693     if ($is_space->{$self->{nc}}) {
3694     $self->{state} = PI_TARGET_AFTER_STATE;
3695     !!!next-input-character;
3696     redo A;
3697     } elsif ($self->{nc} == -1) {
         ## EOF inside the target: report 'no pic', emit the PI token as it
         ## stands, and return to the internal subset state when
         ## |in_subset| is set, otherwise to the data state.
3698     !!!parse-error (type => 'no pic'); ## TODO: type
3699 wakaba 1.13 if ($self->{in_subset}) {
3700     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3701     } else {
3702     $self->{state} = DATA_STATE;
3703     $self->{s_kwd} = '';
3704     }
3705 wakaba 1.8 ## Reconsume.
3706     !!!emit ($self->{ct}); # pi
3707     redo A;
3708     } elsif ($self->{nc} == 0x003F) { # ?
         ## "?" directly after the target, with no space and no data yet.
3709     $self->{state} = PI_AFTER_STATE;
3710     !!!next-input-character;
3711     redo A;
3712     } else {
3713     ## XML5: typo ("tag name" -> "target")
3714     $self->{ct}->{target} .= chr $self->{nc}; # pi
3715     !!!next-input-character;
3716     redo A;
3717     }
3718     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
         ## Skips the white space that follows the PI target; the first
         ## non-space character is handed to PI_DATA_STATE without being
         ## consumed here.
3719     if ($is_space->{$self->{nc}}) {
3720     ## Stay in the state.
3721     !!!next-input-character;
3722     redo A;
3723     } else {
3724     $self->{state} = PI_DATA_STATE;
3725     ## Reprocess.
3726     redo A;
3727     }
3728     } elsif ($self->{state} == PI_DATA_STATE) {
         ## Accumulates PI data in |$self->{ct}->{data}| until a "?" (possible
         ## start of the "?>" terminator) or EOF.
3729     if ($self->{nc} == 0x003F) { # ?
         ## The "?" is not appended here; PI_DATA_AFTER_STATE appends it if
         ## it turns out not to be followed by ">".
3730     $self->{state} = PI_DATA_AFTER_STATE;
3731     !!!next-input-character;
3732     redo A;
3733     } elsif ($self->{nc} == -1) {
         ## EOF inside the data: report 'no pic' and emit the PI token as it
         ## stands.
3734     !!!parse-error (type => 'no pic'); ## TODO: type
3735 wakaba 1.13 if ($self->{in_subset}) {
3736 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3737 wakaba 1.13 } else {
3738     $self->{state} = DATA_STATE;
3739     $self->{s_kwd} = '';
3740     }
3741 wakaba 1.8 ## Reprocess.
3742     !!!emit ($self->{ct}); # pi
3743     redo A;
3744     } else {
3745     $self->{ct}->{data} .= chr $self->{nc}; # pi
         ## NOTE(review): |read_until| is defined outside this excerpt;
         ## presumably it appends the following run of characters up to
         ## the next "?" to the data in one call -- confirm.
3746     $self->{read_until}->($self->{ct}->{data}, q[?],
3747     length $self->{ct}->{data});
3748     ## Stay in the state.
3749     !!!next-input-character;
3750     ## Reprocess.
3751     redo A;
3752     }
3753     } elsif ($self->{state} == PI_AFTER_STATE) {
         ## Entered from PI_TARGET_STATE when a "?" immediately follows the
         ## target.  ">" completes the PI; anything else means the "?" was
         ## the start of data that had no separating space.
3754 wakaba 1.14 ## XML5: Part of "Pi after state".
3755    
3756 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
         ## "<?target?>": emit the PI token and return to the internal
         ## subset state when |in_subset| is set, otherwise to the data
         ## state.
3757 wakaba 1.13 if ($self->{in_subset}) {
3758     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3759     } else {
3760     $self->{state} = DATA_STATE;
3761     $self->{s_kwd} = '';
3762     }
3763 wakaba 1.8 !!!next-input-character;
3764     !!!emit ($self->{ct}); # pi
3765     redo A;
3766     } elsif ($self->{nc} == 0x003F) { # ?
         ## "??": the first "?" becomes data and the second one is treated
         ## as a new candidate terminator.
3767     !!!parse-error (type => 'no s after target', ## TODO: type
3768     line => $self->{line_prev},
3769     column => $self->{column_prev}); ## XML5: no error
3770     $self->{ct}->{data} .= '?';
3771     $self->{state} = PI_DATA_AFTER_STATE;
3772     !!!next-input-character;
3773     redo A;
3774     } else {
         ## Any other character (including EOF): the "?" becomes data and
         ## the current character is reprocessed in PI_DATA_STATE.  The
         ## reported column is one greater at EOF.
3775     !!!parse-error (type => 'no s after target', ## TODO: type
3776     line => $self->{line_prev},
3777     column => $self->{column_prev}
3778     + 1 * ($self->{nc} == -1)); ## XML5: no error
3779     $self->{ct}->{data} .= '?'; ## XML5: not appended
3780     $self->{state} = PI_DATA_STATE;
3781     ## Reprocess.
3782     redo A;
3783     }
3784     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
         ## Entered after a "?" seen in PI data (or from PI_AFTER_STATE on
         ## "??").  The pending "?" has not been added to the data yet.
3785 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3786    
3787 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
         ## "?>": the PI is complete; the pending "?" is dropped, the token
         ## is emitted, and the tokenizer returns to the internal subset
         ## state when |in_subset| is set, otherwise to the data state.
3788 wakaba 1.13 if ($self->{in_subset}) {
3789     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3790     } else {
3791     $self->{state} = DATA_STATE;
3792     $self->{s_kwd} = '';
3793     }
3794 wakaba 1.8 !!!next-input-character;
3795     !!!emit ($self->{ct}); # pi
3796     redo A;
3797     } elsif ($self->{nc} == 0x003F) { # ?
         ## Another "?": the pending one becomes data, the new one is now
         ## pending.
3798     $self->{ct}->{data} .= '?';
3799     ## Stay in the state.
3800     !!!next-input-character;
3801     redo A;
3802     } else {
         ## Not a terminator: the pending "?" becomes data and the current
         ## character is reprocessed in PI_DATA_STATE.
3803     $self->{ct}->{data} .= '?'; ## XML5: not appended
3804     $self->{state} = PI_DATA_STATE;
3805     ## Reprocess.
3806     redo A;
3807     }
3808 wakaba 1.12
3809     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
         ## Top level of the DOCTYPE internal subset: dispatches on "<", "%",
         ## "]", white space and EOF; anything else is stray text.
3810     if ($self->{nc} == 0x003C) { # <
3811 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3812 wakaba 1.12 !!!next-input-character;
3813     redo A;
3814     } elsif ($self->{nc} == 0x0025) { # %
3815     ## XML5: Not defined yet.
3816    
3817     ## TODO:
3818 wakaba 1.24
         ## The "%" itself is only skipped.  The first time it is seen in a
         ## document for which |xml_standalone| is false, an info-level
         ## 'stop processing' error is reported and |stop_processing| is
         ## set.
3819     if (not $self->{stop_processing} and
3820     not $self->{document}->xml_standalone) {
3821     !!!parse-error (type => 'stop processing', ## TODO: type
3822     level => $self->{level}->{info});
3823     $self->{stop_processing} = 1;
3824     }
3825    
3826 wakaba 1.12 !!!next-input-character;
3827     redo A;
3828     } elsif ($self->{nc} == 0x005D) { # ]
         ## End of the internal subset; the closing ">" is handled by
         ## DOCTYPE_INTERNAL_SUBSET_AFTER_STATE.
3829 wakaba 1.13 delete $self->{in_subset};
3830 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3831     !!!next-input-character;
3832     redo A;
3833     } elsif ($is_space->{$self->{nc}}) {
3834     ## Stay in the state.
3835     !!!next-input-character;
3836     redo A;
3837     } elsif ($self->{nc} == -1) {
         ## EOF inside the subset: report it, leave the subset and emit the
         ## end-of-DOCTYPE token; EOF itself is reprocessed in the data
         ## state.
3838     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3839 wakaba 1.13 delete $self->{in_subset};
3840 wakaba 1.12 $self->{state} = DATA_STATE;
3841     $self->{s_kwd} = '';
3842     ## Reconsume.
3843 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3844 wakaba 1.12 redo A;
3845     } else {
         ## Stray text: reported only once per tokenizer, guarded by
         ## |internal_subset_tainted|, then skipped.
3846     unless ($self->{internal_subset_tainted}) {
3847     ## XML5: No parse error.
3848     !!!parse-error (type => 'string in internal subset');
3849     $self->{internal_subset_tainted} = 1;
3850     }
3851     ## Stay in the state.
3852     !!!next-input-character;
3853     redo A;
3854     }
3855     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
         ## After the "]" that closes the internal subset: expects the ">"
         ## that closes the DOCTYPE.  Every branch that leaves for the data
         ## state emits END_OF_DOCTYPE_TOKEN.
3856     if ($self->{nc} == 0x003E) { # >
3857     $self->{state} = DATA_STATE;
3858     $self->{s_kwd} = '';
3859     !!!next-input-character;
3860 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3861 wakaba 1.12 redo A;
3862     } elsif ($self->{nc} == -1) {
3863     !!!parse-error (type => 'unclosed DOCTYPE');
3864     $self->{state} = DATA_STATE;
3865     $self->{s_kwd} = '';
3866     ## Reconsume.
3867 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3868 wakaba 1.12 redo A;
3869     } else {
         ## Anything else is reported once here; the bogus state then skips
         ## silently up to ">" or EOF.
3870     ## XML5: No parse error and stay in the state.
3871     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3872    
3873 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3874     !!!next-input-character;
3875     redo A;
3876     }
3877     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
         ## Skips garbage between "]" and the closing ">" of the DOCTYPE.  No
         ## parse error is reported in this state (the one visible entry
         ## point, DOCTYPE_INTERNAL_SUBSET_AFTER_STATE, reports before
         ## switching here); ">" and EOF both emit END_OF_DOCTYPE_TOKEN and
         ## go to the data state.
3878     if ($self->{nc} == 0x003E) { # >
3879     $self->{state} = DATA_STATE;
3880     $self->{s_kwd} = '';
3881     !!!next-input-character;
3882     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3883     redo A;
3884     } elsif ($self->{nc} == -1) {
3885     $self->{state} = DATA_STATE;
3886     $self->{s_kwd} = '';
3887     ## Reconsume.
3888     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3889     redo A;
3890     } else {
3891     ## Stay in the state.
3892     !!!next-input-character;
3893     redo A;
3894     }
3895     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
         ## After "<" inside the internal subset: "!" starts a markup
         ## declaration or comment, "?" starts a processing instruction,
         ## anything else is an error.
3896     if ($self->{nc} == 0x0021) { # !
3897 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3898 wakaba 1.13 !!!next-input-character;
3899     redo A;
3900     } elsif ($self->{nc} == 0x003F) { # ?
3901     $self->{state} = PI_STATE;
3902     !!!next-input-character;
3903     redo A;
3904     } elsif ($self->{nc} == -1) {
         ## NOTE(review): unlike the EOF branch of
         ## DOCTYPE_INTERNAL_SUBSET_STATE, this branch goes straight to the
         ## data state without deleting |in_subset| and without emitting
         ## END_OF_DOCTYPE_TOKEN -- confirm that this is intended.
3905     !!!parse-error (type => 'bare stago');
3906     $self->{state} = DATA_STATE;
3907     $self->{s_kwd} = '';
3908     ## Reconsume.
3909     redo A;
3910     } else {
         ## Any other character: report 'bare stago' at the previous
         ## character's position, consume the current character and
         ## continue in the bogus comment state with an empty comment
         ## token.
3911     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3912     line => $self->{line_prev},
3913     column => $self->{column_prev});
3914     $self->{state} = BOGUS_COMMENT_STATE;
3915     $self->{ct} = {type => COMMENT_TOKEN,
3916     data => '',
3917     }; ## NOTE: Will be discarded.
3918 wakaba 1.12 !!!next-input-character;
3919     redo A;
3920     }
3921 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
         ## After "<!" inside the internal subset: dispatches on the first
         ## character of the declaration.  For a keyword letter (either
         ## case) the character as typed is stored in |$self->{kwd}| so the
         ## keyword states can check spelling and case later.
3922     ## XML5: "DOCTYPE markup declaration state".
3923    
3924     if ($self->{nc} == 0x002D) { # -
3925     $self->{state} = MD_HYPHEN_STATE;
3926     !!!next-input-character;
3927     redo A;
3928 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3929     $self->{nc} == 0x0065) { # e
         ## "E" may start either ENTITY or ELEMENT; MD_E_STATE decides.
3930 wakaba 1.14 $self->{state} = MD_E_STATE;
3931     $self->{kwd} = chr $self->{nc};
3932     !!!next-input-character;
3933     redo A;
3934 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3935     $self->{nc} == 0x0061) { # a
3936 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3937     $self->{kwd} = chr $self->{nc};
3938     !!!next-input-character;
3939     redo A;
3940 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3941     $self->{nc} == 0x006E) { # n
3942 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3943     $self->{kwd} = chr $self->{nc};
3944     !!!next-input-character;
3945     redo A;
3946     } else {
3947     #
3948     }
3949    
         ## Reached only through the empty |else| above (every other branch
         ## ends in |redo A|): the declaration is treated as a bogus comment
         ## and the current character is reprocessed in that state.
3950     ## XML5: No parse error.
3951     !!!parse-error (type => 'bogus comment',
3952     line => $self->{line_prev},
3953     column => $self->{column_prev} - 1);
3954     ## Reconsume.
3955     $self->{state} = BOGUS_COMMENT_STATE;
3956     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3957     redo A;
3958     } elsif ($self->{state} == MD_E_STATE) {
         ## After "<!E" (either case): the second letter selects between
         ## ENTITY ("N"/"n") and ELEMENT ("L"/"l").  The letter as typed is
         ## appended to |$self->{kwd}|.
3959 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3960     $self->{nc} == 0x006E) { # n
3961 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3962     $self->{kwd} .= chr $self->{nc};
3963     !!!next-input-character;
3964     redo A;
3965 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3966     $self->{nc} == 0x006C) { # l
3967 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3968     $self->{state} = MD_ELEMENT_STATE;
3969     $self->{kwd} .= chr $self->{nc};
3970     !!!next-input-character;
3971     redo A;
3972     } else {
         ## Neither keyword: treat the declaration as a bogus comment and
         ## reprocess the current character there.  The reported column is
         ## |column_prev - 2|, or |column_prev - 1| at EOF.
3973     ## XML5: No parse error.
3974     !!!parse-error (type => 'bogus comment',
3975     line => $self->{line_prev},
3976     column => $self->{column_prev} - 2
3977     + 1 * ($self->{nc} == -1));
3978     ## Reconsume.
3979     $self->{state} = BOGUS_COMMENT_STATE;
3980     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3981     redo A;
3982     }
3983     } elsif ($self->{state} == MD_ENTITY_STATE) {
         ## Matches the rest of the keyword ENTITY case-insensitively.  The
         ## two tables below are indexed by the number of keyword characters
         ## already collected in |$self->{kwd}| and give the next expected
         ## letter in upper and lower case.  The first two slots are |undef|;
         ## the visible entry point (MD_E_STATE) arrives with two characters
         ## already in |kwd|.
3984 wakaba 1.17 if ($self->{nc} == [
3985     undef,
3986     undef,
3987     0x0054, # T
3988     0x0049, # I
3989     0x0054, # T
3990     ]->[length $self->{kwd}] or
3991     $self->{nc} == [
3992     undef,
3993     undef,
3994     0x0074, # t
3995     0x0069, # i
3996     0x0074, # t
3997     ]->[length $self->{kwd}]) {
3998 wakaba 1.14 ## Stay in the state.
3999     $self->{kwd} .= chr $self->{nc};
4000     !!!next-input-character;
4001     redo A;
4002 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
4003     ($self->{nc} == 0x0059 or # Y
4004     $self->{nc} == 0x0079)) { # y
         ## Final letter.  Unless the keyword was typed entirely in upper
         ## case, report 'lowercase keyword'.  Then start a general entity
         ## token with an empty name; a later state may retype it as a
         ## parameter entity.
4005     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
4006     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4007     text => 'ENTITY',
4008     line => $self->{line_prev},
4009     column => $self->{column_prev} - 4);
4010     }
4011     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
4012 wakaba 1.14 line => $self->{line_prev},
4013     column => $self->{column_prev} - 6};
4014     $self->{state} = DOCTYPE_MD_STATE;
4015     !!!next-input-character;
4016     redo A;
4017     } else {
         ## Keyword mismatch: treat the declaration as a bogus comment and
         ## reprocess the current character there.  The reported column
         ## steps back over the keyword characters collected so far, with
         ## a one-column adjustment at EOF.
4018     !!!parse-error (type => 'bogus comment',
4019     line => $self->{line_prev},
4020     column => $self->{column_prev} - 1
4021     - (length $self->{kwd})
4022     + 1 * ($self->{nc} == -1));
4023     $self->{state} = BOGUS_COMMENT_STATE;
4024     ## Reconsume.
4025     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4026     redo A;
4027     }
4028     } elsif ($self->{state} == MD_ELEMENT_STATE) {
         ## Matches the rest of the keyword ELEMENT case-insensitively.  The
         ## two tables below are indexed by the number of keyword characters
         ## already collected in |$self->{kwd}| and give the next expected
         ## letter in upper and lower case.  The first two slots are |undef|;
         ## the visible entry point (MD_E_STATE) arrives with two characters
         ## already in |kwd|.
4029 wakaba 1.17 if ($self->{nc} == [
4030     undef,
4031     undef,
4032     0x0045, # E
4033     0x004D, # M
4034     0x0045, # E
4035     0x004E, # N
4036     ]->[length $self->{kwd}] or
4037     $self->{nc} == [
4038     undef,
4039     undef,
4040     0x0065, # e
4041     0x006D, # m
4042     0x0065, # e
4043     0x006E, # n
4044     ]->[length $self->{kwd}]) {
4045 wakaba 1.14 ## Stay in the state.
4046     $self->{kwd} .= chr $self->{nc};
4047     !!!next-input-character;
4048     redo A;
4049 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4050     ($self->{nc} == 0x0054 or # T
4051     $self->{nc} == 0x0074)) { # t
         ## Final letter.  Unless the keyword was typed entirely in upper
         ## case, report 'lowercase keyword'.  Then start an element
         ## declaration token with an empty name.
4052     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
4053     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4054     text => 'ELEMENT',
4055     line => $self->{line_prev},
4056     column => $self->{column_prev} - 5);
4057     }
4058 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
4059     line => $self->{line_prev},
4060 wakaba 1.23 column => $self->{column_prev} - 7};
4061 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4062     !!!next-input-character;
4063     redo A;
4064     } else {
         ## Keyword mismatch: treat the declaration as a bogus comment and
         ## reprocess the current character there.  The reported column
         ## steps back over the keyword characters collected so far, with
         ## a one-column adjustment at EOF.
4065     !!!parse-error (type => 'bogus comment',
4066     line => $self->{line_prev},
4067     column => $self->{column_prev} - 1
4068     - (length $self->{kwd})
4069     + 1 * ($self->{nc} == -1));
4070     $self->{state} = BOGUS_COMMENT_STATE;
4071     ## Reconsume.
4072     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4073     redo A;
4074     }
4075     } elsif ($self->{state} == MD_ATTLIST_STATE) {
         ## Matches the rest of the keyword ATTLIST case-insensitively.  The
         ## two tables below are indexed by the number of keyword characters
         ## already collected in |$self->{kwd}| and give the next expected
         ## letter in upper and lower case.  Only the first slot is |undef|;
         ## the visible entry point (DOCTYPE_MARKUP_DECLARATION_OPEN_STATE)
         ## arrives with one character already in |kwd|.
4076 wakaba 1.17 if ($self->{nc} == [
4077     undef,
4078     0x0054, # T
4079     0x0054, # T
4080     0x004C, # L
4081     0x0049, # I
4082     0x0053, # S
4083     ]->[length $self->{kwd}] or
4084     $self->{nc} == [
4085     undef,
4086     0x0074, # t
4087     0x0074, # t
4088     0x006C, # l
4089     0x0069, # i
4090     0x0073, # s
4091     ]->[length $self->{kwd}]) {
4092 wakaba 1.14 ## Stay in the state.
4093     $self->{kwd} .= chr $self->{nc};
4094     !!!next-input-character;
4095     redo A;
4096 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4097     ($self->{nc} == 0x0054 or # T
4098     $self->{nc} == 0x0074)) { # t
         ## Final letter.  Unless the keyword was typed entirely in upper
         ## case, report 'lowercase keyword'.  Then start an attribute list
         ## declaration token with an empty name and an empty |attrdefs|
         ## list.
4099     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4100     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4101     text => 'ATTLIST',
4102     line => $self->{line_prev},
4103     column => $self->{column_prev} - 5);
4104     }
4105 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4106 wakaba 1.15 attrdefs => [],
4107 wakaba 1.14 line => $self->{line_prev},
4108 wakaba 1.23 column => $self->{column_prev} - 7};
4109 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4110     !!!next-input-character;
4111     redo A;
4112     } else {
         ## Keyword mismatch: treat the declaration as a bogus comment and
         ## reprocess the current character there.  The reported column
         ## steps back over the keyword characters collected so far, with
         ## a one-column adjustment at EOF.
4113     !!!parse-error (type => 'bogus comment',
4114     line => $self->{line_prev},
4115     column => $self->{column_prev} - 1
4116     - (length $self->{kwd})
4117     + 1 * ($self->{nc} == -1));
4118     $self->{state} = BOGUS_COMMENT_STATE;
4119     ## Reconsume.
4120     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4121     redo A;
4122     }
4123     } elsif ($self->{state} == MD_NOTATION_STATE) {
         ## Matches the rest of the keyword NOTATION case-insensitively.  The
         ## two tables below are indexed by the number of keyword characters
         ## already collected in |$self->{kwd}| and give the next expected
         ## letter in upper and lower case.  Only the first slot is |undef|;
         ## the visible entry point (DOCTYPE_MARKUP_DECLARATION_OPEN_STATE)
         ## arrives with one character already in |kwd|.
4124 wakaba 1.17 if ($self->{nc} == [
4125     undef,
4126     0x004F, # O
4127     0x0054, # T
4128     0x0041, # A
4129     0x0054, # T
4130     0x0049, # I
4131     0x004F, # O
4132     ]->[length $self->{kwd}] or
4133     $self->{nc} == [
4134     undef,
4135     0x006F, # o
4136     0x0074, # t
4137     0x0061, # a
4138     0x0074, # t
4139     0x0069, # i
4140     0x006F, # o
4141     ]->[length $self->{kwd}]) {
4142 wakaba 1.14 ## Stay in the state.
4143     $self->{kwd} .= chr $self->{nc};
4144     !!!next-input-character;
4145     redo A;
4146 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4147     ($self->{nc} == 0x004E or # N
4148     $self->{nc} == 0x006E)) { # n
         ## Final letter.  Unless the keyword was typed entirely in upper
         ## case, report 'lowercase keyword'.  Then start a notation
         ## declaration token with an empty name.
4149     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4150     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4151     text => 'NOTATION',
4152     line => $self->{line_prev},
4153     column => $self->{column_prev} - 6);
4154     }
4155 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4156     line => $self->{line_prev},
4157 wakaba 1.23 column => $self->{column_prev} - 8};
4158 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4159     !!!next-input-character;
4160     redo A;
4161     } else {
         ## Keyword mismatch: treat the declaration as a bogus comment and
         ## reprocess the current character there.  The reported column
         ## steps back over the keyword characters collected so far, with
         ## a one-column adjustment at EOF.
4162     !!!parse-error (type => 'bogus comment',
4163     line => $self->{line_prev},
4164     column => $self->{column_prev} - 1
4165     - (length $self->{kwd})
4166     + 1 * ($self->{nc} == -1));
4167     $self->{state} = BOGUS_COMMENT_STATE;
4168     ## Reconsume.
4169     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4170     redo A;
4171     }
4172     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
         ## Immediately after a complete ENTITY / ELEMENT / ATTLIST / NOTATION
         ## keyword: expects white space before the declared name.  The
         ## current declaration token is in |$self->{ct}|.
4173     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4174     ## "DOCTYPE NOTATION state".
4175    
4176     if ($is_space->{$self->{nc}}) {
4177     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4178     $self->{state} = BEFORE_MD_NAME_STATE;
4179     !!!next-input-character;
4180     redo A;
4181     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4182     $self->{nc} == 0x0025) { # %
         ## "<!ENTITY%": the space before "%" is missing, but the "%" is
         ## still taken as the parameter entity marker.
4183     ## XML5: Switch to the "DOCTYPE bogus comment state".
4184     !!!parse-error (type => 'no space before md name'); ## TODO: type
4185     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4186     !!!next-input-character;
4187     redo A;
4188     } elsif ($self->{nc} == -1) {
         ## EOF: no token is emitted here; EOF is reprocessed in the
         ## internal subset state.
4189     !!!parse-error (type => 'unclosed md'); ## TODO: type
4190     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4191     ## Reconsume.
4192     redo A;
4193     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before any name: no token is emitted.
4194     ## XML5: Switch to the "DOCTYPE bogus comment state".
4195     !!!parse-error (type => 'no md name'); ## TODO: type
4196     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4197     !!!next-input-character;
4198     redo A;
4199     } else {
         ## Missing space: the current character is not consumed here, so
         ## BEFORE_MD_NAME_STATE sees it as the first character of the
         ## name.
4200     ## XML5: Switch to the "DOCTYPE bogus comment state".
4201     !!!parse-error (type => 'no space before md name'); ## TODO: type
4202     $self->{state} = BEFORE_MD_NAME_STATE;
4203     redo A;
4204     }
4205     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
         ## Skips white space before the declared name and starts the name in
         ## |$self->{ct}->{name}| at the first other character.
4206     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4207     ## before state", "DOCTYPE ATTLIST name before state".
4208    
4209     if ($is_space->{$self->{nc}}) {
4210     ## Stay in the state.
4211     !!!next-input-character;
4212     redo A;
4213     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4214     $self->{nc} == 0x0025) { # %
         ## "%" in an ENTITY declaration whose token is still typed as a
         ## general entity: possible parameter entity marker.
4215     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4216     !!!next-input-character;
4217     redo A;
4218     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before any name: no token is emitted.
4219     ## XML5: Same as "Anything else".
4220     !!!parse-error (type => 'no md name'); ## TODO: type
4221     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4222     !!!next-input-character;
4223     redo A;
4224     } elsif ($self->{nc} == -1) {
         ## EOF: no token is emitted here; EOF is reprocessed in the
         ## internal subset state.
4225     !!!parse-error (type => 'unclosed md'); ## TODO: type
4226     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4227     ## Reconsume.
4228     redo A;
4229     } else {
4230     ## XML5: [ATTLIST] Not defined yet.
4231     $self->{ct}->{name} .= chr $self->{nc};
4232     $self->{state} = MD_NAME_STATE;
4233     !!!next-input-character;
4234     redo A;
4235     }
4236     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
         ## After the "%" of "<!ENTITY %": white space must follow before the
         ## parameter entity name.
4237     if ($is_space->{$self->{nc}}) {
         ## Confirmed parameter entity: retype the current token, then go
         ## back to skipping space before the name.
4238     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4239     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4240     $self->{state} = BEFORE_MD_NAME_STATE;
4241     !!!next-input-character;
4242     redo A;
4243     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before any name: no token is emitted.
4244     ## XML5: Same as "Anything else".
4245     !!!parse-error (type => 'no md name'); ## TODO: type
4246     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247     !!!next-input-character;
4248     redo A;
4249     } elsif ($self->{nc} == -1) {
         ## EOF: no token is emitted here; EOF is reprocessed in the
         ## internal subset state.
4250     !!!parse-error (type => 'unclosed md');
4251     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4252     ## Reconsume.
4253     redo A;
4254     } else {
         ## "%" immediately followed by another character: the entity
         ## token is replaced by an empty comment token and the rest is
         ## handled by the bogus comment state.
4255     ## XML5: No parse error.
4256     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4257     $self->{state} = BOGUS_COMMENT_STATE;
4258     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4259     ## Reconsume.
4260     redo A;
4261     }
4262     } elsif ($self->{state} == MD_NAME_STATE) {
         ## Accumulates the declared name in |$self->{ct}->{name}| until white
         ## space, ">" or EOF.
4263     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4264    
4265     if ($is_space->{$self->{nc}}) {
         ## End of the name: the next state depends on the declaration
         ## type.
4266 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4267     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4268     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4269 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4270 wakaba 1.16 } else { # ENTITY/NOTATION
4271     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4272     }
4273 wakaba 1.14 !!!next-input-character;
4274     redo A;
4275     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed right after the name.  This is reported as
         ## 'no md def' for every declaration type except ATTLIST; the
         ## token is emitted in both cases.
4276     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4277     #
4278     } else {
4279 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4280 wakaba 1.14 }
4281     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4282     !!!next-input-character;
4283     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4284     redo A;
4285     } elsif ($self->{nc} == -1) {
         ## EOF inside the name: the token is emitted as it stands and EOF
         ## is reprocessed in the internal subset state.
4286     ## XML5: [ATTLIST] No parse error.
4287     !!!parse-error (type => 'unclosed md');
4288     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4289     ## Reconsume.
4290     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4291     redo A;
4292     } else {
4293     ## XML5: [ATTLIST] Not defined yet.
4294     $self->{ct}->{name} .= chr $self->{nc};
4295     ## Stay in the state.
4296     !!!next-input-character;
4297     redo A;
4298     }
4299     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
         ## After the element name of an ATTLIST declaration (this is also
         ## where further attribute definitions would start): skips white
         ## space, closes the declaration on ">", or starts a new attribute
         ## definition.
4300     if ($is_space->{$self->{nc}}) {
4301     ## Stay in the state.
4302     !!!next-input-character;
4303     redo A;
4304     } elsif ($self->{nc} == 0x003E) { # >
4305     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4306     !!!next-input-character;
4307     !!!emit ($self->{ct}); # ATTLIST
4308     redo A;
4309     } elsif ($self->{nc} == -1) {
         ## EOF: the ATTLIST token is emitted as it stands; input is not
         ## advanced, so EOF is seen again in the internal subset state.
4310     ## XML5: No parse error.
4311     !!!parse-error (type => 'unclosed md'); ## TODO: type
4312     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4313 wakaba 1.15 !!!emit ($self->{ct});
4314     redo A;
4315     } else {
         ## First character of an attribute name: start a new attribute
         ## definition in |$self->{ca}| with an empty |tokens| list and
         ## the current line and column.
4316     ## XML5: Not defined yet.
4317     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4318     tokens => [],
4319     line => $self->{line}, column => $self->{column}};
4320     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4321     !!!next-input-character;
4322     redo A;
4323     }
4324     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
         ## Accumulates the attribute name of the current attribute
         ## definition in |$self->{ca}->{name}|.
4325     if ($is_space->{$self->{nc}}) {
4326     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4327     !!!next-input-character;
4328     redo A;
4329     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before an attribute type was given.  The
         ## ATTLIST token is emitted; nothing visible in this branch adds
         ## the pending |$self->{ca}| to it.
4330     ## XML5: Same as "anything else".
4331     !!!parse-error (type => 'no attr type'); ## TODO: type
4332     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4333     !!!next-input-character;
4334     !!!emit ($self->{ct}); # ATTLIST
4335     redo A;
4336     } elsif ($self->{nc} == 0x0028) { # (
         ## "(" directly after the name: start of an allowed-token list
         ## with the separating space missing.
4337     ## XML5: Same as "anything else".
4338     !!!parse-error (type => 'no space before paren'); ## TODO: type
4339     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4340     !!!next-input-character;
4341     redo A;
4342     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch calls |!!!next-input-character|,
         ## whereas the EOF branch of MD_NAME_STATE reconsumes instead --
         ## confirm that advancing past EOF is intended.
4343     ## XML5: No parse error.
4344     !!!parse-error (type => 'unclosed md'); ## TODO: type
4345     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4346     !!!next-input-character;
4347     !!!emit ($self->{ct}); # ATTLIST
4348     redo A;
4349     } else {
4350     ## XML5: Not defined yet.
4351     $self->{ca}->{name} .= chr $self->{nc};
4352     ## Stay in the state.
4353     !!!next-input-character;
4354     redo A;
4355     }
4356     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
         ## After the attribute name: skips white space, then expects either
         ## "(" (allowed-token list) or the first character of the attribute
         ## type keyword.
4357     if ($is_space->{$self->{nc}}) {
4358     ## Stay in the state.
4359     !!!next-input-character;
4360     redo A;
4361     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before an attribute type was given.
4362     ## XML5: Same as "anything else".
4363     !!!parse-error (type => 'no attr type'); ## TODO: type
4364     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4365     !!!next-input-character;
4366     !!!emit ($self->{ct}); # ATTLIST
4367     redo A;
4368     } elsif ($self->{nc} == 0x0028) { # (
4369     ## XML5: Same as "anything else".
4370     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4371     !!!next-input-character;
4372     redo A;
4373     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch calls |!!!next-input-character|,
         ## whereas the EOF branch of MD_NAME_STATE reconsumes instead --
         ## confirm that advancing past EOF is intended.
4374     ## XML5: No parse error.
4375     !!!parse-error (type => 'unclosed md'); ## TODO: type
4376     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4377     !!!next-input-character;
4378     !!!emit ($self->{ct});
4379 wakaba 1.14 redo A;
4380     } else {
         ## First character of the attribute type; it (re)initialises
         ## |$self->{ca}->{type}|.
4381     ## XML5: Not defined yet.
4382 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4383     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4384     !!!next-input-character;
4385     redo A;
4386     }
4387     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
         ## Accumulates the attribute type keyword in |$self->{ca}->{type}|.
         ## A "#", quote or "(" that follows the type without white space is
         ## still accepted, but reported as a parse error.
4388     if ($is_space->{$self->{nc}}) {
4389     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4390     !!!next-input-character;
4391     redo A;
4392     } elsif ($self->{nc} == 0x0023) { # #
4393     ## XML5: Same as "anything else".
4394     !!!parse-error (type => 'no space before default value'); ## TODO: type
4395     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4396     !!!next-input-character;
4397     redo A;
4398     } elsif ($self->{nc} == 0x0022) { # "
         ## Start of a double-quoted default value; the value is collected
         ## by the attribute value state into |$self->{ca}->{value}|.
4399     ## XML5: Same as "anything else".
4400     !!!parse-error (type => 'no space before default value'); ## TODO: type
4401     $self->{ca}->{value} = '';
4402     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4403     !!!next-input-character;
4404     redo A;
4405     } elsif ($self->{nc} == 0x0027) { # '
         ## Start of a single-quoted default value.
4406     ## XML5: Same as "anything else".
4407     !!!parse-error (type => 'no space before default value'); ## TODO: type
4408     $self->{ca}->{value} = '';
4409     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4410     !!!next-input-character;
4411     redo A;
4412     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before any default was given.
4413     ## XML5: Same as "anything else".
4414     !!!parse-error (type => 'no attr default'); ## TODO: type
4415     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4416     !!!next-input-character;
4417     !!!emit ($self->{ct}); # ATTLIST
4418     redo A;
4419     } elsif ($self->{nc} == 0x0028) { # (
4420     ## XML5: Same as "anything else".
4421     !!!parse-error (type => 'no space before paren'); ## TODO: type
4422     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4423     !!!next-input-character;
4424     redo A;
4425     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch calls |!!!next-input-character|,
         ## whereas the EOF branch of MD_NAME_STATE reconsumes instead --
         ## confirm that advancing past EOF is intended.
4426     ## XML5: No parse error.
4427     !!!parse-error (type => 'unclosed md'); ## TODO: type
4428     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4429     !!!next-input-character;
4430     !!!emit ($self->{ct});
4431     redo A;
4432     } else {
4433     ## XML5: Not defined yet.
4434     $self->{ca}->{type} .= chr $self->{nc};
4435     ## Stay in the state.
4436     !!!next-input-character;
4437     redo A;
4438     }
4439     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
         ## After the attribute type and at least one white space character:
         ## expects "(" (allowed-token list), "#" (default declaration
         ## keyword) or a quoted default value.
4440     if ($is_space->{$self->{nc}}) {
4441     ## Stay in the state.
4442     !!!next-input-character;
4443     redo A;
4444     } elsif ($self->{nc} == 0x0028) { # (
4445     ## XML5: Same as "anything else".
4446     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4447     !!!next-input-character;
4448     redo A;
4449     } elsif ($self->{nc} == 0x0023) { # #
4450     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4451     !!!next-input-character;
4452     redo A;
4453     } elsif ($self->{nc} == 0x0022) { # "
         ## Start of a double-quoted default value; the value is collected
         ## by the attribute value state into |$self->{ca}->{value}|.
4454     ## XML5: Same as "anything else".
4455     $self->{ca}->{value} = '';
4456     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4457     !!!next-input-character;
4458     redo A;
4459     } elsif ($self->{nc} == 0x0027) { # '
         ## Start of a single-quoted default value.
4460     ## XML5: Same as "anything else".
4461     $self->{ca}->{value} = '';
4462     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4463     !!!next-input-character;
4464     redo A;
4465     } elsif ($self->{nc} == 0x003E) { # >
         ## Declaration closed before any default was given.
4466     ## XML5: Same as "anything else".
4467     !!!parse-error (type => 'no attr default'); ## TODO: type
4468     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4469     !!!next-input-character;
4470     !!!emit ($self->{ct}); # ATTLIST
4471     redo A;
4472     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch calls |!!!next-input-character|,
         ## whereas the EOF branch of MD_NAME_STATE reconsumes instead --
         ## confirm that advancing past EOF is intended.
4473     ## XML5: No parse error.
4474     !!!parse-error (type => 'unclosed md'); ## TODO: type
4475     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4476     !!!next-input-character;
4477     !!!emit ($self->{ct});
4478     redo A;
4479     } else {
         ## Any other character starts an unquoted default value: reported
         ## as an error, then the same character is reprocessed by the
         ## unquoted attribute value state.
4480     ## XML5: Switch to the "DOCTYPE bogus comment state".
4481     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4482     $self->{ca}->{value} = '';
4483     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4484     ## Reconsume.
4485     redo A;
4486     }
4487     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4488     if ($is_space->{$self->{nc}}) {
4489     ## Stay in the state.
4490     !!!next-input-character;
4491     redo A;
4492     } elsif ($self->{nc} == 0x007C) { # |
4493     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4494     ## Stay in the state.
4495     !!!next-input-character;
4496     redo A;
4497     } elsif ($self->{nc} == 0x0029) { # )
4498     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4499     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4500     !!!next-input-character;
4501     redo A;
4502     } elsif ($self->{nc} == 0x003E) { # >
4503     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4504     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505     !!!next-input-character;
4506     !!!emit ($self->{ct}); # ATTLIST
4507     redo A;
4508     } elsif ($self->{nc} == -1) {
4509     ## XML5: No parse error.
4510     !!!parse-error (type => 'unclosed md'); ## TODO: type
4511     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4512     !!!next-input-character;
4513     !!!emit ($self->{ct});
4514     redo A;
4515     } else {
4516     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4517     $self->{state} = ALLOWED_TOKEN_STATE;
4518     !!!next-input-character;
4519     redo A;
4520     }
4521     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4522     if ($is_space->{$self->{nc}}) {
4523     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4524     !!!next-input-character;
4525     redo A;
4526     } elsif ($self->{nc} == 0x007C) { # |
4527     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4528     !!!next-input-character;
4529     redo A;
4530     } elsif ($self->{nc} == 0x0029) { # )
4531     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4532     !!!next-input-character;
4533     redo A;
4534     } elsif ($self->{nc} == 0x003E) { # >
4535     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4536     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4537     !!!next-input-character;
4538     !!!emit ($self->{ct}); # ATTLIST
4539     redo A;
4540     } elsif ($self->{nc} == -1) {
4541     ## XML5: No parse error.
4542     !!!parse-error (type => 'unclosed md'); ## TODO: type
4543     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4544     !!!next-input-character;
4545     !!!emit ($self->{ct});
4546     redo A;
4547     } else {
4548     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4549     ## Stay in the state.
4550     !!!next-input-character;
4551     redo A;
4552     }
4553     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4554     if ($is_space->{$self->{nc}}) {
4555     ## Stay in the state.
4556     !!!next-input-character;
4557     redo A;
4558     } elsif ($self->{nc} == 0x007C) { # |
4559     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4560     !!!next-input-character;
4561     redo A;
4562     } elsif ($self->{nc} == 0x0029) { # )
4563     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4564     !!!next-input-character;
4565     redo A;
4566     } elsif ($self->{nc} == 0x003E) { # >
4567     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4568     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4569     !!!next-input-character;
4570     !!!emit ($self->{ct}); # ATTLIST
4571     redo A;
4572     } elsif ($self->{nc} == -1) {
4573     ## XML5: No parse error.
4574     !!!parse-error (type => 'unclosed md'); ## TODO: type
4575     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4576     !!!next-input-character;
4577     !!!emit ($self->{ct});
4578     redo A;
4579     } else {
4580     !!!parse-error (type => 'space in allowed token', ## TODO: type
4581     line => $self->{line_prev},
4582     column => $self->{column_prev});
4583     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4584     $self->{state} = ALLOWED_TOKEN_STATE;
4585     !!!next-input-character;
4586     redo A;
4587     }
4588     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4589     if ($is_space->{$self->{nc}}) {
4590     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4591     !!!next-input-character;
4592     redo A;
4593     } elsif ($self->{nc} == 0x0023) { # #
4594     !!!parse-error (type => 'no space before default value'); ## TODO: type
4595     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4596     !!!next-input-character;
4597     redo A;
4598     } elsif ($self->{nc} == 0x0022) { # "
4599     !!!parse-error (type => 'no space before default value'); ## TODO: type
4600     $self->{ca}->{value} = '';
4601     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4602     !!!next-input-character;
4603     redo A;
4604     } elsif ($self->{nc} == 0x0027) { # '
4605     !!!parse-error (type => 'no space before default value'); ## TODO: type
4606     $self->{ca}->{value} = '';
4607     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4608     !!!next-input-character;
4609     redo A;
4610     } elsif ($self->{nc} == 0x003E) { # >
4611     !!!parse-error (type => 'no attr default'); ## TODO: type
4612     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4613     !!!next-input-character;
4614     !!!emit ($self->{ct}); # ATTLIST
4615     redo A;
4616     } elsif ($self->{nc} == -1) {
4617     !!!parse-error (type => 'unclosed md'); ## TODO: type
4618     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4619     !!!next-input-character;
4620     !!!emit ($self->{ct});
4621     redo A;
4622     } else {
4623     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4624     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4625     ## Reconsume.
4626     redo A;
4627     }
4628     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4629     if ($is_space->{$self->{nc}}) {
4630     ## Stay in the state.
4631     !!!next-input-character;
4632     redo A;
4633     } elsif ($self->{nc} == 0x0023) { # #
4634     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4635     !!!next-input-character;
4636     redo A;
4637     } elsif ($self->{nc} == 0x0022) { # "
4638     $self->{ca}->{value} = '';
4639     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4640     !!!next-input-character;
4641     redo A;
4642     } elsif ($self->{nc} == 0x0027) { # '
4643     $self->{ca}->{value} = '';
4644     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4645     !!!next-input-character;
4646     redo A;
4647     } elsif ($self->{nc} == 0x003E) { # >
4648     !!!parse-error (type => 'no attr default'); ## TODO: type
4649     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4650     !!!next-input-character;
4651     !!!emit ($self->{ct}); # ATTLIST
4652     redo A;
4653     } elsif ($self->{nc} == -1) {
4654     !!!parse-error (type => 'unclosed md'); ## TODO: type
4655     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4656     !!!next-input-character;
4657     !!!emit ($self->{ct});
4658     redo A;
4659     } else {
4660     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4661     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4662     ## Reconsume.
4663     redo A;
4664     }
4665     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4666     if ($is_space->{$self->{nc}}) {
4667     ## XML5: No parse error.
4668     !!!parse-error (type => 'no default type'); ## TODO: type
4669 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4670 wakaba 1.14 ## Reconsume.
4671     redo A;
4672 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4673     ## XML5: Same as "anything else".
4674     $self->{ca}->{value} = '';
4675     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4676     !!!next-input-character;
4677     redo A;
4678     } elsif ($self->{nc} == 0x0027) { # '
4679     ## XML5: Same as "anything else".
4680     $self->{ca}->{value} = '';
4681     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4682     !!!next-input-character;
4683     redo A;
4684     } elsif ($self->{nc} == 0x003E) { # >
4685     ## XML5: Same as "anything else".
4686     !!!parse-error (type => 'no attr default'); ## TODO: type
4687     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4688     !!!next-input-character;
4689     !!!emit ($self->{ct}); # ATTLIST
4690     redo A;
4691     } elsif ($self->{nc} == -1) {
4692     ## XML5: No parse error.
4693     !!!parse-error (type => 'unclosed md'); ## TODO: type
4694     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4695     !!!next-input-character;
4696     !!!emit ($self->{ct});
4697     redo A;
4698     } else {
4699     $self->{ca}->{default} = chr $self->{nc};
4700     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4701     !!!next-input-character;
4702     redo A;
4703 wakaba 1.14 }
4704 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4705     if ($is_space->{$self->{nc}}) {
4706     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4707     !!!next-input-character;
4708     redo A;
4709     } elsif ($self->{nc} == 0x0022) { # "
4710     ## XML5: Same as "anything else".
4711     !!!parse-error (type => 'no space before default value'); ## TODO: type
4712     $self->{ca}->{value} = '';
4713     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4714     !!!next-input-character;
4715     redo A;
4716     } elsif ($self->{nc} == 0x0027) { # '
4717     ## XML5: Same as "anything else".
4718     !!!parse-error (type => 'no space before default value'); ## TODO: type
4719     $self->{ca}->{value} = '';
4720     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4721     !!!next-input-character;
4722     redo A;
4723     } elsif ($self->{nc} == 0x003E) { # >
4724     ## XML5: Same as "anything else".
4725     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4726     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4727     !!!next-input-character;
4728     !!!emit ($self->{ct}); # ATTLIST
4729     redo A;
4730     } elsif ($self->{nc} == -1) {
4731     ## XML5: No parse error.
4732     !!!parse-error (type => 'unclosed md'); ## TODO: type
4733     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4734     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4735     !!!next-input-character;
4736     !!!emit ($self->{ct});
4737     redo A;
4738     } else {
4739     $self->{ca}->{default} .= chr $self->{nc};
4740     ## Stay in the state.
4741     !!!next-input-character;
4742     redo A;
4743     }
4744     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4745     if ($is_space->{$self->{nc}}) {
4746     ## Stay in the state.
4747     !!!next-input-character;
4748     redo A;
4749     } elsif ($self->{nc} == 0x0022) { # "
4750     $self->{ca}->{value} = '';
4751     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4752     !!!next-input-character;
4753     redo A;
4754     } elsif ($self->{nc} == 0x0027) { # '
4755     $self->{ca}->{value} = '';
4756     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4757     !!!next-input-character;
4758     redo A;
4759     } elsif ($self->{nc} == 0x003E) { # >
4760     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4761     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4762     !!!next-input-character;
4763     !!!emit ($self->{ct}); # ATTLIST
4764     redo A;
4765     } elsif ($self->{nc} == -1) {
4766     ## XML5: No parse error.
4767     !!!parse-error (type => 'unclosed md'); ## TODO: type
4768     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4769     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4770     !!!next-input-character;
4771     !!!emit ($self->{ct});
4772     redo A;
4773     } else {
4774     ## XML5: Not defined yet.
4775     if ($self->{ca}->{default} eq 'FIXED') {
4776     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4777     } else {
4778     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4779     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4780     }
4781     ## Reconsume.
4782     redo A;
4783     }
4784     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4785     if ($is_space->{$self->{nc}} or
4786     $self->{nc} == -1 or
4787     $self->{nc} == 0x003E) { # >
4788     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4789     ## Reconsume.
4790     redo A;
4791     } else {
4792     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4793     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4794     ## Reconsume.
4795     redo A;
4796 wakaba 1.16 }
4797 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4798     ## ASCII case-insensitive
4799     if ($self->{nc} == [
4800     undef,
4801     0x0044, # D
4802     0x0041, # A
4803     0x0054, # T
4804     ]->[length $self->{kwd}] or
4805     $self->{nc} == [
4806     undef,
4807     0x0064, # d
4808     0x0061, # a
4809     0x0074, # t
4810     ]->[length $self->{kwd}]) {
4811     !!!cp (172.2);
4812     ## Stay in the state.
4813     $self->{kwd} .= chr $self->{nc};
4814     !!!next-input-character;
4815     redo A;
4816     } elsif ((length $self->{kwd}) == 4 and
4817     ($self->{nc} == 0x0041 or # A
4818     $self->{nc} == 0x0061)) { # a
4819     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4820     !!!cp (172.3);
4821     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4822     text => 'NDATA',
4823     line => $self->{line_prev},
4824     column => $self->{column_prev} - 4);
4825     } else {
4826     !!!cp (172.4);
4827     }
4828     $self->{state} = AFTER_NDATA_STATE;
4829     !!!next-input-character;
4830     redo A;
4831     } else {
4832     !!!parse-error (type => 'string after literal', ## TODO: type
4833     line => $self->{line_prev},
4834     column => $self->{column_prev} + 1
4835     - length $self->{kwd});
4836     !!!cp (172.5);
4837     $self->{state} = BOGUS_MD_STATE;
4838     ## Reconsume.
4839     redo A;
4840     }
4841     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4842     if ($is_space->{$self->{nc}}) {
4843     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4844     !!!next-input-character;
4845     redo A;
4846     } elsif ($self->{nc} == 0x003E) { # >
4847     !!!parse-error (type => 'no notation name'); ## TODO: type
4848     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4849     !!!next-input-character;
4850     !!!emit ($self->{ct}); # ENTITY
4851     redo A;
4852     } elsif ($self->{nc} == -1) {
4853     !!!parse-error (type => 'unclosed md'); ## TODO: type
4854     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4855     !!!next-input-character;
4856     !!!emit ($self->{ct}); # ENTITY
4857     redo A;
4858     } else {
4859     !!!parse-error (type => 'string after literal', ## TODO: type
4860     line => $self->{line_prev},
4861     column => $self->{column_prev} + 1
4862     - length $self->{kwd});
4863     $self->{state} = BOGUS_MD_STATE;
4864     ## Reconsume.
4865     redo A;
4866     }
4867     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4868     if ($is_space->{$self->{nc}}) {
4869     ## Stay in the state.
4870     !!!next-input-character;
4871     redo A;
4872     } elsif ($self->{nc} == 0x003E) { # >
4873     !!!parse-error (type => 'no notation name'); ## TODO: type
4874     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4875     !!!next-input-character;
4876     !!!emit ($self->{ct}); # ENTITY
4877     redo A;
4878     } elsif ($self->{nc} == -1) {
4879     !!!parse-error (type => 'unclosed md'); ## TODO: type
4880     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4881     !!!next-input-character;
4882     !!!emit ($self->{ct}); # ENTITY
4883     redo A;
4884     } else {
4885     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4886     $self->{state} = NOTATION_NAME_STATE;
4887     !!!next-input-character;
4888     redo A;
4889     }
4890     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4891     if ($is_space->{$self->{nc}}) {
4892 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4893 wakaba 1.18 !!!next-input-character;
4894     redo A;
4895     } elsif ($self->{nc} == 0x003E) { # >
4896     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4897     !!!next-input-character;
4898     !!!emit ($self->{ct}); # ENTITY
4899     redo A;
4900     } elsif ($self->{nc} == -1) {
4901     !!!parse-error (type => 'unclosed md'); ## TODO: type
4902     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4903     !!!next-input-character;
4904     !!!emit ($self->{ct}); # ENTITY
4905     redo A;
4906     } else {
4907     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4908     ## Stay in the state.
4909     !!!next-input-character;
4910     redo A;
4911     }
4912 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4913     if ($self->{nc} == 0x0022) { # "
4914 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4915 wakaba 1.19 !!!next-input-character;
4916     redo A;
4917     } elsif ($self->{nc} == 0x0026) { # &
4918     $self->{prev_state} = $self->{state};
4919     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4920     $self->{entity_add} = 0x0022; # "
4921     !!!next-input-character;
4922     redo A;
4923     ## TODO: %
4924     } elsif ($self->{nc} == -1) {
4925     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4926     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4927     ## Reconsume.
4928     !!!emit ($self->{ct}); # ENTITY
4929     redo A;
4930     } else {
4931     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4932     !!!next-input-character;
4933     redo A;
4934     }
4935     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4936     if ($self->{nc} == 0x0027) { # '
4937 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4938 wakaba 1.19 !!!next-input-character;
4939     redo A;
4940     } elsif ($self->{nc} == 0x0026) { # &
4941     $self->{prev_state} = $self->{state};
4942     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4943     $self->{entity_add} = 0x0027; # '
4944     !!!next-input-character;
4945     redo A;
4946     ## TODO: %
4947     } elsif ($self->{nc} == -1) {
4948     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4949     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4950     ## Reconsume.
4951     !!!emit ($self->{ct}); # ENTITY
4952     redo A;
4953     } else {
4954     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4955     !!!next-input-character;
4956     redo A;
4957     }
4958     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4959     if ($is_space->{$self->{nc}} or
4960     {
4961     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4962     $self->{entity_add} => 1,
4963     }->{$self->{nc}}) {
4964 wakaba 1.22 !!!parse-error (type => 'bare ero',
4965     line => $self->{line_prev},
4966     column => $self->{column_prev}
4967     + ($self->{nc} == -1 ? 1 : 0));
4968 wakaba 1.19 ## Don't consume
4969     ## Return nothing.
4970     #
4971     } elsif ($self->{nc} == 0x0023) { # #
4972     $self->{ca} = $self->{ct};
4973     $self->{state} = ENTITY_HASH_STATE;
4974     $self->{kwd} = '#';
4975     !!!next-input-character;
4976     redo A;
4977     } else {
4978     #
4979     }
4980    
4981     $self->{ct}->{value} .= '&';
4982     $self->{state} = $self->{prev_state};
4983     ## Reconsume.
4984     redo A;
4985 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4986     if ($is_space->{$self->{nc}}) {
4987     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4988     !!!next-input-character;
4989     redo A;
4990     } elsif ($self->{nc} == 0x0028) { # (
4991     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4992     $self->{ct}->{content} = ['('];
4993     $self->{group_depth} = 1;
4994     !!!next-input-character;
4995     redo A;
4996     } elsif ($self->{nc} == 0x003E) { # >
4997     !!!parse-error (type => 'no md def'); ## TODO: type
4998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4999     !!!next-input-character;
5000     !!!emit ($self->{ct}); # ELEMENT
5001     redo A;
5002     } elsif ($self->{nc} == -1) {
5003     !!!parse-error (type => 'unclosed md'); ## TODO: type
5004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5005     !!!next-input-character;
5006     !!!emit ($self->{ct}); # ELEMENT
5007     redo A;
5008     } else {
5009     $self->{ct}->{content} = [chr $self->{nc}];
5010     $self->{state} = CONTENT_KEYWORD_STATE;
5011     !!!next-input-character;
5012     redo A;
5013     }
5014     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
5015     if ($is_space->{$self->{nc}}) {
5016     $self->{state} = AFTER_MD_DEF_STATE;
5017     !!!next-input-character;
5018     redo A;
5019     } elsif ($self->{nc} == 0x003E) { # >
5020     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5021     !!!next-input-character;
5022     !!!emit ($self->{ct}); # ELEMENT
5023     redo A;
5024     } elsif ($self->{nc} == -1) {
5025     !!!parse-error (type => 'unclosed md'); ## TODO: type
5026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5027     !!!next-input-character;
5028     !!!emit ($self->{ct}); # ELEMENT
5029     redo A;
5030     } else {
5031     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
5032     ## Stay in the state.
5033     !!!next-input-character;
5034     redo A;
5035     }
5036     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
5037     if ($is_space->{$self->{nc}}) {
5038     ## Stay in the state.
5039     !!!next-input-character;
5040     redo A;
5041     } elsif ($self->{nc} == 0x0028) { # (
5042     $self->{group_depth}++;
5043     push @{$self->{ct}->{content}}, chr $self->{nc};
5044     ## Stay in the state.
5045     !!!next-input-character;
5046     redo A;
5047     } elsif ($self->{nc} == 0x007C or # |
5048     $self->{nc} == 0x002C) { # ,
5049     !!!parse-error (type => 'empty element name'); ## TODO: type
5050     ## Stay in the state.
5051     !!!next-input-character;
5052     redo A;
5053     } elsif ($self->{nc} == 0x0029) { # )
5054     !!!parse-error (type => 'empty element name'); ## TODO: type
5055     push @{$self->{ct}->{content}}, chr $self->{nc};
5056     $self->{group_depth}--;
5057     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5058     !!!next-input-character;
5059     redo A;
5060     } elsif ($self->{nc} == 0x003E) { # >
5061     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5062     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5063     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5064     !!!next-input-character;
5065     !!!emit ($self->{ct}); # ELEMENT
5066     redo A;
5067     } elsif ($self->{nc} == -1) {
5068     !!!parse-error (type => 'unclosed md'); ## TODO: type
5069     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5070     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5071     !!!next-input-character;
5072     !!!emit ($self->{ct}); # ELEMENT
5073     redo A;
5074     } else {
5075     push @{$self->{ct}->{content}}, chr $self->{nc};
5076     $self->{state} = CM_ELEMENT_NAME_STATE;
5077     !!!next-input-character;
5078     redo A;
5079     }
5080     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
5081     if ($is_space->{$self->{nc}}) {
5082     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5083     !!!next-input-character;
5084     redo A;
5085     } elsif ($self->{nc} == 0x002A or # *
5086     $self->{nc} == 0x002B or # +
5087     $self->{nc} == 0x003F) { # ?
5088     push @{$self->{ct}->{content}}, chr $self->{nc};
5089     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5090     !!!next-input-character;
5091     redo A;
5092     } elsif ($self->{nc} == 0x007C or # |
5093     $self->{nc} == 0x002C) { # ,
5094     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5095     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5096     !!!next-input-character;
5097     redo A;
5098     } elsif ($self->{nc} == 0x0029) { # )
5099     $self->{group_depth}--;
5100     push @{$self->{ct}->{content}}, chr $self->{nc};
5101     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5102     !!!next-input-character;
5103     redo A;
5104     } elsif ($self->{nc} == 0x003E) { # >
5105     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5106     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5107     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5108     !!!next-input-character;
5109     !!!emit ($self->{ct}); # ELEMENT
5110     redo A;
5111     } elsif ($self->{nc} == -1) {
5112     !!!parse-error (type => 'unclosed md'); ## TODO: type
5113     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5115     !!!next-input-character;
5116     !!!emit ($self->{ct}); # ELEMENT
5117     redo A;
5118     } else {
5119     $self->{ct}->{content}->[-1] .= chr $self->{nc};
5120     ## Stay in the state.
5121     !!!next-input-character;
5122     redo A;
5123     }
5124     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5125     if ($is_space->{$self->{nc}}) {
5126     ## Stay in the state.
5127     !!!next-input-character;
5128     redo A;
5129     } elsif ($self->{nc} == 0x007C or # |
5130     $self->{nc} == 0x002C) { # ,
5131     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5132     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5133     !!!next-input-character;
5134     redo A;
5135     } elsif ($self->{nc} == 0x0029) { # )
5136     $self->{group_depth}--;
5137     push @{$self->{ct}->{content}}, chr $self->{nc};
5138     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5139     !!!next-input-character;
5140     redo A;
5141     } elsif ($self->{nc} == 0x003E) { # >
5142     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5143     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5144     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5145     !!!next-input-character;
5146     !!!emit ($self->{ct}); # ELEMENT
5147     redo A;
5148     } elsif ($self->{nc} == -1) {
5149     !!!parse-error (type => 'unclosed md'); ## TODO: type
5150     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5151     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5152     !!!next-input-character;
5153     !!!emit ($self->{ct}); # ELEMENT
5154     redo A;
5155     } else {
5156     !!!parse-error (type => 'after element name'); ## TODO: type
5157     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5158     $self->{state} = BOGUS_MD_STATE;
5159     !!!next-input-character;
5160     redo A;
5161     }
5162     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5163     if ($is_space->{$self->{nc}}) {
5164     if ($self->{group_depth}) {
5165     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5166     } else {
5167     $self->{state} = AFTER_MD_DEF_STATE;
5168     }
5169     !!!next-input-character;
5170     redo A;
5171     } elsif ($self->{nc} == 0x002A or # *
5172     $self->{nc} == 0x002B or # +
5173     $self->{nc} == 0x003F) { # ?
5174     push @{$self->{ct}->{content}}, chr $self->{nc};
5175     if ($self->{group_depth}) {
5176     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5177     } else {
5178     $self->{state} = AFTER_MD_DEF_STATE;
5179     }
5180     !!!next-input-character;
5181     redo A;
5182     } elsif ($self->{nc} == 0x0029) { # )
5183     if ($self->{group_depth}) {
5184     $self->{group_depth}--;
5185     push @{$self->{ct}->{content}}, chr $self->{nc};
5186     ## Stay in the state.
5187     !!!next-input-character;
5188     redo A;
5189     } else {
5190     !!!parse-error (type => 'string after md def'); ## TODO: type
5191     $self->{state} = BOGUS_MD_STATE;
5192     ## Reconsume.
5193     redo A;
5194     }
5195     } elsif ($self->{nc} == 0x003E) { # >
5196     if ($self->{group_depth}) {
5197     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5198     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5199     }
5200     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5201     !!!next-input-character;
5202     !!!emit ($self->{ct}); # ELEMENT
5203     redo A;
5204     } elsif ($self->{nc} == -1) {
5205     !!!parse-error (type => 'unclosed md'); ## TODO: type
5206     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5208     !!!next-input-character;
5209     !!!emit ($self->{ct}); # ELEMENT
5210     redo A;
5211     } else {
5212     if ($self->{group_depth}) {
5213     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5214     } else {
5215     !!!parse-error (type => 'string after md def'); ## TODO: type
5216     $self->{state} = BOGUS_MD_STATE;
5217     }
5218     ## Reconsume.
5219     redo A;
5220     }
5221     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5222 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5223     ## Stay in the state.
5224     !!!next-input-character;
5225     redo A;
5226     } elsif ($self->{nc} == 0x003E) { # >
5227     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5228     !!!next-input-character;
5229 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5230 wakaba 1.18 redo A;
5231     } elsif ($self->{nc} == -1) {
5232     !!!parse-error (type => 'unclosed md'); ## TODO: type
5233     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5234     !!!next-input-character;
5235 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5236 wakaba 1.18 redo A;
5237     } else {
5238 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5239 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5240     ## Reconsume.
5241     redo A;
5242     }
5243 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5244     if ($self->{nc} == 0x003E) { # >
5245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5246     !!!next-input-character;
5247     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5248     redo A;
5249     } elsif ($self->{nc} == -1) {
5250     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5251     ## Reconsume.
5252     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5253     redo A;
5254     } else {
5255     ## Stay in the state.
5256     !!!next-input-character;
5257     redo A;
5258     }
5259 wakaba 1.1 } else {
5260     die "$0: $self->{state}: Unknown state";
5261     }
5262     } # A
5263    
5264     die "$0: _get_next_token: unexpected case";
5265     } # _get_next_token
5266    
5267     1;
5268 wakaba 1.33 ## $Date: 2009/09/05 09:57:55 $
5269 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24