/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.32 - (hide annotations) (download) (as text)
Sat Sep 5 09:57:55 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.31: +60 -5 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 09:57:06 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Added test cases for "comment end space
	state" (HTML5 revision 3195).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:57:45 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	space state" (HTML5 revision 3195).


1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.32 our $VERSION=do{my @r=(q$Revision: 1.31 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Exporter setup, run inside |BEGIN|: makes this package an |Exporter|
## subclass and offers every token type constant for import, either by
## name (|@EXPORT_OK|) or all at once through the |:token| tag
## (|%EXPORT_TAGS|).  Both name lists contain the same fourteen names.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be requested individually.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag groups the same names for a single import request.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Each token type is a constant sub with an empty prototype that returns
## a distinct integer from 1 to 14.  The value is what a token hash
## carries in its |type| field (e.g. |{type => CHARACTER_TOKEN, ...}|).
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The three |CM_*| subs are single-bit flags; the four |*_CONTENT_MODEL|
## subs combine them into the values stored in |$self->{content_model}|:
##   PLAINTEXT - no bit set (neither "&" nor "<" is markup)
##   CDATA     - limited "<" markup only
##   RCDATA    - "&" entities plus limited "<" markup
##   PCDATA    - "&" entities plus full "<" markup
## The tokenizer tests individual bits with |&|, e.g.
## |$self->{content_model} & CM_ENTITY|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Each tokenizer state is a constant sub with an empty prototype that
## returns a distinct integer; the current state is kept in
## |$self->{state}| and compared numerically against these subs.
## The numbers 1 and 12 are not in use: the subs that carried them are
## commented out below.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
## The next two states are annotated as revision 1.32 and take the numbers
## 102 and 103, i.e. they follow the XML-only states (51..101) rather than
## their neighbours here.  103 is the largest state number visible in this
## file; the "## LAST" marker presumably flags it as the most recently
## allocated number - confirm before adding another state.
108 wakaba 1.32 sub COMMENT_END_BANG_STATE () { 102 }
109     sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110 wakaba 1.1 sub COMMENT_END_DASH_STATE () { 18 }
111     sub BOGUS_COMMENT_STATE () { 19 }
112     sub DOCTYPE_STATE () { 20 }
113     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114     sub DOCTYPE_NAME_STATE () { 22 }
115     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124     sub BOGUS_DOCTYPE_STATE () { 32 }
125     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126     sub SELF_CLOSING_START_TAG_STATE () { 34 }
127     sub CDATA_SECTION_STATE () { 35 }
128     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136     ## NOTE: "Entity data state", "entity in attribute value state", and
137     ## "consume a character reference" algorithm are jointly implemented
138     ## using the following six states:
139     sub ENTITY_STATE () { 44 }
140     sub ENTITY_HASH_STATE () { 45 }
141     sub NCR_NUM_STATE () { 46 }
142     sub HEXREF_X_STATE () { 47 }
143     sub HEXREF_HEX_STATE () { 48 }
144     sub ENTITY_NAME_STATE () { 49 }
145     sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147 wakaba 1.12 ## XML-only states
## Constant subs for the states used only when tokenizing XML.  Their
## values run consecutively from 51 to 101, continuing the numbering of
## the states defined before them; all values are distinct.
148 wakaba 1.8 sub PI_STATE () { 51 }
149     sub PI_TARGET_STATE () { 52 }
150     sub PI_TARGET_AFTER_STATE () { 53 }
151     sub PI_DATA_STATE () { 54 }
152     sub PI_AFTER_STATE () { 55 }
153     sub PI_DATA_AFTER_STATE () { 56 }
154 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157     sub DOCTYPE_TAG_STATE () { 60 }
158     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159     sub MD_ATTLIST_STATE () { 62 }
160     sub MD_E_STATE () { 63 }
161     sub MD_ELEMENT_STATE () { 64 }
162     sub MD_ENTITY_STATE () { 65 }
163     sub MD_NOTATION_STATE () { 66 }
164     sub DOCTYPE_MD_STATE () { 67 }
165     sub BEFORE_MD_NAME_STATE () { 68 }
166     sub MD_NAME_STATE () { 69 }
167     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174     sub ALLOWED_TOKEN_STATE () { 77 }
175     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
183     sub NDATA_STATE () { 86 }
184     sub AFTER_NDATA_STATE () { 87 }
185     sub BEFORE_NOTATION_NAME_STATE () { 88 }
186     sub NOTATION_NAME_STATE () { 89 }
187 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190     sub AFTER_ELEMENT_NAME_STATE () { 93 }
191     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192     sub CONTENT_KEYWORD_STATE () { 95 }
193     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194     sub CM_ELEMENT_NAME_STATE () { 97 }
195     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197     sub AFTER_MD_DEF_STATE () { 100 }
198     sub BOGUS_MD_STATE () { 101 }
199 wakaba 1.8
200 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
201     ## list and descriptions)
202    
## NOTE(review): both literals below evaluate to the same number, 2**11
## (2048); the second merely uses "_" as a digit separator.  They are
## presumably flags belonging to two separate bit sets defined in
## Whatpm::HTML (one for insertion modes, one for element categories) -
## confirm against that module before comparing them with each other.
203     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
204     sub FOREIGN_EL () { 0b1_00000000000 }
205    
206     ## Character reference mappings
207    
## Replacement table keyed by code point number; each value is the code
## point to use in place of the key.  It is not referenced in the part of
## the file visible here - presumably the numeric character reference
## states later in |_get_next_token| consult it (verify).
##
## The literal entries map CR (0x0D) to LF and every code point in
## 0x80..0x9F to the character listed (0xFFFD where none is assigned;
## the assigned ones look like the Windows-1252 readings - TODO confirm).
208     my $charref_map = {
209     0x0D => 0x000A,
210     0x80 => 0x20AC,
211     0x81 => 0xFFFD,
212     0x82 => 0x201A,
213     0x83 => 0x0192,
214     0x84 => 0x201E,
215     0x85 => 0x2026,
216     0x86 => 0x2020,
217     0x87 => 0x2021,
218     0x88 => 0x02C6,
219     0x89 => 0x2030,
220     0x8A => 0x0160,
221     0x8B => 0x2039,
222     0x8C => 0x0152,
223     0x8D => 0xFFFD,
224     0x8E => 0x017D,
225     0x8F => 0xFFFD,
226     0x90 => 0xFFFD,
227     0x91 => 0x2018,
228     0x92 => 0x2019,
229     0x93 => 0x201C,
230     0x94 => 0x201D,
231     0x95 => 0x2022,
232     0x96 => 0x2013,
233     0x97 => 0x2014,
234     0x98 => 0x02DC,
235     0x99 => 0x2122,
236     0x9A => 0x0161,
237     0x9B => 0x203A,
238     0x9C => 0x0153,
239     0x9D => 0xFFFD,
240     0x9E => 0x017E,
241     0x9F => 0x0178,
242     }; # $charref_map
## Every code point in the list below is additionally mapped to U+FFFD:
## 0x00..0x08, 0x0B, 0x0E..0x1F, 0x7F, the range 0xD800..0xDFFF, the range
## 0xFDD0..0xFDDF, and the two code points ending in FFFE/FFFF of each
## plane up to 0x10FFFF.  0x09, 0x0A, 0x0C and 0x0D are not in the list,
## so the CR entry above is left untouched.  The existing "ISSUE" remark
## points at 0xFDEF as a possible alternative upper bound for the 0xFDD0
## range; the code stops at 0xFDDF.
243     $charref_map->{$_} = 0xFFFD
244     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
245     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
246     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
247     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
248     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
249     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
250     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
251    
252     ## Implementations MUST act as if they used the state machine in the spec.
253    
## Resets the tokenizer fields of the object to their starting values.
## Takes the object (|$self|) as its only argument and works purely by
## assigning to its hash fields; there is no explicit |return|.
## Fields listed in commented-out lines are not touched here: they are
## either supplied by the constructor or set up when first used.
254     sub _initialize_tokenizer ($) {
255     my $self = shift;
256    
257     ## NOTE: Fields set by |new| constructor:
258     #$self->{level}
259     #$self->{set_nc}
260     #$self->{parse_error}
261 wakaba 1.3 #$self->{is_xml} (if XML)
262 wakaba 1.1
263     $self->{state} = DATA_STATE; # MUST
264 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
265     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
266 wakaba 1.1 #$self->{entity__value}; # initialized when used
267     #$self->{entity__match}; # initialized when used
268     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model: PCDATA
269     undef $self->{ct}; # current token
270     undef $self->{ca}; # current attribute
271     undef $self->{last_stag_name}; # last emitted start tag name
272     #$self->{prev_state}; # initialized when used
273     delete $self->{self_closing}; # drop any leftover self-closing flag
274     $self->{char_buffer} = ''; # start with an empty character buffer ...
275     $self->{char_buffer_pos} = 0; # ... and position 0 within it
276     $self->{nc} = -1; # next input character
277     #$self->{next_nc}
## |!!!next-input-character| is a macro of this |.src| file; its expansion
## is not visible here.  Presumably it loads the first input character
## into |$self->{nc}|, replacing the -1 (the value the state machine
## treats as end of file) assigned just above - verify in the macro
## definition.
278     !!!next-input-character;
279     $self->{token} = []; # no pending tokens; |_get_next_token| shifts from this list first
280     # $self->{escape}
281     } # _initialize_tokenizer
282    
283     ## A token has:
284     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
285 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
286 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
287     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
288 wakaba 1.11 ## ->{target} (PI_TOKEN)
289 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
290     ## ->{sysid} (DOCTYPE_TOKEN)
291     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
292     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
293     ## ->{name}
294     ## ->{value}
295     ## ->{has_reference} == 1 or 0
296 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
297     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
298 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
299 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
300 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
301    
302 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
303     ## |$token->{self_closing}| is used to save the value of
304     ## |$self->{self_closing}| while the token is pushed back to the stack.
305    
306     ## An emitted token MUST immediately be handled by the tree construction stage.
307    
308     ## Before each step, UA MAY check to see if either one of the scripts in
309     ## "list of scripts that will execute as soon as possible" or the first
310     ## script in the "list of scripts that will execute asynchronously",
311     ## has completed loading. If one has, then it MUST be executed
312     ## and removed from the list.
313    
314     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
315     ## (This requirement was dropped from HTML5 spec, unfortunately.)
316    
## Lookup table of the code points the tokenizer treats as white space:
## keys are code point numbers, values are true.  It is consulted as
## |$is_space->{$self->{nc}}|, so any code point without an entry yields
## a false (undefined) result.  HT, LF, FF and SP are present; the VT and
## CR entries are commented out and therefore not treated as space.
317     my $is_space = {
318     0x0009 => 1, # CHARACTER TABULATION (HT)
319     0x000A => 1, # LINE FEED (LF)
320     #0x000B => 0, # LINE TABULATION (VT)
321 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
322 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
323     0x0020 => 1, # SPACE (SP)
324     };
325    
326     sub _get_next_token ($) {
327     my $self = shift;
328    
329     if ($self->{self_closing}) {
330     !!!parse-error (type => 'nestc', token => $self->{ct});
331     ## NOTE: The |self_closing| flag is only set by start tag token.
332     ## In addition, when a start tag token is emitted, it is always set to
333     ## |ct|.
334     delete $self->{self_closing};
335     }
336    
337     if (@{$self->{token}}) {
338     $self->{self_closing} = $self->{token}->[0]->{self_closing};
339     return shift @{$self->{token}};
340     }
341    
342     A: {
343     if ($self->{state} == PCDATA_STATE) {
344     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
345    
346     if ($self->{nc} == 0x0026) { # &
347     !!!cp (0.1);
348     ## NOTE: In the spec, the tokenizer is switched to the
349     ## "entity data state". In this implementation, the tokenizer
350     ## is switched to the |ENTITY_STATE|, which is an implementation
351     ## of the "consume a character reference" algorithm.
352     $self->{entity_add} = -1;
353     $self->{prev_state} = DATA_STATE;
354     $self->{state} = ENTITY_STATE;
355     !!!next-input-character;
356     redo A;
357     } elsif ($self->{nc} == 0x003C) { # <
358     !!!cp (0.2);
359     $self->{state} = TAG_OPEN_STATE;
360     !!!next-input-character;
361     redo A;
362     } elsif ($self->{nc} == -1) {
363     !!!cp (0.3);
364     !!!emit ({type => END_OF_FILE_TOKEN,
365     line => $self->{line}, column => $self->{column}});
366     last A; ## TODO: ok?
367     } else {
368     !!!cp (0.4);
369     #
370     }
371    
372     # Anything else
373     my $token = {type => CHARACTER_TOKEN,
374     data => chr $self->{nc},
375     line => $self->{line}, column => $self->{column},
376     };
377     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
378    
379     ## Stay in the state.
380     !!!next-input-character;
381     !!!emit ($token);
382     redo A;
383     } elsif ($self->{state} == DATA_STATE) {
384     $self->{s_kwd} = '' unless defined $self->{s_kwd};
385     if ($self->{nc} == 0x0026) { # &
386     $self->{s_kwd} = '';
387     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
388     not $self->{escape}) {
389     !!!cp (1);
390     ## NOTE: In the spec, the tokenizer is switched to the
391     ## "entity data state". In this implementation, the tokenizer
392     ## is switched to the |ENTITY_STATE|, which is an implementation
393     ## of the "consume a character reference" algorithm.
394     $self->{entity_add} = -1;
395     $self->{prev_state} = DATA_STATE;
396     $self->{state} = ENTITY_STATE;
397     !!!next-input-character;
398     redo A;
399     } else {
400     !!!cp (2);
401     #
402     }
403     } elsif ($self->{nc} == 0x002D) { # -
404     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
405 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
406 wakaba 1.1 !!!cp (3);
407     $self->{escape} = 1; # unless $self->{escape};
408     $self->{s_kwd} = '--';
409     #
410 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
411 wakaba 1.1 !!!cp (4);
412     $self->{s_kwd} = '--';
413     #
414 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
415     !!!cp (4.1);
416     $self->{s_kwd} .= '-';
417     #
418 wakaba 1.1 } else {
419     !!!cp (5);
420 wakaba 1.5 $self->{s_kwd} = '-';
421 wakaba 1.1 #
422     }
423     }
424    
425     #
426     } elsif ($self->{nc} == 0x0021) { # !
427     if (length $self->{s_kwd}) {
428     !!!cp (5.1);
429     $self->{s_kwd} .= '!';
430     #
431     } else {
432     !!!cp (5.2);
433     #$self->{s_kwd} = '';
434     #
435     }
436     #
437     } elsif ($self->{nc} == 0x003C) { # <
438     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
439     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
440     not $self->{escape})) {
441     !!!cp (6);
442     $self->{state} = TAG_OPEN_STATE;
443     !!!next-input-character;
444     redo A;
445     } else {
446     !!!cp (7);
447     $self->{s_kwd} = '';
448     #
449     }
450     } elsif ($self->{nc} == 0x003E) { # >
451     if ($self->{escape} and
452     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
453     if ($self->{s_kwd} eq '--') {
454     !!!cp (8);
455     delete $self->{escape};
456 wakaba 1.5 #
457 wakaba 1.1 } else {
458     !!!cp (9);
459 wakaba 1.5 #
460 wakaba 1.1 }
461 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
462     !!!cp (9.1);
463     !!!parse-error (type => 'unmatched mse', ## TODO: type
464     line => $self->{line_prev},
465     column => $self->{column_prev} - 1);
466     #
467 wakaba 1.1 } else {
468     !!!cp (10);
469 wakaba 1.5 #
470 wakaba 1.1 }
471    
472     $self->{s_kwd} = '';
473     #
474 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
475     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
476     !!!cp (10.1);
477     $self->{s_kwd} .= ']';
478     } elsif ($self->{s_kwd} eq ']]') {
479     !!!cp (10.2);
480     #
481     } else {
482     !!!cp (10.3);
483     $self->{s_kwd} = '';
484     }
485     #
486 wakaba 1.1 } elsif ($self->{nc} == -1) {
487     !!!cp (11);
488     $self->{s_kwd} = '';
489     !!!emit ({type => END_OF_FILE_TOKEN,
490     line => $self->{line}, column => $self->{column}});
491     last A; ## TODO: ok?
492     } else {
493     !!!cp (12);
494     $self->{s_kwd} = '';
495     #
496     }
497    
498     # Anything else
499     my $token = {type => CHARACTER_TOKEN,
500     data => chr $self->{nc},
501     line => $self->{line}, column => $self->{column},
502     };
503 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
504 wakaba 1.1 length $token->{data})) {
505     $self->{s_kwd} = '';
506     }
507    
508     ## Stay in the data state.
509 wakaba 1.5 if (not $self->{is_xml} and
510     $self->{content_model} == PCDATA_CONTENT_MODEL) {
511 wakaba 1.1 !!!cp (13);
512     $self->{state} = PCDATA_STATE;
513     } else {
514     !!!cp (14);
515     ## Stay in the state.
516     }
517     !!!next-input-character;
518     !!!emit ($token);
519     redo A;
520     } elsif ($self->{state} == TAG_OPEN_STATE) {
521 wakaba 1.10 ## XML5: "tag state".
522    
523 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
524     if ($self->{nc} == 0x002F) { # /
525     !!!cp (15);
526     !!!next-input-character;
527     $self->{state} = CLOSE_TAG_OPEN_STATE;
528     redo A;
529     } elsif ($self->{nc} == 0x0021) { # !
530     !!!cp (15.1);
531 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
532 wakaba 1.1 #
533     } else {
534     !!!cp (16);
535 wakaba 1.12 $self->{s_kwd} = '';
536 wakaba 1.1 #
537     }
538    
539     ## reconsume
540     $self->{state} = DATA_STATE;
541     !!!emit ({type => CHARACTER_TOKEN, data => '<',
542     line => $self->{line_prev},
543     column => $self->{column_prev},
544     });
545     redo A;
546     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
547     if ($self->{nc} == 0x0021) { # !
548     !!!cp (17);
549     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
550     !!!next-input-character;
551     redo A;
552     } elsif ($self->{nc} == 0x002F) { # /
553     !!!cp (18);
554     $self->{state} = CLOSE_TAG_OPEN_STATE;
555     !!!next-input-character;
556     redo A;
557     } elsif (0x0041 <= $self->{nc} and
558     $self->{nc} <= 0x005A) { # A..Z
559     !!!cp (19);
560     $self->{ct}
561     = {type => START_TAG_TOKEN,
562 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 wakaba 1.1 line => $self->{line_prev},
564     column => $self->{column_prev}};
565     $self->{state} = TAG_NAME_STATE;
566     !!!next-input-character;
567     redo A;
568     } elsif (0x0061 <= $self->{nc} and
569     $self->{nc} <= 0x007A) { # a..z
570     !!!cp (20);
571     $self->{ct} = {type => START_TAG_TOKEN,
572     tag_name => chr ($self->{nc}),
573     line => $self->{line_prev},
574     column => $self->{column_prev}};
575     $self->{state} = TAG_NAME_STATE;
576     !!!next-input-character;
577     redo A;
578     } elsif ($self->{nc} == 0x003E) { # >
579     !!!cp (21);
580     !!!parse-error (type => 'empty start tag',
581     line => $self->{line_prev},
582     column => $self->{column_prev});
583     $self->{state} = DATA_STATE;
584 wakaba 1.5 $self->{s_kwd} = '';
585 wakaba 1.1 !!!next-input-character;
586    
587     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
588     line => $self->{line_prev},
589     column => $self->{column_prev},
590     });
591    
592     redo A;
593     } elsif ($self->{nc} == 0x003F) { # ?
594 wakaba 1.8 if ($self->{is_xml}) {
595     !!!cp (22.1);
596     $self->{state} = PI_STATE;
597     !!!next-input-character;
598     redo A;
599     } else {
600     !!!cp (22);
601     !!!parse-error (type => 'pio',
602     line => $self->{line_prev},
603     column => $self->{column_prev});
604     $self->{state} = BOGUS_COMMENT_STATE;
605     $self->{ct} = {type => COMMENT_TOKEN, data => '',
606     line => $self->{line_prev},
607     column => $self->{column_prev},
608     };
609     ## $self->{nc} is intentionally left as is
610     redo A;
611     }
612 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
613 wakaba 1.1 !!!cp (23);
614     !!!parse-error (type => 'bare stago',
615     line => $self->{line_prev},
616     column => $self->{column_prev});
617     $self->{state} = DATA_STATE;
618 wakaba 1.5 $self->{s_kwd} = '';
619 wakaba 1.1 ## reconsume
620    
621     !!!emit ({type => CHARACTER_TOKEN, data => '<',
622     line => $self->{line_prev},
623     column => $self->{column_prev},
624     });
625    
626     redo A;
627 wakaba 1.9 } else {
628     ## XML5: "<:" is a parse error.
629     !!!cp (23.1);
630     $self->{ct} = {type => START_TAG_TOKEN,
631     tag_name => chr ($self->{nc}),
632     line => $self->{line_prev},
633     column => $self->{column_prev}};
634     $self->{state} = TAG_NAME_STATE;
635     !!!next-input-character;
636     redo A;
637 wakaba 1.1 }
638     } else {
639     die "$0: $self->{content_model} in tag open";
640     }
641     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
642     ## NOTE: The "close tag open state" in the spec is implemented as
643     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
644    
645 wakaba 1.10 ## XML5: "end tag state".
646    
647 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
648     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
649     if (defined $self->{last_stag_name}) {
650     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
651 wakaba 1.12 $self->{kwd} = '';
652 wakaba 1.1 ## Reconsume.
653     redo A;
654     } else {
655     ## No start tag token has ever been emitted
656     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
657     !!!cp (28);
658     $self->{state} = DATA_STATE;
659 wakaba 1.5 $self->{s_kwd} = '';
660 wakaba 1.1 ## Reconsume.
661     !!!emit ({type => CHARACTER_TOKEN, data => '</',
662     line => $l, column => $c,
663     });
664     redo A;
665     }
666     }
667    
668     if (0x0041 <= $self->{nc} and
669     $self->{nc} <= 0x005A) { # A..Z
670     !!!cp (29);
671     $self->{ct}
672     = {type => END_TAG_TOKEN,
673 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
674 wakaba 1.1 line => $l, column => $c};
675     $self->{state} = TAG_NAME_STATE;
676     !!!next-input-character;
677     redo A;
678     } elsif (0x0061 <= $self->{nc} and
679     $self->{nc} <= 0x007A) { # a..z
680     !!!cp (30);
681     $self->{ct} = {type => END_TAG_TOKEN,
682     tag_name => chr ($self->{nc}),
683     line => $l, column => $c};
684     $self->{state} = TAG_NAME_STATE;
685     !!!next-input-character;
686     redo A;
687     } elsif ($self->{nc} == 0x003E) { # >
688     !!!parse-error (type => 'empty end tag',
689     line => $self->{line_prev}, ## "<" in "</>"
690     column => $self->{column_prev} - 1);
691     $self->{state} = DATA_STATE;
692 wakaba 1.5 $self->{s_kwd} = '';
693 wakaba 1.10 if ($self->{is_xml}) {
694     !!!cp (31);
695     ## XML5: No parse error.
696    
697     ## NOTE: This parser raises a parse error, since it supports
698     ## XML1, not XML5.
699    
700     ## NOTE: A short end tag token.
701     my $ct = {type => END_TAG_TOKEN,
702     tag_name => '',
703     line => $self->{line_prev},
704     column => $self->{column_prev} - 1,
705     };
706     !!!next-input-character;
707     !!!emit ($ct);
708     } else {
709     !!!cp (31.1);
710     !!!next-input-character;
711     }
712 wakaba 1.1 redo A;
713     } elsif ($self->{nc} == -1) {
714     !!!cp (32);
715     !!!parse-error (type => 'bare etago');
716 wakaba 1.5 $self->{s_kwd} = '';
717 wakaba 1.1 $self->{state} = DATA_STATE;
718     # reconsume
719    
720     !!!emit ({type => CHARACTER_TOKEN, data => '</',
721     line => $l, column => $c,
722     });
723    
724     redo A;
725 wakaba 1.10 } elsif (not $self->{is_xml} or
726     $is_space->{$self->{nc}}) {
727 wakaba 1.1 !!!cp (33);
728 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
729     line => $self->{line_prev}, # "<" of "</"
730     column => $self->{column_prev} - 1);
731 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
732     $self->{ct} = {type => COMMENT_TOKEN, data => '',
733     line => $self->{line_prev}, # "<" of "</"
734     column => $self->{column_prev} - 1,
735     };
736     ## NOTE: $self->{nc} is intentionally left as is.
737     ## Although the "anything else" case of the spec not explicitly
738     ## states that the next input character is to be reconsumed,
739     ## it will be included to the |data| of the comment token
740     ## generated from the bogus end tag, as defined in the
741     ## "bogus comment state" entry.
742     redo A;
743 wakaba 1.10 } else {
744     ## XML5: "</:" is a parse error.
745     !!!cp (30.1);
746     $self->{ct} = {type => END_TAG_TOKEN,
747     tag_name => chr ($self->{nc}),
748     line => $l, column => $c};
749     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
750     !!!next-input-character;
751     redo A;
752 wakaba 1.1 }
753     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
754 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
755 wakaba 1.1 if (length $ch) {
756     my $CH = $ch;
757     $ch =~ tr/a-z/A-Z/;
758     my $nch = chr $self->{nc};
759     if ($nch eq $ch or $nch eq $CH) {
760     !!!cp (24);
761     ## Stay in the state.
762 wakaba 1.12 $self->{kwd} .= $nch;
763 wakaba 1.1 !!!next-input-character;
764     redo A;
765     } else {
766     !!!cp (25);
767     $self->{state} = DATA_STATE;
768 wakaba 1.5 $self->{s_kwd} = '';
769 wakaba 1.1 ## Reconsume.
770     !!!emit ({type => CHARACTER_TOKEN,
771 wakaba 1.12 data => '</' . $self->{kwd},
772 wakaba 1.1 line => $self->{line_prev},
773 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
774 wakaba 1.1 });
775     redo A;
776     }
777     } else { # after "<{tag-name}"
778     unless ($is_space->{$self->{nc}} or
779     {
780     0x003E => 1, # >
781     0x002F => 1, # /
782     -1 => 1, # EOF
783     }->{$self->{nc}}) {
784     !!!cp (26);
785     ## Reconsume.
786     $self->{state} = DATA_STATE;
787 wakaba 1.5 $self->{s_kwd} = '';
788 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
789 wakaba 1.12 data => '</' . $self->{kwd},
790 wakaba 1.1 line => $self->{line_prev},
791 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
792 wakaba 1.1 });
793     redo A;
794     } else {
795     !!!cp (27);
796     $self->{ct}
797     = {type => END_TAG_TOKEN,
798     tag_name => $self->{last_stag_name},
799     line => $self->{line_prev},
800 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
801 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
802     ## Reconsume.
803     redo A;
804     }
805     }
806     } elsif ($self->{state} == TAG_NAME_STATE) {
807     if ($is_space->{$self->{nc}}) {
808     !!!cp (34);
809     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
810     !!!next-input-character;
811     redo A;
812     } elsif ($self->{nc} == 0x003E) { # >
813     if ($self->{ct}->{type} == START_TAG_TOKEN) {
814     !!!cp (35);
815     $self->{last_stag_name} = $self->{ct}->{tag_name};
816     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
817     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
818     #if ($self->{ct}->{attributes}) {
819     # ## NOTE: This should never be reached.
820     # !!! cp (36);
821     # !!! parse-error (type => 'end tag attribute');
822     #} else {
823     !!!cp (37);
824     #}
825     } else {
826     die "$0: $self->{ct}->{type}: Unknown token type";
827     }
828     $self->{state} = DATA_STATE;
829 wakaba 1.5 $self->{s_kwd} = '';
830 wakaba 1.1 !!!next-input-character;
831    
832     !!!emit ($self->{ct}); # start tag or end tag
833    
834     redo A;
835     } elsif (0x0041 <= $self->{nc} and
836     $self->{nc} <= 0x005A) { # A..Z
837     !!!cp (38);
838 wakaba 1.4 $self->{ct}->{tag_name}
839     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
840 wakaba 1.1 # start tag or end tag
841     ## Stay in this state
842     !!!next-input-character;
843     redo A;
844     } elsif ($self->{nc} == -1) {
845     !!!parse-error (type => 'unclosed tag');
846     if ($self->{ct}->{type} == START_TAG_TOKEN) {
847     !!!cp (39);
848     $self->{last_stag_name} = $self->{ct}->{tag_name};
849     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
850     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
851     #if ($self->{ct}->{attributes}) {
852     # ## NOTE: This state should never be reached.
853     # !!! cp (40);
854     # !!! parse-error (type => 'end tag attribute');
855     #} else {
856     !!!cp (41);
857     #}
858     } else {
859     die "$0: $self->{ct}->{type}: Unknown token type";
860     }
861     $self->{state} = DATA_STATE;
862 wakaba 1.5 $self->{s_kwd} = '';
863 wakaba 1.1 # reconsume
864    
865     !!!emit ($self->{ct}); # start tag or end tag
866    
867     redo A;
868     } elsif ($self->{nc} == 0x002F) { # /
869     !!!cp (42);
870     $self->{state} = SELF_CLOSING_START_TAG_STATE;
871     !!!next-input-character;
872     redo A;
873     } else {
874     !!!cp (44);
875     $self->{ct}->{tag_name} .= chr $self->{nc};
876     # start tag or end tag
877     ## Stay in the state
878     !!!next-input-character;
879     redo A;
880     }
881     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
882 wakaba 1.11 ## XML5: "Tag attribute name before state".
883    
884 wakaba 1.1 if ($is_space->{$self->{nc}}) {
885     !!!cp (45);
886     ## Stay in the state
887     !!!next-input-character;
888     redo A;
889     } elsif ($self->{nc} == 0x003E) { # >
890     if ($self->{ct}->{type} == START_TAG_TOKEN) {
891     !!!cp (46);
892     $self->{last_stag_name} = $self->{ct}->{tag_name};
893     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
894     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895     if ($self->{ct}->{attributes}) {
896     !!!cp (47);
897     !!!parse-error (type => 'end tag attribute');
898     } else {
899     !!!cp (48);
900     }
901     } else {
902     die "$0: $self->{ct}->{type}: Unknown token type";
903     }
904     $self->{state} = DATA_STATE;
905 wakaba 1.5 $self->{s_kwd} = '';
906 wakaba 1.1 !!!next-input-character;
907    
908     !!!emit ($self->{ct}); # start tag or end tag
909    
910     redo A;
911     } elsif (0x0041 <= $self->{nc} and
912     $self->{nc} <= 0x005A) { # A..Z
913     !!!cp (49);
914     $self->{ca}
915 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
916 wakaba 1.1 value => '',
917     line => $self->{line}, column => $self->{column}};
918     $self->{state} = ATTRIBUTE_NAME_STATE;
919     !!!next-input-character;
920     redo A;
921     } elsif ($self->{nc} == 0x002F) { # /
922     !!!cp (50);
923     $self->{state} = SELF_CLOSING_START_TAG_STATE;
924     !!!next-input-character;
925     redo A;
926     } elsif ($self->{nc} == -1) {
927     !!!parse-error (type => 'unclosed tag');
928     if ($self->{ct}->{type} == START_TAG_TOKEN) {
929     !!!cp (52);
930     $self->{last_stag_name} = $self->{ct}->{tag_name};
931     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
932     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
933     if ($self->{ct}->{attributes}) {
934     !!!cp (53);
935     !!!parse-error (type => 'end tag attribute');
936     } else {
937     !!!cp (54);
938     }
939     } else {
940     die "$0: $self->{ct}->{type}: Unknown token type";
941     }
942     $self->{state} = DATA_STATE;
943 wakaba 1.5 $self->{s_kwd} = '';
944 wakaba 1.1 # reconsume
945    
946     !!!emit ($self->{ct}); # start tag or end tag
947    
948     redo A;
949     } else {
950     if ({
951     0x0022 => 1, # "
952     0x0027 => 1, # '
953 wakaba 1.30 0x003C => 1, # <
954 wakaba 1.1 0x003D => 1, # =
955     }->{$self->{nc}}) {
956     !!!cp (55);
957 wakaba 1.11 ## XML5: Not a parse error.
958 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
959     } else {
960     !!!cp (56);
961 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
962 wakaba 1.1 }
963     $self->{ca}
964     = {name => chr ($self->{nc}),
965     value => '',
966     line => $self->{line}, column => $self->{column}};
967     $self->{state} = ATTRIBUTE_NAME_STATE;
968     !!!next-input-character;
969     redo A;
970     }
971     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
972 wakaba 1.11 ## XML5: "Tag attribute name state".
973    
## $before_leave: closure invoked by the branches of the attribute name
## state that finish the current attribute name (space, "=", ">", "/",
## and end of input), before they switch state.  It takes no arguments
## and works on the enclosing $self: the current attribute |$self->{ca}|
## and the current tag token |$self->{ct}|.
##
## If the token already has an attribute with the same name, a
## 'duplicate attribute' parse error is reported at the attribute's own
## line/column and the new attribute is not stored.  Otherwise the
## attribute is stored under its name and given the next |index| value
## (pre-incremented |last_index| of the token).
##
## NOTE(review): in Perl, the nested |exists| test autovivifies
## |$self->{ct}->{attributes}| as a hash reference even when no
## attribute is stored, so later truth tests of that key succeed after
## this closure has run -- confirm that this is intended.
974 wakaba 1.1 my $before_leave = sub {
975     if (exists $self->{ct}->{attributes} # start tag or end tag
976     ->{$self->{ca}->{name}}) { # MUST
977     !!!cp (57);
978     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
979     ## Discard $self->{ca} # MUST
980     } else {
981     !!!cp (58);
982     $self->{ct}->{attributes}->{$self->{ca}->{name}}
983     = $self->{ca};
984 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
985 wakaba 1.1 }
986     }; # $before_leave
987    
988     if ($is_space->{$self->{nc}}) {
989     !!!cp (59);
990     $before_leave->();
991     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
992     !!!next-input-character;
993     redo A;
994     } elsif ($self->{nc} == 0x003D) { # =
995     !!!cp (60);
996     $before_leave->();
997     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
998     !!!next-input-character;
999     redo A;
1000     } elsif ($self->{nc} == 0x003E) { # >
1001 wakaba 1.11 if ($self->{is_xml}) {
1002     !!!cp (60.1);
1003     ## XML5: Not a parse error.
1004     !!!parse-error (type => 'no attr value'); ## TODO: type
1005     } else {
1006     !!!cp (60.2);
1007     }
1008    
1009 wakaba 1.1 $before_leave->();
1010     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1011     !!!cp (61);
1012     $self->{last_stag_name} = $self->{ct}->{tag_name};
1013     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1014     !!!cp (62);
1015     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1016     if ($self->{ct}->{attributes}) {
1017     !!!parse-error (type => 'end tag attribute');
1018     }
1019     } else {
1020     die "$0: $self->{ct}->{type}: Unknown token type";
1021     }
1022     $self->{state} = DATA_STATE;
1023 wakaba 1.5 $self->{s_kwd} = '';
1024 wakaba 1.1 !!!next-input-character;
1025    
1026     !!!emit ($self->{ct}); # start tag or end tag
1027    
1028     redo A;
1029     } elsif (0x0041 <= $self->{nc} and
1030     $self->{nc} <= 0x005A) { # A..Z
1031     !!!cp (63);
1032 wakaba 1.4 $self->{ca}->{name}
1033     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1034 wakaba 1.1 ## Stay in the state
1035     !!!next-input-character;
1036     redo A;
1037     } elsif ($self->{nc} == 0x002F) { # /
1038 wakaba 1.11 if ($self->{is_xml}) {
1039     !!!cp (64);
1040     ## XML5: Not a parse error.
1041     !!!parse-error (type => 'no attr value'); ## TODO: type
1042     } else {
1043     !!!cp (64.1);
1044     }
1045    
1046 wakaba 1.1 $before_leave->();
1047     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1048     !!!next-input-character;
1049     redo A;
1050     } elsif ($self->{nc} == -1) {
1051     !!!parse-error (type => 'unclosed tag');
1052     $before_leave->();
1053     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1054     !!!cp (66);
1055     $self->{last_stag_name} = $self->{ct}->{tag_name};
1056     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1057     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1058     if ($self->{ct}->{attributes}) {
1059     !!!cp (67);
1060     !!!parse-error (type => 'end tag attribute');
1061     } else {
1062     ## NOTE: This state should never be reached.
1063     !!!cp (68);
1064     }
1065     } else {
1066     die "$0: $self->{ct}->{type}: Unknown token type";
1067     }
1068     $self->{state} = DATA_STATE;
1069 wakaba 1.5 $self->{s_kwd} = '';
1070 wakaba 1.1 # reconsume
1071    
1072     !!!emit ($self->{ct}); # start tag or end tag
1073    
1074     redo A;
1075     } else {
1076 wakaba 1.30 if ({
1077     0x0022 => 1, # "
1078     0x0027 => 1, # '
1079     0x003C => 1, # <
1080     }->{$self->{nc}}) {
1081 wakaba 1.1 !!!cp (69);
1082 wakaba 1.11 ## XML5: Not a parse error.
1083 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1084     } else {
1085     !!!cp (70);
1086     }
1087     $self->{ca}->{name} .= chr ($self->{nc});
1088     ## Stay in the state
1089     !!!next-input-character;
1090     redo A;
1091     }
1092     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1093 wakaba 1.11 ## XML5: "Tag attribute name after state".
1094    
1095 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1096     !!!cp (71);
1097     ## Stay in the state
1098     !!!next-input-character;
1099     redo A;
1100     } elsif ($self->{nc} == 0x003D) { # =
1101     !!!cp (72);
1102     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1103     !!!next-input-character;
1104     redo A;
1105     } elsif ($self->{nc} == 0x003E) { # >
1106 wakaba 1.11 if ($self->{is_xml}) {
1107     !!!cp (72.1);
1108     ## XML5: Not a parse error.
1109     !!!parse-error (type => 'no attr value'); ## TODO: type
1110     } else {
1111     !!!cp (72.2);
1112     }
1113    
1114 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1115     !!!cp (73);
1116     $self->{last_stag_name} = $self->{ct}->{tag_name};
1117     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1118     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119     if ($self->{ct}->{attributes}) {
1120     !!!cp (74);
1121     !!!parse-error (type => 'end tag attribute');
1122     } else {
1123     ## NOTE: This state should never be reached.
1124     !!!cp (75);
1125     }
1126     } else {
1127     die "$0: $self->{ct}->{type}: Unknown token type";
1128     }
1129     $self->{state} = DATA_STATE;
1130 wakaba 1.5 $self->{s_kwd} = '';
1131 wakaba 1.1 !!!next-input-character;
1132    
1133     !!!emit ($self->{ct}); # start tag or end tag
1134    
1135     redo A;
1136     } elsif (0x0041 <= $self->{nc} and
1137     $self->{nc} <= 0x005A) { # A..Z
1138     !!!cp (76);
1139     $self->{ca}
1140 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1141 wakaba 1.1 value => '',
1142     line => $self->{line}, column => $self->{column}};
1143     $self->{state} = ATTRIBUTE_NAME_STATE;
1144     !!!next-input-character;
1145     redo A;
1146     } elsif ($self->{nc} == 0x002F) { # /
1147 wakaba 1.11 if ($self->{is_xml}) {
1148     !!!cp (77);
1149     ## XML5: Not a parse error.
1150     !!!parse-error (type => 'no attr value'); ## TODO: type
1151     } else {
1152     !!!cp (77.1);
1153     }
1154    
1155 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1156     !!!next-input-character;
1157     redo A;
1158     } elsif ($self->{nc} == -1) {
1159     !!!parse-error (type => 'unclosed tag');
1160     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1161     !!!cp (79);
1162     $self->{last_stag_name} = $self->{ct}->{tag_name};
1163     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1164     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1165     if ($self->{ct}->{attributes}) {
1166     !!!cp (80);
1167     !!!parse-error (type => 'end tag attribute');
1168     } else {
1169     ## NOTE: This state should never be reached.
1170     !!!cp (81);
1171     }
1172     } else {
1173     die "$0: $self->{ct}->{type}: Unknown token type";
1174     }
1175 wakaba 1.5 $self->{s_kwd} = '';
1176 wakaba 1.1 $self->{state} = DATA_STATE;
1177     # reconsume
1178    
1179     !!!emit ($self->{ct}); # start tag or end tag
1180    
1181     redo A;
1182     } else {
1183 wakaba 1.11 if ($self->{is_xml}) {
1184     !!!cp (78.1);
1185     ## XML5: Not a parse error.
1186     !!!parse-error (type => 'no attr value'); ## TODO: type
1187     } else {
1188     !!!cp (78.2);
1189     }
1190    
1191 wakaba 1.30 if ({
1192     0x0022 => 1, # "
1193     0x0027 => 1, # '
1194     0x003C => 1, # <
1195     }->{$self->{nc}}) {
1196 wakaba 1.1 !!!cp (78);
1197 wakaba 1.11 ## XML5: Not a parse error.
1198 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1199     } else {
1200     !!!cp (82);
1201     }
1202     $self->{ca}
1203     = {name => chr ($self->{nc}),
1204     value => '',
1205     line => $self->{line}, column => $self->{column}};
1206     $self->{state} = ATTRIBUTE_NAME_STATE;
1207     !!!next-input-character;
1208     redo A;
1209     }
1210     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1211 wakaba 1.11 ## XML5: "Tag attribute value before state".
1212    
1213 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1214     !!!cp (83);
1215     ## Stay in the state
1216     !!!next-input-character;
1217     redo A;
1218     } elsif ($self->{nc} == 0x0022) { # "
1219     !!!cp (84);
1220     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1221     !!!next-input-character;
1222     redo A;
1223     } elsif ($self->{nc} == 0x0026) { # &
1224     !!!cp (85);
1225     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1226     ## reconsume
1227     redo A;
1228     } elsif ($self->{nc} == 0x0027) { # '
1229     !!!cp (86);
1230     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1231     !!!next-input-character;
1232     redo A;
1233     } elsif ($self->{nc} == 0x003E) { # >
1234     !!!parse-error (type => 'empty unquoted attribute value');
1235     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1236     !!!cp (87);
1237     $self->{last_stag_name} = $self->{ct}->{tag_name};
1238     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1239     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1240     if ($self->{ct}->{attributes}) {
1241     !!!cp (88);
1242     !!!parse-error (type => 'end tag attribute');
1243     } else {
1244     ## NOTE: This state should never be reached.
1245     !!!cp (89);
1246     }
1247     } else {
1248     die "$0: $self->{ct}->{type}: Unknown token type";
1249     }
1250     $self->{state} = DATA_STATE;
1251 wakaba 1.5 $self->{s_kwd} = '';
1252 wakaba 1.1 !!!next-input-character;
1253    
1254     !!!emit ($self->{ct}); # start tag or end tag
1255    
1256     redo A;
1257     } elsif ($self->{nc} == -1) {
1258     !!!parse-error (type => 'unclosed tag');
1259     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1260     !!!cp (90);
1261     $self->{last_stag_name} = $self->{ct}->{tag_name};
1262     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1263     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1264     if ($self->{ct}->{attributes}) {
1265     !!!cp (91);
1266     !!!parse-error (type => 'end tag attribute');
1267     } else {
1268     ## NOTE: This state should never be reached.
1269     !!!cp (92);
1270     }
1271     } else {
1272     die "$0: $self->{ct}->{type}: Unknown token type";
1273     }
1274     $self->{state} = DATA_STATE;
1275 wakaba 1.5 $self->{s_kwd} = '';
1276 wakaba 1.1 ## reconsume
1277    
1278     !!!emit ($self->{ct}); # start tag or end tag
1279    
1280     redo A;
1281     } else {
1282 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1283 wakaba 1.1 !!!cp (93);
1284 wakaba 1.11 ## XML5: Not a parse error.
1285 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1286 wakaba 1.11 } elsif ($self->{is_xml}) {
1287     !!!cp (93.1);
1288     ## XML5: No parse error.
1289     !!!parse-error (type => 'unquoted attr value'); ## TODO
1290 wakaba 1.1 } else {
1291     !!!cp (94);
1292     }
1293     $self->{ca}->{value} .= chr ($self->{nc});
1294     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1295     !!!next-input-character;
1296     redo A;
1297     }
1298     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1299 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1300     ## ATTLIST attribute value double quoted state".
1301 wakaba 1.11
1302 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1303 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1304     !!!cp (95.1);
1305     ## XML5: "DOCTYPE ATTLIST name after state".
1306     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1307     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1308     } else {
1309     !!!cp (95);
1310     ## XML5: "Tag attribute name before state".
1311     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1312     }
1313 wakaba 1.1 !!!next-input-character;
1314     redo A;
1315     } elsif ($self->{nc} == 0x0026) { # &
1316     !!!cp (96);
1317 wakaba 1.11 ## XML5: Not defined yet.
1318    
1319 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1320     ## "entity in attribute value state". In this implementation, the
1321     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1322     ## implementation of the "consume a character reference" algorithm.
1323     $self->{prev_state} = $self->{state};
1324     $self->{entity_add} = 0x0022; # "
1325     $self->{state} = ENTITY_STATE;
1326     !!!next-input-character;
1327     redo A;
1328 wakaba 1.25 } elsif ($self->{is_xml} and
1329     $is_space->{$self->{nc}}) {
1330     !!!cp (97.1);
1331     $self->{ca}->{value} .= ' ';
1332     ## Stay in the state.
1333     !!!next-input-character;
1334     redo A;
1335 wakaba 1.1 } elsif ($self->{nc} == -1) {
1336     !!!parse-error (type => 'unclosed attribute value');
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338     !!!cp (97);
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340 wakaba 1.15
1341     $self->{state} = DATA_STATE;
1342     $self->{s_kwd} = '';
1343     ## reconsume
1344     !!!emit ($self->{ct}); # start tag
1345     redo A;
1346 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1347     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1348     if ($self->{ct}->{attributes}) {
1349     !!!cp (98);
1350     !!!parse-error (type => 'end tag attribute');
1351     } else {
1352     ## NOTE: This state should never be reached.
1353     !!!cp (99);
1354     }
1355 wakaba 1.15
1356     $self->{state} = DATA_STATE;
1357     $self->{s_kwd} = '';
1358     ## reconsume
1359     !!!emit ($self->{ct}); # end tag
1360     redo A;
1361     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1362     ## XML5: No parse error above; not defined yet.
1363     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1364     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1365     ## Reconsume.
1366     !!!emit ($self->{ct}); # ATTLIST
1367     redo A;
1368 wakaba 1.1 } else {
1369     die "$0: $self->{ct}->{type}: Unknown token type";
1370     }
1371     } else {
1372 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1373 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1374     !!!cp (100);
1375     ## XML5: Not a parse error.
1376     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1377     } else {
1378     !!!cp (100.1);
1379     }
1380 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1381     $self->{read_until}->($self->{ca}->{value},
1382 wakaba 1.25 qq["&<\x09\x0C\x20],
1383 wakaba 1.1 length $self->{ca}->{value});
1384    
1385     ## Stay in the state
1386     !!!next-input-character;
1387     redo A;
1388     }
1389     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1390 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1391     ## ATTLIST attribute value single quoted state".
1392 wakaba 1.11
1393 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1394 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1395     !!!cp (101.1);
1396     ## XML5: "DOCTYPE ATTLIST name after state".
1397     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1398     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1399     } else {
1400     !!!cp (101);
1401     ## XML5: "Before attribute name state" (sic).
1402     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1403     }
1404 wakaba 1.1 !!!next-input-character;
1405     redo A;
1406     } elsif ($self->{nc} == 0x0026) { # &
1407     !!!cp (102);
1408 wakaba 1.11 ## XML5: Not defined yet.
1409    
1410 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1411     ## "entity in attribute value state". In this implementation, the
1412     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1413     ## implementation of the "consume a character reference" algorithm.
1414     $self->{entity_add} = 0x0027; # '
1415     $self->{prev_state} = $self->{state};
1416     $self->{state} = ENTITY_STATE;
1417     !!!next-input-character;
1418     redo A;
1419 wakaba 1.25 } elsif ($self->{is_xml} and
1420     $is_space->{$self->{nc}}) {
1421     !!!cp (103.1);
1422     $self->{ca}->{value} .= ' ';
1423     ## Stay in the state.
1424     !!!next-input-character;
1425     redo A;
1426 wakaba 1.1 } elsif ($self->{nc} == -1) {
1427     !!!parse-error (type => 'unclosed attribute value');
1428     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1429     !!!cp (103);
1430     $self->{last_stag_name} = $self->{ct}->{tag_name};
1431 wakaba 1.15
1432     $self->{state} = DATA_STATE;
1433     $self->{s_kwd} = '';
1434     ## reconsume
1435     !!!emit ($self->{ct}); # start tag
1436     redo A;
1437 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1438     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1439     if ($self->{ct}->{attributes}) {
1440     !!!cp (104);
1441     !!!parse-error (type => 'end tag attribute');
1442     } else {
1443     ## NOTE: This state should never be reached.
1444     !!!cp (105);
1445     }
1446 wakaba 1.15
1447     $self->{state} = DATA_STATE;
1448     $self->{s_kwd} = '';
1449     ## reconsume
1450     !!!emit ($self->{ct}); # end tag
1451     redo A;
1452     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1453     ## XML5: No parse error above; not defined yet.
1454     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1455     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1456     ## Reconsume.
1457     !!!emit ($self->{ct}); # ATTLIST
1458     redo A;
1459 wakaba 1.1 } else {
1460     die "$0: $self->{ct}->{type}: Unknown token type";
1461     }
1462     } else {
1463 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1464 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1465     !!!cp (106);
1466     ## XML5: Not a parse error.
1467     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1468     } else {
1469     !!!cp (106.1);
1470     }
1471 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1472     $self->{read_until}->($self->{ca}->{value},
1473 wakaba 1.25 qq['&<\x09\x0C\x20],
1474 wakaba 1.1 length $self->{ca}->{value});
1475    
1476     ## Stay in the state
1477     !!!next-input-character;
1478     redo A;
1479     }
1480     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1481 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1482    
1483 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1484 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1485     !!!cp (107.1);
1486     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1487     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1488     } else {
1489     !!!cp (107);
1490     ## XML5: "Tag attribute name before state".
1491     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1492     }
1493 wakaba 1.1 !!!next-input-character;
1494     redo A;
1495     } elsif ($self->{nc} == 0x0026) { # &
1496     !!!cp (108);
1497 wakaba 1.11
1498     ## XML5: Not defined yet.
1499    
1500 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1501     ## "entity in attribute value state". In this implementation, the
1502     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1503     ## implementation of the "consume a character reference" algorithm.
1504     $self->{entity_add} = -1;
1505     $self->{prev_state} = $self->{state};
1506     $self->{state} = ENTITY_STATE;
1507     !!!next-input-character;
1508     redo A;
1509     } elsif ($self->{nc} == 0x003E) { # >
1510     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1511     !!!cp (109);
1512     $self->{last_stag_name} = $self->{ct}->{tag_name};
1513 wakaba 1.15
1514     $self->{state} = DATA_STATE;
1515     $self->{s_kwd} = '';
1516     !!!next-input-character;
1517     !!!emit ($self->{ct}); # start tag
1518     redo A;
1519 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1520     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1521     if ($self->{ct}->{attributes}) {
1522     !!!cp (110);
1523     !!!parse-error (type => 'end tag attribute');
1524     } else {
1525     ## NOTE: This state should never be reached.
1526     !!!cp (111);
1527     }
1528 wakaba 1.15
1529     $self->{state} = DATA_STATE;
1530     $self->{s_kwd} = '';
1531     !!!next-input-character;
1532     !!!emit ($self->{ct}); # end tag
1533     redo A;
1534     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1535     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1536     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1537     !!!next-input-character;
1538     !!!emit ($self->{ct}); # ATTLIST
1539     redo A;
1540 wakaba 1.1 } else {
1541     die "$0: $self->{ct}->{type}: Unknown token type";
1542     }
1543     } elsif ($self->{nc} == -1) {
1544     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1545     !!!cp (112);
1546 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1547 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1548 wakaba 1.15
1549     $self->{state} = DATA_STATE;
1550     $self->{s_kwd} = '';
1551     ## reconsume
1552     !!!emit ($self->{ct}); # start tag
1553     redo A;
1554 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1555 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1556 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1557     if ($self->{ct}->{attributes}) {
1558     !!!cp (113);
1559     !!!parse-error (type => 'end tag attribute');
1560     } else {
1561     ## NOTE: This state should never be reached.
1562     !!!cp (114);
1563     }
1564 wakaba 1.15
1565     $self->{state} = DATA_STATE;
1566     $self->{s_kwd} = '';
1567     ## reconsume
1568     !!!emit ($self->{ct}); # end tag
1569     redo A;
1570     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1571     !!!parse-error (type => 'unclosed md'); ## TODO: type
1572     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1573     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1574     ## Reconsume.
1575     !!!emit ($self->{ct}); # ATTLIST
1576     redo A;
1577 wakaba 1.1 } else {
1578     die "$0: $self->{ct}->{type}: Unknown token type";
1579     }
1580     } else {
1581     if ({
1582     0x0022 => 1, # "
1583     0x0027 => 1, # '
1584     0x003D => 1, # =
1585 wakaba 1.26 0x003C => 1, # <
1586 wakaba 1.1 }->{$self->{nc}}) {
1587     !!!cp (115);
1588 wakaba 1.11 ## XML5: Not a parse error.
1589 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1590     } else {
1591     !!!cp (116);
1592     }
1593     $self->{ca}->{value} .= chr ($self->{nc});
1594     $self->{read_until}->($self->{ca}->{value},
1595 wakaba 1.25 qq["'=& \x09\x0C>],
1596 wakaba 1.1 length $self->{ca}->{value});
1597    
1598     ## Stay in the state
1599     !!!next-input-character;
1600     redo A;
1601     }
1602     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1603     if ($is_space->{$self->{nc}}) {
1604     !!!cp (118);
1605     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1606     !!!next-input-character;
1607     redo A;
1608     } elsif ($self->{nc} == 0x003E) { # >
1609     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1610     !!!cp (119);
1611     $self->{last_stag_name} = $self->{ct}->{tag_name};
1612     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1613     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1614     if ($self->{ct}->{attributes}) {
1615     !!!cp (120);
1616     !!!parse-error (type => 'end tag attribute');
1617     } else {
1618     ## NOTE: This state should never be reached.
1619     !!!cp (121);
1620     }
1621     } else {
1622     die "$0: $self->{ct}->{type}: Unknown token type";
1623     }
1624     $self->{state} = DATA_STATE;
1625 wakaba 1.5 $self->{s_kwd} = '';
1626 wakaba 1.1 !!!next-input-character;
1627    
1628     !!!emit ($self->{ct}); # start tag or end tag
1629    
1630     redo A;
1631     } elsif ($self->{nc} == 0x002F) { # /
1632     !!!cp (122);
1633     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1634     !!!next-input-character;
1635     redo A;
1636     } elsif ($self->{nc} == -1) {
1637     !!!parse-error (type => 'unclosed tag');
1638     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1639     !!!cp (122.3);
1640     $self->{last_stag_name} = $self->{ct}->{tag_name};
1641     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1642     if ($self->{ct}->{attributes}) {
1643     !!!cp (122.1);
1644     !!!parse-error (type => 'end tag attribute');
1645     } else {
1646     ## NOTE: This state should never be reached.
1647     !!!cp (122.2);
1648     }
1649     } else {
1650     die "$0: $self->{ct}->{type}: Unknown token type";
1651     }
1652     $self->{state} = DATA_STATE;
1653 wakaba 1.5 $self->{s_kwd} = '';
1654 wakaba 1.1 ## Reconsume.
1655     !!!emit ($self->{ct}); # start tag or end tag
1656     redo A;
1657     } else {
1658     !!!cp ('124.1');
1659     !!!parse-error (type => 'no space between attributes');
1660     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1661     ## reconsume
1662     redo A;
1663     }
1664     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1665 wakaba 1.11 ## XML5: "Empty tag state".
1666    
1667 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1668     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1669     !!!cp ('124.2');
1670     !!!parse-error (type => 'nestc', token => $self->{ct});
1671     ## TODO: Different type than slash in start tag
1672     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1673     if ($self->{ct}->{attributes}) {
1674     !!!cp ('124.4');
1675     !!!parse-error (type => 'end tag attribute');
1676     } else {
1677     !!!cp ('124.5');
1678     }
1679     ## TODO: Test |<title></title/>|
1680     } else {
1681     !!!cp ('124.3');
1682     $self->{self_closing} = 1;
1683     }
1684    
1685     $self->{state} = DATA_STATE;
1686 wakaba 1.5 $self->{s_kwd} = '';
1687 wakaba 1.1 !!!next-input-character;
1688    
1689     !!!emit ($self->{ct}); # start tag or end tag
1690    
1691     redo A;
1692     } elsif ($self->{nc} == -1) {
1693     !!!parse-error (type => 'unclosed tag');
1694     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1695     !!!cp (124.7);
1696     $self->{last_stag_name} = $self->{ct}->{tag_name};
1697     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1698     if ($self->{ct}->{attributes}) {
1699     !!!cp (124.5);
1700     !!!parse-error (type => 'end tag attribute');
1701     } else {
1702     ## NOTE: This state should never be reached.
1703     !!!cp (124.6);
1704     }
1705     } else {
1706     die "$0: $self->{ct}->{type}: Unknown token type";
1707     }
1708 wakaba 1.11 ## XML5: "Tag attribute name before state".
1709 wakaba 1.1 $self->{state} = DATA_STATE;
1710 wakaba 1.5 $self->{s_kwd} = '';
1711 wakaba 1.1 ## Reconsume.
1712     !!!emit ($self->{ct}); # start tag or end tag
1713     redo A;
1714     } else {
1715     !!!cp ('124.4');
1716     !!!parse-error (type => 'nestc');
1717     ## TODO: This error type is wrong.
1718     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1719     ## Reconsume.
1720     redo A;
1721     }
1722     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1723 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1724    
1725 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1726     ## consumes characters on a one-by-one basis.
1727    
1728     if ($self->{nc} == 0x003E) { # >
1729 wakaba 1.13 if ($self->{in_subset}) {
1730     !!!cp (123);
1731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1732     } else {
1733     !!!cp (124);
1734     $self->{state} = DATA_STATE;
1735     $self->{s_kwd} = '';
1736     }
1737 wakaba 1.1 !!!next-input-character;
1738    
1739     !!!emit ($self->{ct}); # comment
1740     redo A;
1741     } elsif ($self->{nc} == -1) {
1742 wakaba 1.13 if ($self->{in_subset}) {
1743     !!!cp (125.1);
1744     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1745     } else {
1746     !!!cp (125);
1747     $self->{state} = DATA_STATE;
1748     $self->{s_kwd} = '';
1749     }
1750 wakaba 1.1 ## reconsume
1751    
1752     !!!emit ($self->{ct}); # comment
1753     redo A;
1754     } else {
1755     !!!cp (126);
1756     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1757     $self->{read_until}->($self->{ct}->{data},
1758     q[>],
1759     length $self->{ct}->{data});
1760    
1761     ## Stay in the state.
1762     !!!next-input-character;
1763     redo A;
1764     }
1765     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1766 wakaba 1.14 ## XML5: "Markup declaration state".
1767 wakaba 1.1
1768     if ($self->{nc} == 0x002D) { # -
1769     !!!cp (133);
1770     $self->{state} = MD_HYPHEN_STATE;
1771     !!!next-input-character;
1772     redo A;
1773     } elsif ($self->{nc} == 0x0044 or # D
1774     $self->{nc} == 0x0064) { # d
1775     ## ASCII case-insensitive.
1776     !!!cp (130);
1777     $self->{state} = MD_DOCTYPE_STATE;
1778 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1779 wakaba 1.1 !!!next-input-character;
1780     redo A;
1781 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1782     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1783     $self->{is_xml}) and
1784 wakaba 1.1 $self->{nc} == 0x005B) { # [
1785     !!!cp (135.4);
1786     $self->{state} = MD_CDATA_STATE;
1787 wakaba 1.12 $self->{kwd} = '[';
1788 wakaba 1.1 !!!next-input-character;
1789     redo A;
1790     } else {
1791     !!!cp (136);
1792     }
1793    
1794     !!!parse-error (type => 'bogus comment',
1795     line => $self->{line_prev},
1796     column => $self->{column_prev} - 1);
1797     ## Reconsume.
1798     $self->{state} = BOGUS_COMMENT_STATE;
1799     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1800     line => $self->{line_prev},
1801     column => $self->{column_prev} - 1,
1802     };
1803     redo A;
1804     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1805     if ($self->{nc} == 0x002D) { # -
1806     !!!cp (127);
1807     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1808     line => $self->{line_prev},
1809     column => $self->{column_prev} - 2,
1810     };
1811 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1812 wakaba 1.1 !!!next-input-character;
1813     redo A;
1814     } else {
1815     !!!cp (128);
1816     !!!parse-error (type => 'bogus comment',
1817     line => $self->{line_prev},
1818     column => $self->{column_prev} - 2);
1819     $self->{state} = BOGUS_COMMENT_STATE;
1820     ## Reconsume.
1821     $self->{ct} = {type => COMMENT_TOKEN,
1822     data => '-',
1823     line => $self->{line_prev},
1824     column => $self->{column_prev} - 2,
1825     };
1826     redo A;
1827     }
1828     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1829     ## ASCII case-insensitive.
1830     if ($self->{nc} == [
1831     undef,
1832     0x004F, # O
1833     0x0043, # C
1834     0x0054, # T
1835     0x0059, # Y
1836     0x0050, # P
1837 wakaba 1.12 ]->[length $self->{kwd}] or
1838 wakaba 1.1 $self->{nc} == [
1839     undef,
1840     0x006F, # o
1841     0x0063, # c
1842     0x0074, # t
1843     0x0079, # y
1844     0x0070, # p
1845 wakaba 1.12 ]->[length $self->{kwd}]) {
1846 wakaba 1.1 !!!cp (131);
1847     ## Stay in the state.
1848 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1849 wakaba 1.1 !!!next-input-character;
1850     redo A;
1851 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1852 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1853     $self->{nc} == 0x0065)) { # e
1854 wakaba 1.12 if ($self->{is_xml} and
1855     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1856 wakaba 1.10 !!!cp (129);
1857     ## XML5: case-sensitive.
1858     !!!parse-error (type => 'lowercase keyword', ## TODO
1859     text => 'DOCTYPE',
1860     line => $self->{line_prev},
1861     column => $self->{column_prev} - 5);
1862     } else {
1863     !!!cp (129.1);
1864     }
1865 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1866     $self->{ct} = {type => DOCTYPE_TOKEN,
1867     quirks => 1,
1868     line => $self->{line_prev},
1869     column => $self->{column_prev} - 7,
1870     };
1871     !!!next-input-character;
1872     redo A;
1873     } else {
1874     !!!cp (132);
1875     !!!parse-error (type => 'bogus comment',
1876     line => $self->{line_prev},
1877 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1878 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1879     ## Reconsume.
1880     $self->{ct} = {type => COMMENT_TOKEN,
1881 wakaba 1.12 data => $self->{kwd},
1882 wakaba 1.1 line => $self->{line_prev},
1883 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1884 wakaba 1.1 };
1885     redo A;
1886     }
1887     } elsif ($self->{state} == MD_CDATA_STATE) {
1888     if ($self->{nc} == {
1889     '[' => 0x0043, # C
1890     '[C' => 0x0044, # D
1891     '[CD' => 0x0041, # A
1892     '[CDA' => 0x0054, # T
1893     '[CDAT' => 0x0041, # A
1894 wakaba 1.12 }->{$self->{kwd}}) {
1895 wakaba 1.1 !!!cp (135.1);
1896     ## Stay in the state.
1897 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1898 wakaba 1.1 !!!next-input-character;
1899     redo A;
1900 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1901 wakaba 1.1 $self->{nc} == 0x005B) { # [
1902 wakaba 1.6 if ($self->{is_xml} and
1903     not $self->{tainted} and
1904     @{$self->{open_elements} or []} == 0) {
1905 wakaba 1.8 !!!cp (135.2);
1906 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1907     line => $self->{line_prev},
1908     column => $self->{column_prev} - 7);
1909     $self->{tainted} = 1;
1910 wakaba 1.8 } else {
1911     !!!cp (135.21);
1912 wakaba 1.6 }
1913    
1914 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1915     data => '',
1916     line => $self->{line_prev},
1917     column => $self->{column_prev} - 7};
1918     $self->{state} = CDATA_SECTION_STATE;
1919     !!!next-input-character;
1920     redo A;
1921     } else {
1922     !!!cp (135.3);
1923     !!!parse-error (type => 'bogus comment',
1924     line => $self->{line_prev},
1925 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1926 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1927     ## Reconsume.
1928     $self->{ct} = {type => COMMENT_TOKEN,
1929 wakaba 1.12 data => $self->{kwd},
1930 wakaba 1.1 line => $self->{line_prev},
1931 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1932 wakaba 1.1 };
1933     redo A;
1934     }
1935     } elsif ($self->{state} == COMMENT_START_STATE) {
1936     if ($self->{nc} == 0x002D) { # -
1937     !!!cp (137);
1938     $self->{state} = COMMENT_START_DASH_STATE;
1939     !!!next-input-character;
1940     redo A;
1941     } elsif ($self->{nc} == 0x003E) { # >
1942     !!!parse-error (type => 'bogus comment');
1943 wakaba 1.13 if ($self->{in_subset}) {
1944     !!!cp (138.1);
1945     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1946     } else {
1947     !!!cp (138);
1948     $self->{state} = DATA_STATE;
1949     $self->{s_kwd} = '';
1950     }
1951 wakaba 1.1 !!!next-input-character;
1952    
1953     !!!emit ($self->{ct}); # comment
1954    
1955     redo A;
1956     } elsif ($self->{nc} == -1) {
1957     !!!parse-error (type => 'unclosed comment');
1958 wakaba 1.13 if ($self->{in_subset}) {
1959     !!!cp (139.1);
1960     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1961     } else {
1962     !!!cp (139);
1963     $self->{state} = DATA_STATE;
1964     $self->{s_kwd} = '';
1965     }
1966 wakaba 1.1 ## reconsume
1967    
1968     !!!emit ($self->{ct}); # comment
1969    
1970     redo A;
1971     } else {
1972     !!!cp (140);
1973     $self->{ct}->{data} # comment
1974     .= chr ($self->{nc});
1975     $self->{state} = COMMENT_STATE;
1976     !!!next-input-character;
1977     redo A;
1978     }
1979     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1980     if ($self->{nc} == 0x002D) { # -
1981     !!!cp (141);
1982     $self->{state} = COMMENT_END_STATE;
1983     !!!next-input-character;
1984     redo A;
1985     } elsif ($self->{nc} == 0x003E) { # >
1986     !!!parse-error (type => 'bogus comment');
1987 wakaba 1.13 if ($self->{in_subset}) {
1988     !!!cp (142.1);
1989     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1990     } else {
1991     !!!cp (142);
1992     $self->{state} = DATA_STATE;
1993     $self->{s_kwd} = '';
1994     }
1995 wakaba 1.1 !!!next-input-character;
1996    
1997     !!!emit ($self->{ct}); # comment
1998    
1999     redo A;
2000     } elsif ($self->{nc} == -1) {
2001     !!!parse-error (type => 'unclosed comment');
2002 wakaba 1.13 if ($self->{in_subset}) {
2003     !!!cp (143.1);
2004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005     } else {
2006     !!!cp (143);
2007     $self->{state} = DATA_STATE;
2008     $self->{s_kwd} = '';
2009     }
2010 wakaba 1.1 ## reconsume
2011    
2012     !!!emit ($self->{ct}); # comment
2013    
2014     redo A;
2015     } else {
2016     !!!cp (144);
2017     $self->{ct}->{data} # comment
2018     .= '-' . chr ($self->{nc});
2019     $self->{state} = COMMENT_STATE;
2020     !!!next-input-character;
2021     redo A;
2022     }
2023     } elsif ($self->{state} == COMMENT_STATE) {
2024 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2025    
2026 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2027     !!!cp (145);
2028     $self->{state} = COMMENT_END_DASH_STATE;
2029     !!!next-input-character;
2030     redo A;
2031     } elsif ($self->{nc} == -1) {
2032     !!!parse-error (type => 'unclosed comment');
2033 wakaba 1.13 if ($self->{in_subset}) {
2034     !!!cp (146.1);
2035     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2036     } else {
2037     !!!cp (146);
2038     $self->{state} = DATA_STATE;
2039     $self->{s_kwd} = '';
2040     }
2041 wakaba 1.1 ## reconsume
2042    
2043     !!!emit ($self->{ct}); # comment
2044    
2045     redo A;
2046     } else {
2047     !!!cp (147);
2048     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2049     $self->{read_until}->($self->{ct}->{data},
2050     q[-],
2051     length $self->{ct}->{data});
2052    
2053     ## Stay in the state
2054     !!!next-input-character;
2055     redo A;
2056     }
2057     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2058 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2059 wakaba 1.10
2060 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2061     !!!cp (148);
2062     $self->{state} = COMMENT_END_STATE;
2063     !!!next-input-character;
2064     redo A;
2065     } elsif ($self->{nc} == -1) {
2066     !!!parse-error (type => 'unclosed comment');
2067 wakaba 1.13 if ($self->{in_subset}) {
2068     !!!cp (149.1);
2069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2070     } else {
2071     !!!cp (149);
2072     $self->{state} = DATA_STATE;
2073     $self->{s_kwd} = '';
2074     }
2075 wakaba 1.1 ## reconsume
2076    
2077     !!!emit ($self->{ct}); # comment
2078    
2079     redo A;
2080     } else {
2081     !!!cp (150);
2082     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2083     $self->{state} = COMMENT_STATE;
2084     !!!next-input-character;
2085     redo A;
2086     }
2087 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2088     $self->{state} == COMMENT_END_BANG_STATE) {
2089 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2090 wakaba 1.31 ## (No comment end bang state.)
2091 wakaba 1.14
2092 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2093 wakaba 1.13 if ($self->{in_subset}) {
2094     !!!cp (151.1);
2095     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2096     } else {
2097     !!!cp (151);
2098     $self->{state} = DATA_STATE;
2099     $self->{s_kwd} = '';
2100     }
2101 wakaba 1.1 !!!next-input-character;
2102    
2103     !!!emit ($self->{ct}); # comment
2104    
2105     redo A;
2106     } elsif ($self->{nc} == 0x002D) { # -
2107 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2108     !!!cp (154.3);
2109     $self->{ct}->{data} .= '--!'; # comment
2110     $self->{state} = COMMENT_END_DASH_STATE;
2111     } else {
2112     !!!cp (152);
2113     ## XML5: Not a parse error.
2114     !!!parse-error (type => 'dash in comment',
2115     line => $self->{line_prev},
2116     column => $self->{column_prev});
2117     $self->{ct}->{data} .= '-'; # comment
2118     ## Stay in the state
2119     }
2120     !!!next-input-character;
2121     redo A;
2122 wakaba 1.32 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2123     $is_space->{$self->{nc}}) {
2124     !!!cp (152.1);
2125     !!!parse-error (type => 'comment end space'); # XXX error type
2126     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2127     $self->{state} = COMMENT_END_SPACE_STATE;
2128     !!!next-input-character;
2129     redo A;
2130     } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2131     $self->{nc} == 0x0021) { # !
2132     !!!cp (152.2);
2133 wakaba 1.31 !!!parse-error (type => 'comment end bang'); # XXX error type
2134     $self->{state} = COMMENT_END_BANG_STATE;
2135 wakaba 1.1 !!!next-input-character;
2136     redo A;
2137     } elsif ($self->{nc} == -1) {
2138     !!!parse-error (type => 'unclosed comment');
2139 wakaba 1.13 if ($self->{in_subset}) {
2140     !!!cp (153.1);
2141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2142     } else {
2143     !!!cp (153);
2144     $self->{state} = DATA_STATE;
2145     $self->{s_kwd} = '';
2146     }
2147 wakaba 1.31 ## Reconsume.
2148 wakaba 1.1
2149     !!!emit ($self->{ct}); # comment
2150    
2151     redo A;
2152     } else {
2153     !!!cp (154);
2154 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2155     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2156     } else {
2157     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2158     }
2159 wakaba 1.1 $self->{state} = COMMENT_STATE;
2160     !!!next-input-character;
2161     redo A;
2162     }
2163 wakaba 1.32 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2164     ## XML5: Not exist.
2165    
2166     if ($self->{nc} == 0x003E) { # >
2167     if ($self->{in_subset}) {
2168     !!!cp (154.4);
2169     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2170     } else {
2171     !!!cp (154.5);
2172     $self->{state} = DATA_STATE;
2173     $self->{s_kwd} = '';
2174     }
2175     !!!next-input-character;
2176    
2177     !!!emit ($self->{ct}); # comment
2178    
2179     redo A;
2180     } elsif ($is_space->{$self->{nc}}) {
2181     !!!cp (154.6);
2182     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2183     ## Stay in the state.
2184     !!!next-input-character;
2185     redo A;
2186     } elsif ($self->{nc} == -1) {
2187     !!!parse-error (type => 'unclosed comment');
2188     if ($self->{in_subset}) {
2189     !!!cp (154.7);
2190     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2191     } else {
2192     !!!cp (154.8);
2193     $self->{state} = DATA_STATE;
2194     $self->{s_kwd} = '';
2195     }
2196     ## Reconsume.
2197    
2198     !!!emit ($self->{ct}); # comment
2199    
2200     redo A;
2201     } else {
2202     !!!cp (154.9);
2203     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2204     $self->{state} = COMMENT_STATE;
2205     !!!next-input-character;
2206     redo A;
2207     }
2208 wakaba 1.1 } elsif ($self->{state} == DOCTYPE_STATE) {
2209     if ($is_space->{$self->{nc}}) {
2210     !!!cp (155);
2211     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2212     !!!next-input-character;
2213     redo A;
2214 wakaba 1.28 } elsif ($self->{nc} == -1) {
2215     !!!cp (155.1);
2216     !!!parse-error (type => 'unclosed DOCTYPE');
2217     $self->{ct}->{quirks} = 1;
2218    
2219     $self->{state} = DATA_STATE;
2220     ## Reconsume.
2221     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2222    
2223     redo A;
2224 wakaba 1.1 } else {
2225     !!!cp (156);
2226 wakaba 1.28 ## XML5: Switch to the bogus comment state.
2227 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2228     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2229     ## reconsume
2230     redo A;
2231     }
2232     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2233 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2234    
2235 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2236     !!!cp (157);
2237     ## Stay in the state
2238     !!!next-input-character;
2239     redo A;
2240     } elsif ($self->{nc} == 0x003E) { # >
2241     !!!cp (158);
2242 wakaba 1.12 ## XML5: No parse error.
2243 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2244     $self->{state} = DATA_STATE;
2245 wakaba 1.5 $self->{s_kwd} = '';
2246 wakaba 1.1 !!!next-input-character;
2247    
2248     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2249    
2250     redo A;
2251 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2252     !!!cp (158.1);
2253     $self->{ct}->{name} # DOCTYPE
2254     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2255     delete $self->{ct}->{quirks};
2256     $self->{state} = DOCTYPE_NAME_STATE;
2257     !!!next-input-character;
2258     redo A;
2259 wakaba 1.1 } elsif ($self->{nc} == -1) {
2260     !!!cp (159);
2261     !!!parse-error (type => 'no DOCTYPE name');
2262     $self->{state} = DATA_STATE;
2263 wakaba 1.5 $self->{s_kwd} = '';
2264 wakaba 1.1 ## reconsume
2265    
2266     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2267    
2268     redo A;
2269 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2270     !!!cp (159.1);
2271     !!!parse-error (type => 'no DOCTYPE name');
2272     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2273 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2274     $self->{in_subset} = 1;
2275 wakaba 1.12 !!!next-input-character;
2276 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2277 wakaba 1.12 redo A;
2278 wakaba 1.1 } else {
2279     !!!cp (160);
2280     $self->{ct}->{name} = chr $self->{nc};
2281     delete $self->{ct}->{quirks};
2282     $self->{state} = DOCTYPE_NAME_STATE;
2283     !!!next-input-character;
2284     redo A;
2285     }
2286     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2287 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2288    
2289     ## ISSUE: Redundant "First," in the spec.
2290    
2291 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2292     !!!cp (161);
2293     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2294     !!!next-input-character;
2295     redo A;
2296     } elsif ($self->{nc} == 0x003E) { # >
2297     !!!cp (162);
2298     $self->{state} = DATA_STATE;
2299 wakaba 1.5 $self->{s_kwd} = '';
2300 wakaba 1.1 !!!next-input-character;
2301    
2302     !!!emit ($self->{ct}); # DOCTYPE
2303    
2304     redo A;
2305 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2306     !!!cp (162.1);
2307     $self->{ct}->{name} # DOCTYPE
2308     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2309     delete $self->{ct}->{quirks};
2310     ## Stay in the state.
2311     !!!next-input-character;
2312     redo A;
2313 wakaba 1.1 } elsif ($self->{nc} == -1) {
2314     !!!cp (163);
2315     !!!parse-error (type => 'unclosed DOCTYPE');
2316     $self->{state} = DATA_STATE;
2317 wakaba 1.5 $self->{s_kwd} = '';
2318 wakaba 1.1 ## reconsume
2319    
2320     $self->{ct}->{quirks} = 1;
2321     !!!emit ($self->{ct}); # DOCTYPE
2322    
2323     redo A;
2324 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2325     !!!cp (163.1);
2326     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2327 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2328     $self->{in_subset} = 1;
2329 wakaba 1.12 !!!next-input-character;
2330 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2331 wakaba 1.12 redo A;
2332 wakaba 1.1 } else {
2333     !!!cp (164);
2334 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2335     ## Stay in the state.
2336 wakaba 1.1 !!!next-input-character;
2337     redo A;
2338     }
2339     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2340 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2341     ## state", but implemented differently.
2342    
2343 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2344     !!!cp (165);
2345     ## Stay in the state
2346     !!!next-input-character;
2347     redo A;
2348     } elsif ($self->{nc} == 0x003E) { # >
2349 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2350     !!!cp (166);
2351     $self->{state} = DATA_STATE;
2352     $self->{s_kwd} = '';
2353     } else {
2354     !!!cp (166.1);
2355     !!!parse-error (type => 'no md def'); ## TODO: type
2356     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2357     }
2358    
2359 wakaba 1.1 !!!next-input-character;
2360 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2361 wakaba 1.1 redo A;
2362     } elsif ($self->{nc} == -1) {
2363 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2364     !!!cp (167);
2365     !!!parse-error (type => 'unclosed DOCTYPE');
2366     $self->{state} = DATA_STATE;
2367     $self->{s_kwd} = '';
2368     $self->{ct}->{quirks} = 1;
2369     } else {
2370     !!!cp (167.12);
2371     !!!parse-error (type => 'unclosed md'); ## TODO: type
2372     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2373     }
2374    
2375     ## Reconsume.
2376     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2377 wakaba 1.1 redo A;
2378     } elsif ($self->{nc} == 0x0050 or # P
2379     $self->{nc} == 0x0070) { # p
2380 wakaba 1.12 !!!cp (167.1);
2381 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2382 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2383 wakaba 1.1 !!!next-input-character;
2384     redo A;
2385     } elsif ($self->{nc} == 0x0053 or # S
2386     $self->{nc} == 0x0073) { # s
2387 wakaba 1.12 !!!cp (167.2);
2388 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2389 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2390     !!!next-input-character;
2391     redo A;
2392 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2393     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2394     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2395     !!!cp (167.21);
2396     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2397     $self->{ct}->{value} = ''; # ENTITY
2398     !!!next-input-character;
2399     redo A;
2400     } elsif ($self->{nc} == 0x0027 and # '
2401     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2402     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2403     !!!cp (167.22);
2404     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2405     $self->{ct}->{value} = ''; # ENTITY
2406     !!!next-input-character;
2407     redo A;
2408 wakaba 1.16 } elsif ($self->{is_xml} and
2409     $self->{ct}->{type} == DOCTYPE_TOKEN and
2410     $self->{nc} == 0x005B) { # [
2411 wakaba 1.12 !!!cp (167.3);
2412     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2413     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2414 wakaba 1.13 $self->{in_subset} = 1;
2415 wakaba 1.1 !!!next-input-character;
2416 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2417 wakaba 1.1 redo A;
2418     } else {
2419 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2420    
2421     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2422     !!!cp (180);
2423     $self->{ct}->{quirks} = 1;
2424     $self->{state} = BOGUS_DOCTYPE_STATE;
2425     } else {
2426     !!!cp (180.1);
2427     $self->{state} = BOGUS_MD_STATE;
2428     }
2429 wakaba 1.1
2430     !!!next-input-character;
2431     redo A;
2432     }
2433     } elsif ($self->{state} == PUBLIC_STATE) {
2434     ## ASCII case-insensitive
2435     if ($self->{nc} == [
2436     undef,
2437     0x0055, # U
2438     0x0042, # B
2439     0x004C, # L
2440     0x0049, # I
2441 wakaba 1.12 ]->[length $self->{kwd}] or
2442 wakaba 1.1 $self->{nc} == [
2443     undef,
2444     0x0075, # u
2445     0x0062, # b
2446     0x006C, # l
2447     0x0069, # i
2448 wakaba 1.12 ]->[length $self->{kwd}]) {
2449 wakaba 1.1 !!!cp (175);
2450     ## Stay in the state.
2451 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2452 wakaba 1.1 !!!next-input-character;
2453     redo A;
2454 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2455 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2456     $self->{nc} == 0x0063)) { # c
2457 wakaba 1.12 if ($self->{is_xml} and
2458     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2459     !!!cp (168.1);
2460     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2461     text => 'PUBLIC',
2462     line => $self->{line_prev},
2463     column => $self->{column_prev} - 4);
2464     } else {
2465     !!!cp (168);
2466     }
2467 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2468     !!!next-input-character;
2469     redo A;
2470     } else {
2471 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2472 wakaba 1.1 line => $self->{line_prev},
2473 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2474 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2475     !!!cp (169);
2476     $self->{ct}->{quirks} = 1;
2477     $self->{state} = BOGUS_DOCTYPE_STATE;
2478     } else {
2479     !!!cp (169.1);
2480     $self->{state} = BOGUS_MD_STATE;
2481     }
2482 wakaba 1.1 ## Reconsume.
2483     redo A;
2484     }
2485     } elsif ($self->{state} == SYSTEM_STATE) {
2486     ## ASCII case-insensitive
2487     if ($self->{nc} == [
2488     undef,
2489     0x0059, # Y
2490     0x0053, # S
2491     0x0054, # T
2492     0x0045, # E
2493 wakaba 1.12 ]->[length $self->{kwd}] or
2494 wakaba 1.1 $self->{nc} == [
2495     undef,
2496     0x0079, # y
2497     0x0073, # s
2498     0x0074, # t
2499     0x0065, # e
2500 wakaba 1.12 ]->[length $self->{kwd}]) {
2501 wakaba 1.1 !!!cp (170);
2502     ## Stay in the state.
2503 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2504 wakaba 1.1 !!!next-input-character;
2505     redo A;
2506 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2507 wakaba 1.1 ($self->{nc} == 0x004D or # M
2508     $self->{nc} == 0x006D)) { # m
2509 wakaba 1.12 if ($self->{is_xml} and
2510     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2511     !!!cp (171.1);
2512     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2513     text => 'SYSTEM',
2514     line => $self->{line_prev},
2515     column => $self->{column_prev} - 4);
2516     } else {
2517     !!!cp (171);
2518     }
2519 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2520     !!!next-input-character;
2521     redo A;
2522     } else {
2523 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2524 wakaba 1.1 line => $self->{line_prev},
2525 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2526 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2527     !!!cp (172);
2528     $self->{ct}->{quirks} = 1;
2529     $self->{state} = BOGUS_DOCTYPE_STATE;
2530     } else {
2531     !!!cp (172.1);
2532     $self->{state} = BOGUS_MD_STATE;
2533     }
2534 wakaba 1.1 ## Reconsume.
2535     redo A;
2536     }
2537     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2538     if ($is_space->{$self->{nc}}) {
2539     !!!cp (181);
2540     ## Stay in the state
2541     !!!next-input-character;
2542     redo A;
2543     } elsif ($self->{nc} eq 0x0022) { # "
2544     !!!cp (182);
2545     $self->{ct}->{pubid} = ''; # DOCTYPE
2546     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2547     !!!next-input-character;
2548     redo A;
2549     } elsif ($self->{nc} eq 0x0027) { # '
2550     !!!cp (183);
2551     $self->{ct}->{pubid} = ''; # DOCTYPE
2552     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2553     !!!next-input-character;
2554     redo A;
2555     } elsif ($self->{nc} eq 0x003E) { # >
2556     !!!parse-error (type => 'no PUBLIC literal');
2557 wakaba 1.16
2558     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2559     !!!cp (184);
2560     $self->{state} = DATA_STATE;
2561     $self->{s_kwd} = '';
2562     $self->{ct}->{quirks} = 1;
2563     } else {
2564     !!!cp (184.1);
2565     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2566     }
2567    
2568 wakaba 1.1 !!!next-input-character;
2569 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2570 wakaba 1.1 redo A;
2571     } elsif ($self->{nc} == -1) {
2572 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2573     !!!cp (185);
2574     !!!parse-error (type => 'unclosed DOCTYPE');
2575     $self->{state} = DATA_STATE;
2576     $self->{s_kwd} = '';
2577     $self->{ct}->{quirks} = 1;
2578     } else {
2579     !!!cp (185.1);
2580     !!!parse-error (type => 'unclosed md'); ## TODO: type
2581     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2582     }
2583    
2584 wakaba 1.1 ## reconsume
2585     !!!emit ($self->{ct}); # DOCTYPE
2586     redo A;
2587 wakaba 1.16 } elsif ($self->{is_xml} and
2588     $self->{ct}->{type} == DOCTYPE_TOKEN and
2589     $self->{nc} == 0x005B) { # [
2590 wakaba 1.12 !!!cp (186.1);
2591     !!!parse-error (type => 'no PUBLIC literal');
2592     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2593     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2594 wakaba 1.13 $self->{in_subset} = 1;
2595 wakaba 1.12 !!!next-input-character;
2596 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2597 wakaba 1.12 redo A;
2598 wakaba 1.1 } else {
2599     !!!parse-error (type => 'string after PUBLIC');
2600    
2601 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2602     !!!cp (186);
2603     $self->{ct}->{quirks} = 1;
2604     $self->{state} = BOGUS_DOCTYPE_STATE;
2605     } else {
2606     !!!cp (186.2);
2607     $self->{state} = BOGUS_MD_STATE;
2608     }
2609    
2610 wakaba 1.1 !!!next-input-character;
2611     redo A;
2612     }
2613     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2614     if ($self->{nc} == 0x0022) { # "
2615     !!!cp (187);
2616     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2617     !!!next-input-character;
2618     redo A;
2619     } elsif ($self->{nc} == 0x003E) { # >
2620     !!!parse-error (type => 'unclosed PUBLIC literal');
2621    
2622 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2623     !!!cp (188);
2624     $self->{state} = DATA_STATE;
2625     $self->{s_kwd} = '';
2626     $self->{ct}->{quirks} = 1;
2627     } else {
2628     !!!cp (188.1);
2629     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2630     }
2631    
2632 wakaba 1.1 !!!next-input-character;
2633 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2634 wakaba 1.1 redo A;
2635     } elsif ($self->{nc} == -1) {
2636     !!!parse-error (type => 'unclosed PUBLIC literal');
2637    
2638 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2639     !!!cp (189);
2640     $self->{state} = DATA_STATE;
2641     $self->{s_kwd} = '';
2642     $self->{ct}->{quirks} = 1;
2643     } else {
2644     !!!cp (189.1);
2645     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2646     }
2647    
2648     ## Reconsume.
2649 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2650     redo A;
2651     } else {
2652     !!!cp (190);
2653 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2654 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2655     length $self->{ct}->{pubid});
2656    
2657     ## Stay in the state
2658     !!!next-input-character;
2659     redo A;
2660     }
2661     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2662     if ($self->{nc} == 0x0027) { # '
2663     !!!cp (191);
2664     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2665     !!!next-input-character;
2666     redo A;
2667     } elsif ($self->{nc} == 0x003E) { # >
2668     !!!parse-error (type => 'unclosed PUBLIC literal');
2669    
2670 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2671     !!!cp (192);
2672     $self->{state} = DATA_STATE;
2673     $self->{s_kwd} = '';
2674     $self->{ct}->{quirks} = 1;
2675     } else {
2676     !!!cp (192.1);
2677     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678     }
2679    
2680 wakaba 1.1 !!!next-input-character;
2681 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2682 wakaba 1.1 redo A;
2683     } elsif ($self->{nc} == -1) {
2684     !!!parse-error (type => 'unclosed PUBLIC literal');
2685    
2686 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2687     !!!cp (193);
2688     $self->{state} = DATA_STATE;
2689     $self->{s_kwd} = '';
2690     $self->{ct}->{quirks} = 1;
2691     } else {
2692     !!!cp (193.1);
2693     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2694     }
2695    
2696 wakaba 1.1 ## reconsume
2697 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2698 wakaba 1.1 redo A;
2699     } else {
2700     !!!cp (194);
2701 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2702 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2703     length $self->{ct}->{pubid});
2704    
2705     ## Stay in the state
2706     !!!next-input-character;
2707     redo A;
2708     }
2709     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2710     if ($is_space->{$self->{nc}}) {
2711     !!!cp (195);
2712     ## Stay in the state
2713     !!!next-input-character;
2714     redo A;
2715     } elsif ($self->{nc} == 0x0022) { # "
2716     !!!cp (196);
2717 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2718 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2719     !!!next-input-character;
2720     redo A;
2721     } elsif ($self->{nc} == 0x0027) { # '
2722     !!!cp (197);
2723 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2724 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2725     !!!next-input-character;
2726     redo A;
2727     } elsif ($self->{nc} == 0x003E) { # >
2728 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2729     if ($self->{is_xml}) {
2730     !!!cp (198.1);
2731     !!!parse-error (type => 'no SYSTEM literal');
2732     } else {
2733     !!!cp (198);
2734     }
2735     $self->{state} = DATA_STATE;
2736     $self->{s_kwd} = '';
2737 wakaba 1.12 } else {
2738 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2739     !!!cp (198.2);
2740     } else {
2741     !!!cp (198.3);
2742     !!!parse-error (type => 'no SYSTEM literal');
2743     }
2744     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2745 wakaba 1.12 }
2746 wakaba 1.16
2747 wakaba 1.1 !!!next-input-character;
2748 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2749 wakaba 1.1 redo A;
2750     } elsif ($self->{nc} == -1) {
2751 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2752     !!!cp (199);
2753     !!!parse-error (type => 'unclosed DOCTYPE');
2754    
2755     $self->{state} = DATA_STATE;
2756     $self->{s_kwd} = '';
2757     $self->{ct}->{quirks} = 1;
2758     } else {
2759     !!!parse-error (type => 'unclosed md'); ## TODO: type
2760     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2761     }
2762    
2763 wakaba 1.1 ## reconsume
2764 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2765 wakaba 1.1 redo A;
2766 wakaba 1.16 } elsif ($self->{is_xml} and
2767     $self->{ct}->{type} == DOCTYPE_TOKEN and
2768     $self->{nc} == 0x005B) { # [
2769 wakaba 1.12 !!!cp (200.1);
2770     !!!parse-error (type => 'no SYSTEM literal');
2771     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2772     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2773 wakaba 1.13 $self->{in_subset} = 1;
2774 wakaba 1.12 !!!next-input-character;
2775 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2776 wakaba 1.12 redo A;
2777 wakaba 1.1 } else {
2778     !!!parse-error (type => 'string after PUBLIC literal');
2779    
2780 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2781     !!!cp (200);
2782     $self->{ct}->{quirks} = 1;
2783     $self->{state} = BOGUS_DOCTYPE_STATE;
2784     } else {
2785     !!!cp (200.2);
2786     $self->{state} = BOGUS_MD_STATE;
2787     }
2788    
2789 wakaba 1.1 !!!next-input-character;
2790     redo A;
2791     }
2792     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2793     if ($is_space->{$self->{nc}}) {
2794     !!!cp (201);
2795     ## Stay in the state
2796     !!!next-input-character;
2797     redo A;
2798     } elsif ($self->{nc} == 0x0022) { # "
2799     !!!cp (202);
2800     $self->{ct}->{sysid} = ''; # DOCTYPE
2801     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2802     !!!next-input-character;
2803     redo A;
2804     } elsif ($self->{nc} == 0x0027) { # '
2805     !!!cp (203);
2806     $self->{ct}->{sysid} = ''; # DOCTYPE
2807     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2808     !!!next-input-character;
2809     redo A;
2810     } elsif ($self->{nc} == 0x003E) { # >
2811     !!!parse-error (type => 'no SYSTEM literal');
2812     !!!next-input-character;
2813    
2814 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815     !!!cp (204);
2816     $self->{state} = DATA_STATE;
2817     $self->{s_kwd} = '';
2818     $self->{ct}->{quirks} = 1;
2819     } else {
2820     !!!cp (204.1);
2821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822     }
2823 wakaba 1.1
2824 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2825 wakaba 1.1 redo A;
2826     } elsif ($self->{nc} == -1) {
2827 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2828     !!!cp (205);
2829     !!!parse-error (type => 'unclosed DOCTYPE');
2830     $self->{state} = DATA_STATE;
2831     $self->{s_kwd} = '';
2832     $self->{ct}->{quirks} = 1;
2833     } else {
2834     !!!cp (205.1);
2835     !!!parse-error (type => 'unclosed md'); ## TODO: type
2836     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2837     }
2838    
2839 wakaba 1.1 ## reconsume
2840 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2841 wakaba 1.1 redo A;
2842 wakaba 1.16 } elsif ($self->{is_xml} and
2843     $self->{ct}->{type} == DOCTYPE_TOKEN and
2844     $self->{nc} == 0x005B) { # [
2845 wakaba 1.12 !!!cp (206.1);
2846     !!!parse-error (type => 'no SYSTEM literal');
2847    
2848     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2849     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2850 wakaba 1.13 $self->{in_subset} = 1;
2851 wakaba 1.12 !!!next-input-character;
2852 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2853 wakaba 1.12 redo A;
2854 wakaba 1.1 } else {
2855     !!!parse-error (type => 'string after SYSTEM');
2856    
2857 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2858     !!!cp (206);
2859     $self->{ct}->{quirks} = 1;
2860     $self->{state} = BOGUS_DOCTYPE_STATE;
2861     } else {
2862     !!!cp (206.2);
2863     $self->{state} = BOGUS_MD_STATE;
2864     }
2865    
2866 wakaba 1.1 !!!next-input-character;
2867     redo A;
2868     }
2869     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2870     if ($self->{nc} == 0x0022) { # "
2871     !!!cp (207);
2872     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2873     !!!next-input-character;
2874     redo A;
2875 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2876 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2877    
2878 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2879     !!!cp (208);
2880     $self->{state} = DATA_STATE;
2881     $self->{s_kwd} = '';
2882     $self->{ct}->{quirks} = 1;
2883     } else {
2884     !!!cp (208.1);
2885     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2886     }
2887    
2888 wakaba 1.1 !!!next-input-character;
2889 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2890 wakaba 1.1 redo A;
2891     } elsif ($self->{nc} == -1) {
2892     !!!parse-error (type => 'unclosed SYSTEM literal');
2893    
2894 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2895     !!!cp (209);
2896     $self->{state} = DATA_STATE;
2897     $self->{s_kwd} = '';
2898     $self->{ct}->{quirks} = 1;
2899     } else {
2900     !!!cp (209.1);
2901     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902     }
2903    
2904 wakaba 1.1 ## reconsume
2905 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2906 wakaba 1.1 redo A;
2907     } else {
2908     !!!cp (210);
2909 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2910 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2911     length $self->{ct}->{sysid});
2912    
2913     ## Stay in the state
2914     !!!next-input-character;
2915     redo A;
2916     }
2917     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2918     if ($self->{nc} == 0x0027) { # '
2919     !!!cp (211);
2920     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2921     !!!next-input-character;
2922     redo A;
2923 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2924 wakaba 1.1 !!!cp (212);
2925     !!!parse-error (type => 'unclosed SYSTEM literal');
2926    
2927     $self->{state} = DATA_STATE;
2928 wakaba 1.5 $self->{s_kwd} = '';
2929 wakaba 1.1 !!!next-input-character;
2930    
2931     $self->{ct}->{quirks} = 1;
2932     !!!emit ($self->{ct}); # DOCTYPE
2933    
2934     redo A;
2935     } elsif ($self->{nc} == -1) {
2936     !!!parse-error (type => 'unclosed SYSTEM literal');
2937    
2938 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2939     !!!cp (213);
2940     $self->{state} = DATA_STATE;
2941     $self->{s_kwd} = '';
2942     $self->{ct}->{quirks} = 1;
2943     } else {
2944     !!!cp (213.1);
2945     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2946     }
2947    
2948 wakaba 1.1 ## reconsume
2949 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2950 wakaba 1.1 redo A;
2951     } else {
2952     !!!cp (214);
2953 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2954 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2955     length $self->{ct}->{sysid});
2956    
2957     ## Stay in the state
2958     !!!next-input-character;
2959     redo A;
2960     }
2961     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2962     if ($is_space->{$self->{nc}}) {
2963 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2964     !!!cp (215.1);
2965     $self->{state} = BEFORE_NDATA_STATE;
2966     } else {
2967     !!!cp (215);
2968     ## Stay in the state
2969     }
2970 wakaba 1.1 !!!next-input-character;
2971     redo A;
2972     } elsif ($self->{nc} == 0x003E) { # >
2973 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2974     !!!cp (216);
2975     $self->{state} = DATA_STATE;
2976     $self->{s_kwd} = '';
2977     } else {
2978     !!!cp (216.1);
2979     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2980     }
2981    
2982 wakaba 1.1 !!!next-input-character;
2983 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2984 wakaba 1.1 redo A;
2985 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2986     ($self->{nc} == 0x004E or # N
2987     $self->{nc} == 0x006E)) { # n
2988     !!!cp (216.2);
2989     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2990     $self->{state} = NDATA_STATE;
2991     $self->{kwd} = chr $self->{nc};
2992     !!!next-input-character;
2993     redo A;
2994 wakaba 1.1 } elsif ($self->{nc} == -1) {
2995 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2996     !!!cp (217);
2997     !!!parse-error (type => 'unclosed DOCTYPE');
2998     $self->{state} = DATA_STATE;
2999     $self->{s_kwd} = '';
3000     $self->{ct}->{quirks} = 1;
3001     } else {
3002     !!!cp (217.1);
3003     !!!parse-error (type => 'unclosed md'); ## TODO: type
3004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3005     }
3006    
3007 wakaba 1.1 ## reconsume
3008 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3009 wakaba 1.1 redo A;
3010 wakaba 1.16 } elsif ($self->{is_xml} and
3011     $self->{ct}->{type} == DOCTYPE_TOKEN and
3012     $self->{nc} == 0x005B) { # [
3013 wakaba 1.12 !!!cp (218.1);
3014     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3015     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3016 wakaba 1.13 $self->{in_subset} = 1;
3017 wakaba 1.12 !!!next-input-character;
3018 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
3019 wakaba 1.12 redo A;
3020 wakaba 1.1 } else {
3021     !!!parse-error (type => 'string after SYSTEM literal');
3022    
3023 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3024     !!!cp (218);
3025     #$self->{ct}->{quirks} = 1;
3026     $self->{state} = BOGUS_DOCTYPE_STATE;
3027     } else {
3028     !!!cp (218.2);
3029     $self->{state} = BOGUS_MD_STATE;
3030     }
3031    
3032 wakaba 1.1 !!!next-input-character;
3033     redo A;
3034     }
3035 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
3036     if ($is_space->{$self->{nc}}) {
3037     !!!cp (218.3);
3038     ## Stay in the state.
3039     !!!next-input-character;
3040     redo A;
3041     } elsif ($self->{nc} == 0x003E) { # >
3042     !!!cp (218.4);
3043     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3044     !!!next-input-character;
3045     !!!emit ($self->{ct}); # ENTITY
3046     redo A;
3047     } elsif ($self->{nc} == 0x004E or # N
3048     $self->{nc} == 0x006E) { # n
3049     !!!cp (218.5);
3050     $self->{state} = NDATA_STATE;
3051     $self->{kwd} = chr $self->{nc};
3052     !!!next-input-character;
3053     redo A;
3054     } elsif ($self->{nc} == -1) {
3055     !!!cp (218.6);
3056     !!!parse-error (type => 'unclosed md'); ## TODO: type
3057     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3058     ## reconsume
3059     !!!emit ($self->{ct}); # ENTITY
3060     redo A;
3061     } else {
3062     !!!cp (218.7);
3063     !!!parse-error (type => 'string after SYSTEM literal');
3064     $self->{state} = BOGUS_MD_STATE;
3065     !!!next-input-character;
3066     redo A;
3067     }
3068 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3069     if ($self->{nc} == 0x003E) { # >
3070     !!!cp (219);
3071     $self->{state} = DATA_STATE;
3072 wakaba 1.5 $self->{s_kwd} = '';
3073 wakaba 1.1 !!!next-input-character;
3074    
3075     !!!emit ($self->{ct}); # DOCTYPE
3076    
3077     redo A;
3078 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3079 wakaba 1.13 !!!cp (220.1);
3080     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3081     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3082     $self->{in_subset} = 1;
3083     !!!next-input-character;
3084     !!!emit ($self->{ct}); # DOCTYPE
3085     redo A;
3086 wakaba 1.1 } elsif ($self->{nc} == -1) {
3087     !!!cp (220);
3088     $self->{state} = DATA_STATE;
3089 wakaba 1.5 $self->{s_kwd} = '';
3090 wakaba 1.1 ## reconsume
3091    
3092     !!!emit ($self->{ct}); # DOCTYPE
3093    
3094     redo A;
3095     } else {
3096     !!!cp (221);
3097     my $s = '';
3098 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3099 wakaba 1.1
3100     ## Stay in the state
3101     !!!next-input-character;
3102     redo A;
3103     }
3104     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3105     ## NOTE: "CDATA section state" in the spec is jointly implemented
3106     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3107     ## and |CDATA_SECTION_MSE2_STATE|.
3108 wakaba 1.10
3109     ## XML5: "CDATA state".
3110 wakaba 1.1
3111     if ($self->{nc} == 0x005D) { # ]
3112     !!!cp (221.1);
3113     $self->{state} = CDATA_SECTION_MSE1_STATE;
3114     !!!next-input-character;
3115     redo A;
3116     } elsif ($self->{nc} == -1) {
3117 wakaba 1.6 if ($self->{is_xml}) {
3118 wakaba 1.8 !!!cp (221.11);
3119 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3120 wakaba 1.8 } else {
3121     !!!cp (221.12);
3122 wakaba 1.6 }
3123    
3124 wakaba 1.1 $self->{state} = DATA_STATE;
3125 wakaba 1.5 $self->{s_kwd} = '';
3126 wakaba 1.10 ## Reconsume.
3127 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3128     !!!cp (221.2);
3129     !!!emit ($self->{ct}); # character
3130     } else {
3131     !!!cp (221.3);
3132     ## No token to emit. $self->{ct} is discarded.
3133     }
3134     redo A;
3135     } else {
3136     !!!cp (221.4);
3137     $self->{ct}->{data} .= chr $self->{nc};
3138     $self->{read_until}->($self->{ct}->{data},
3139     q<]>,
3140     length $self->{ct}->{data});
3141    
3142     ## Stay in the state.
3143     !!!next-input-character;
3144     redo A;
3145     }
3146    
3147     ## ISSUE: "text tokens" in spec.
3148     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3149 wakaba 1.10 ## XML5: "CDATA bracket state".
3150    
3151 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3152     !!!cp (221.5);
3153     $self->{state} = CDATA_SECTION_MSE2_STATE;
3154     !!!next-input-character;
3155     redo A;
3156     } else {
3157     !!!cp (221.6);
3158 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3159 wakaba 1.1 $self->{ct}->{data} .= ']';
3160 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3161 wakaba 1.1 ## Reconsume.
3162     redo A;
3163     }
3164     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3165 wakaba 1.10 ## XML5: "CDATA end state".
3166    
3167 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3168     $self->{state} = DATA_STATE;
3169 wakaba 1.5 $self->{s_kwd} = '';
3170 wakaba 1.1 !!!next-input-character;
3171     if (length $self->{ct}->{data}) { # character
3172     !!!cp (221.7);
3173     !!!emit ($self->{ct}); # character
3174     } else {
3175     !!!cp (221.8);
3176     ## No token to emit. $self->{ct} is discarded.
3177     }
3178     redo A;
3179     } elsif ($self->{nc} == 0x005D) { # ]
3180     !!!cp (221.9); # character
3181     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3182     ## Stay in the state.
3183     !!!next-input-character;
3184     redo A;
3185     } else {
3186     !!!cp (221.11);
3187     $self->{ct}->{data} .= ']]'; # character
3188     $self->{state} = CDATA_SECTION_STATE;
3189 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3190 wakaba 1.1 redo A;
3191     }
3192     } elsif ($self->{state} == ENTITY_STATE) {
3193     if ($is_space->{$self->{nc}} or
3194     {
3195     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3196     $self->{entity_add} => 1,
3197     }->{$self->{nc}}) {
3198 wakaba 1.22 if ($self->{is_xml}) {
3199     !!!cp (1001.1);
3200     !!!parse-error (type => 'bare ero',
3201     line => $self->{line_prev},
3202     column => $self->{column_prev}
3203     + ($self->{nc} == -1 ? 1 : 0));
3204     } else {
3205     !!!cp (1001);
3206     ## No error
3207     }
3208 wakaba 1.1 ## Don't consume
3209     ## Return nothing.
3210     #
3211     } elsif ($self->{nc} == 0x0023) { # #
3212     !!!cp (999);
3213     $self->{state} = ENTITY_HASH_STATE;
3214 wakaba 1.12 $self->{kwd} = '#';
3215 wakaba 1.1 !!!next-input-character;
3216     redo A;
3217 wakaba 1.22 } elsif ($self->{is_xml} or
3218     (0x0041 <= $self->{nc} and
3219 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3220     (0x0061 <= $self->{nc} and
3221     $self->{nc} <= 0x007A)) { # a..z
3222     !!!cp (998);
3223     require Whatpm::_NamedEntityList;
3224     $self->{state} = ENTITY_NAME_STATE;
3225 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3226     $self->{entity__value} = $self->{kwd};
3227 wakaba 1.1 $self->{entity__match} = 0;
3228     !!!next-input-character;
3229     redo A;
3230     } else {
3231     !!!cp (1027);
3232     !!!parse-error (type => 'bare ero');
3233     ## Return nothing.
3234     #
3235     }
3236    
3237     ## NOTE: No character is consumed by the "consume a character
3238     ## reference" algorithm. In other words, there is an "&" character
3239     ## that does not introduce a character reference, which would be
3240     ## appended to the parent element or the attribute value in later
3241     ## process of the tokenizer.
3242    
3243     if ($self->{prev_state} == DATA_STATE) {
3244     !!!cp (997);
3245     $self->{state} = $self->{prev_state};
3246 wakaba 1.5 $self->{s_kwd} = '';
3247 wakaba 1.1 ## Reconsume.
3248     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3249     line => $self->{line_prev},
3250     column => $self->{column_prev},
3251     });
3252     redo A;
3253     } else {
3254     !!!cp (996);
3255     $self->{ca}->{value} .= '&';
3256     $self->{state} = $self->{prev_state};
3257 wakaba 1.5 $self->{s_kwd} = '';
3258 wakaba 1.1 ## Reconsume.
3259     redo A;
3260     }
3261     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3262 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3263 wakaba 1.1 !!!cp (995);
3264     $self->{state} = HEXREF_X_STATE;
3265 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3266 wakaba 1.1 !!!next-input-character;
3267     redo A;
3268 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3269     !!!cp (995.1);
3270     if ($self->{is_xml}) {
3271     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3272     }
3273     $self->{state} = HEXREF_X_STATE;
3274     $self->{kwd} .= chr $self->{nc};
3275     !!!next-input-character;
3276     redo A;
3277 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3278     $self->{nc} <= 0x0039) { # 0..9
3279     !!!cp (994);
3280     $self->{state} = NCR_NUM_STATE;
3281 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3282 wakaba 1.1 !!!next-input-character;
3283     redo A;
3284     } else {
3285     !!!parse-error (type => 'bare nero',
3286     line => $self->{line_prev},
3287     column => $self->{column_prev} - 1);
3288    
3289     ## NOTE: According to the spec algorithm, nothing is returned,
3290     ## and then "&#" is appended to the parent element or the attribute
3291     ## value in the later processing.
3292    
3293     if ($self->{prev_state} == DATA_STATE) {
3294     !!!cp (1019);
3295     $self->{state} = $self->{prev_state};
3296 wakaba 1.5 $self->{s_kwd} = '';
3297 wakaba 1.1 ## Reconsume.
3298     !!!emit ({type => CHARACTER_TOKEN,
3299     data => '&#',
3300     line => $self->{line_prev},
3301     column => $self->{column_prev} - 1,
3302     });
3303     redo A;
3304     } else {
3305     !!!cp (993);
3306     $self->{ca}->{value} .= '&#';
3307     $self->{state} = $self->{prev_state};
3308 wakaba 1.5 $self->{s_kwd} = '';
3309 wakaba 1.1 ## Reconsume.
3310     redo A;
3311     }
3312     }
3313     } elsif ($self->{state} == NCR_NUM_STATE) {
3314     if (0x0030 <= $self->{nc} and
3315     $self->{nc} <= 0x0039) { # 0..9
3316     !!!cp (1012);
3317 wakaba 1.12 $self->{kwd} *= 10;
3318     $self->{kwd} += $self->{nc} - 0x0030;
3319 wakaba 1.1
3320     ## Stay in the state.
3321     !!!next-input-character;
3322     redo A;
3323     } elsif ($self->{nc} == 0x003B) { # ;
3324     !!!cp (1013);
3325     !!!next-input-character;
3326     #
3327     } else {
3328     !!!cp (1014);
3329     !!!parse-error (type => 'no refc');
3330     ## Reconsume.
3331     #
3332     }
3333    
3334 wakaba 1.12 my $code = $self->{kwd};
3335 wakaba 1.1 my $l = $self->{line_prev};
3336     my $c = $self->{column_prev};
3337 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3338     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3339     ($self->{is_xml} and $code == 0x0000)) {
3340 wakaba 1.1 !!!cp (1015);
3341     !!!parse-error (type => 'invalid character reference',
3342     text => (sprintf 'U+%04X', $code),
3343     line => $l, column => $c);
3344     $code = $charref_map->{$code};
3345     } elsif ($code > 0x10FFFF) {
3346     !!!cp (1016);
3347     !!!parse-error (type => 'invalid character reference',
3348     text => (sprintf 'U-%08X', $code),
3349     line => $l, column => $c);
3350     $code = 0xFFFD;
3351     }
3352    
3353     if ($self->{prev_state} == DATA_STATE) {
3354     !!!cp (992);
3355     $self->{state} = $self->{prev_state};
3356 wakaba 1.5 $self->{s_kwd} = '';
3357 wakaba 1.1 ## Reconsume.
3358     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3359 wakaba 1.7 has_reference => 1,
3360 wakaba 1.1 line => $l, column => $c,
3361     });
3362     redo A;
3363     } else {
3364     !!!cp (991);
3365     $self->{ca}->{value} .= chr $code;
3366     $self->{ca}->{has_reference} = 1;
3367     $self->{state} = $self->{prev_state};
3368 wakaba 1.5 $self->{s_kwd} = '';
3369 wakaba 1.1 ## Reconsume.
3370     redo A;
3371     }
3372     } elsif ($self->{state} == HEXREF_X_STATE) {
3373     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3374     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3375     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3376     # 0..9, A..F, a..f
3377     !!!cp (990);
3378     $self->{state} = HEXREF_HEX_STATE;
3379 wakaba 1.12 $self->{kwd} = 0;
3380 wakaba 1.1 ## Reconsume.
3381     redo A;
3382     } else {
3383     !!!parse-error (type => 'bare hcro',
3384     line => $self->{line_prev},
3385     column => $self->{column_prev} - 2);
3386    
3387     ## NOTE: According to the spec algorithm, nothing is returned,
3388     ## and then "&#" followed by "X" or "x" is appended to the parent
3389     ## element or the attribute value in the later processing.
3390    
3391     if ($self->{prev_state} == DATA_STATE) {
3392     !!!cp (1005);
3393     $self->{state} = $self->{prev_state};
3394 wakaba 1.5 $self->{s_kwd} = '';
3395 wakaba 1.1 ## Reconsume.
3396     !!!emit ({type => CHARACTER_TOKEN,
3397 wakaba 1.12 data => '&' . $self->{kwd},
3398 wakaba 1.1 line => $self->{line_prev},
3399 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3400 wakaba 1.1 });
3401     redo A;
3402     } else {
3403     !!!cp (989);
3404 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3405 wakaba 1.1 $self->{state} = $self->{prev_state};
3406 wakaba 1.5 $self->{s_kwd} = '';
3407 wakaba 1.1 ## Reconsume.
3408     redo A;
3409     }
3410     }
3411     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3412     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3413     # 0..9
3414     !!!cp (1002);
3415 wakaba 1.12 $self->{kwd} *= 0x10;
3416     $self->{kwd} += $self->{nc} - 0x0030;
3417 wakaba 1.1 ## Stay in the state.
3418     !!!next-input-character;
3419     redo A;
3420     } elsif (0x0061 <= $self->{nc} and
3421     $self->{nc} <= 0x0066) { # a..f
3422     !!!cp (1003);
3423 wakaba 1.12 $self->{kwd} *= 0x10;
3424     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3425 wakaba 1.1 ## Stay in the state.
3426     !!!next-input-character;
3427     redo A;
3428     } elsif (0x0041 <= $self->{nc} and
3429     $self->{nc} <= 0x0046) { # A..F
3430     !!!cp (1004);
3431 wakaba 1.12 $self->{kwd} *= 0x10;
3432     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3433 wakaba 1.1 ## Stay in the state.
3434     !!!next-input-character;
3435     redo A;
3436     } elsif ($self->{nc} == 0x003B) { # ;
3437     !!!cp (1006);
3438     !!!next-input-character;
3439     #
3440     } else {
3441     !!!cp (1007);
3442     !!!parse-error (type => 'no refc',
3443     line => $self->{line},
3444     column => $self->{column});
3445     ## Reconsume.
3446     #
3447     }
3448    
3449 wakaba 1.12 my $code = $self->{kwd};
3450 wakaba 1.1 my $l = $self->{line_prev};
3451     my $c = $self->{column_prev};
3452 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3453     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3454     ($self->{is_xml} and $code == 0x0000)) {
3455 wakaba 1.1 !!!cp (1008);
3456     !!!parse-error (type => 'invalid character reference',
3457     text => (sprintf 'U+%04X', $code),
3458     line => $l, column => $c);
3459     $code = $charref_map->{$code};
3460     } elsif ($code > 0x10FFFF) {
3461     !!!cp (1009);
3462     !!!parse-error (type => 'invalid character reference',
3463     text => (sprintf 'U-%08X', $code),
3464     line => $l, column => $c);
3465     $code = 0xFFFD;
3466     }
3467    
3468     if ($self->{prev_state} == DATA_STATE) {
3469     !!!cp (988);
3470     $self->{state} = $self->{prev_state};
3471 wakaba 1.5 $self->{s_kwd} = '';
3472 wakaba 1.1 ## Reconsume.
3473     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3474 wakaba 1.7 has_reference => 1,
3475 wakaba 1.1 line => $l, column => $c,
3476     });
3477     redo A;
3478     } else {
3479     !!!cp (987);
3480     $self->{ca}->{value} .= chr $code;
3481     $self->{ca}->{has_reference} = 1;
3482     $self->{state} = $self->{prev_state};
3483 wakaba 1.5 $self->{s_kwd} = '';
3484 wakaba 1.1 ## Reconsume.
3485     redo A;
3486     }
3487     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3488 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3489     $self->{nc} <= 0x005A) or # Z
3490     (0x0061 <= $self->{nc} and # a
3491     $self->{nc} <= 0x007A) or # z
3492     (0x0030 <= $self->{nc} and # 0
3493     $self->{nc} <= 0x0039) or # 9
3494 wakaba 1.22 $self->{nc} == 0x003B or # ;
3495     ($self->{is_xml} and
3496     not ($is_space->{$self->{nc}} or
3497     {
3498     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3499     $self->{entity_add} => 1,
3500     }->{$self->{nc}}))) {
3501 wakaba 1.1 our $EntityChar;
3502 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3503 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3504     $self->{ge}->{$self->{kwd}}) {
3505 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3506 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3507     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3508     !!!cp (1020.1);
3509     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3510     } else {
3511     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3512     !!!cp (1020.2);
3513     !!!parse-error (type => 'unparsed entity', ## TODO: type
3514     value => $self->{kwd});
3515     } else {
3516     !!!cp (1020.3);
3517     }
3518     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3519     }
3520     } else {
3521     if ($self->{is_xml}) {
3522     !!!cp (1020.4);
3523     !!!parse-error (type => 'entity not declared', ## TODO: type
3524     value => $self->{kwd},
3525     level => {
3526     'amp;' => $self->{level}->{warn},
3527     'quot;' => $self->{level}->{warn},
3528     'lt;' => $self->{level}->{warn},
3529     'gt;' => $self->{level}->{warn},
3530     'apos;' => $self->{level}->{warn},
3531     }->{$self->{kwd}} ||
3532     $self->{level}->{must});
3533     } else {
3534     !!!cp (1020);
3535     }
3536     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3537     }
3538 wakaba 1.1 $self->{entity__match} = 1;
3539     !!!next-input-character;
3540     #
3541     } else {
3542     !!!cp (1021);
3543 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3544 wakaba 1.1 $self->{entity__match} = -1;
3545     ## Stay in the state.
3546     !!!next-input-character;
3547     redo A;
3548     }
3549     } else {
3550     !!!cp (1022);
3551     $self->{entity__value} .= chr $self->{nc};
3552     $self->{entity__match} *= 2;
3553     ## Stay in the state.
3554     !!!next-input-character;
3555     redo A;
3556     }
3557     }
3558    
3559     my $data;
3560     my $has_ref;
3561     if ($self->{entity__match} > 0) {
3562     !!!cp (1023);
3563     $data = $self->{entity__value};
3564     $has_ref = 1;
3565     #
3566     } elsif ($self->{entity__match} < 0) {
3567     !!!parse-error (type => 'no refc');
3568     if ($self->{prev_state} != DATA_STATE and # in attribute
3569     $self->{entity__match} < -1) {
3570     !!!cp (1024);
3571 wakaba 1.12 $data = '&' . $self->{kwd};
3572 wakaba 1.1 #
3573     } else {
3574     !!!cp (1025);
3575     $data = $self->{entity__value};
3576     $has_ref = 1;
3577     #
3578     }
3579     } else {
3580     !!!cp (1026);
3581     !!!parse-error (type => 'bare ero',
3582     line => $self->{line_prev},
3583 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3584     $data = '&' . $self->{kwd};
3585 wakaba 1.1 #
3586     }
3587    
3588     ## NOTE: According to the spec algorithm, when a character reference
3589     ## is found it is consumed and a character token is returned;
3590     ## otherwise, nothing is consumed and nothing is returned.
3591     ## This implementation instead appends everything that has been
3592     ## examined by the tokenizer to the parent element or the attribute
3593     ## value at this stage, as a string: the literal string when there is
3594     ## no character reference, or the entity-replaced string otherwise.
3595     ## This is equivalent, since any characters that would not be consumed
3596     ## are appended in the data state or in an appropriate attribute value state anyway.
3597    
3598     if ($self->{prev_state} == DATA_STATE) {
3599     !!!cp (986);
3600     $self->{state} = $self->{prev_state};
3601 wakaba 1.5 $self->{s_kwd} = '';
3602 wakaba 1.1 ## Reconsume.
3603     !!!emit ({type => CHARACTER_TOKEN,
3604     data => $data,
3605 wakaba 1.7 has_reference => $has_ref,
3606 wakaba 1.1 line => $self->{line_prev},
3607 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3608 wakaba 1.1 });
3609     redo A;
3610     } else {
3611     !!!cp (985);
3612     $self->{ca}->{value} .= $data;
3613     $self->{ca}->{has_reference} = 1 if $has_ref;
3614     $self->{state} = $self->{prev_state};
3615 wakaba 1.5 $self->{s_kwd} = '';
3616 wakaba 1.1 ## Reconsume.
3617     redo A;
3618     }
3619 wakaba 1.8
3620     ## XML-only states
3621    
3622     } elsif ($self->{state} == PI_STATE) {
3623 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3624    
3625 wakaba 1.8 if ($is_space->{$self->{nc}} or
3626 wakaba 1.14 $self->{nc} == 0x003F or # ?
3627 wakaba 1.8 $self->{nc} == -1) {
3628 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3629     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3630     ## "DOCTYPE pi state": Parse error, switch to the "data
3631     ## state".
3632 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3633     line => $self->{line_prev},
3634     column => $self->{column_prev}
3635     - 1 * ($self->{nc} != -1));
3636     $self->{state} = BOGUS_COMMENT_STATE;
3637     ## Reconsume.
3638     $self->{ct} = {type => COMMENT_TOKEN,
3639     data => '?',
3640     line => $self->{line_prev},
3641     column => $self->{column_prev}
3642     - 1 * ($self->{nc} != -1),
3643     };
3644     redo A;
3645     } else {
3646 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3647 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3648     target => chr $self->{nc},
3649     data => '',
3650     line => $self->{line_prev},
3651     column => $self->{column_prev} - 1,
3652     };
3653     $self->{state} = PI_TARGET_STATE;
3654     !!!next-input-character;
3655     redo A;
3656     }
3657     } elsif ($self->{state} == PI_TARGET_STATE) {
3658     if ($is_space->{$self->{nc}}) {
3659     $self->{state} = PI_TARGET_AFTER_STATE;
3660     !!!next-input-character;
3661     redo A;
3662     } elsif ($self->{nc} == -1) {
3663     !!!parse-error (type => 'no pic'); ## TODO: type
3664 wakaba 1.13 if ($self->{in_subset}) {
3665     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3666     } else {
3667     $self->{state} = DATA_STATE;
3668     $self->{s_kwd} = '';
3669     }
3670 wakaba 1.8 ## Reconsume.
3671     !!!emit ($self->{ct}); # pi
3672     redo A;
3673     } elsif ($self->{nc} == 0x003F) { # ?
3674     $self->{state} = PI_AFTER_STATE;
3675     !!!next-input-character;
3676     redo A;
3677     } else {
3678     ## XML5: typo ("tag name" -> "target")
3679     $self->{ct}->{target} .= chr $self->{nc}; # pi
3680     !!!next-input-character;
3681     redo A;
3682     }
3683     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3684     if ($is_space->{$self->{nc}}) {
3685     ## Stay in the state.
3686     !!!next-input-character;
3687     redo A;
3688     } else {
3689     $self->{state} = PI_DATA_STATE;
3690     ## Reprocess.
3691     redo A;
3692     }
3693     } elsif ($self->{state} == PI_DATA_STATE) {
3694     if ($self->{nc} == 0x003F) { # ?
3695     $self->{state} = PI_DATA_AFTER_STATE;
3696     !!!next-input-character;
3697     redo A;
3698     } elsif ($self->{nc} == -1) {
3699     !!!parse-error (type => 'no pic'); ## TODO: type
3700 wakaba 1.13 if ($self->{in_subset}) {
3701 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3702 wakaba 1.13 } else {
3703     $self->{state} = DATA_STATE;
3704     $self->{s_kwd} = '';
3705     }
3706 wakaba 1.8 ## Reprocess.
3707     !!!emit ($self->{ct}); # pi
3708     redo A;
3709     } else {
3710     $self->{ct}->{data} .= chr $self->{nc}; # pi
3711     $self->{read_until}->($self->{ct}->{data}, q[?],
3712     length $self->{ct}->{data});
3713     ## Stay in the state.
3714     !!!next-input-character;
3715     ## Continue with the newly read character (nothing is reprocessed).
3716     redo A;
3717     }
3718     } elsif ($self->{state} == PI_AFTER_STATE) {
3719 wakaba 1.14 ## XML5: Part of "Pi after state".
3720    
3721 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3722 wakaba 1.13 if ($self->{in_subset}) {
3723     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3724     } else {
3725     $self->{state} = DATA_STATE;
3726     $self->{s_kwd} = '';
3727     }
3728 wakaba 1.8 !!!next-input-character;
3729     !!!emit ($self->{ct}); # pi
3730     redo A;
3731     } elsif ($self->{nc} == 0x003F) { # ?
3732     !!!parse-error (type => 'no s after target', ## TODO: type
3733     line => $self->{line_prev},
3734     column => $self->{column_prev}); ## XML5: no error
3735     $self->{ct}->{data} .= '?';
3736     $self->{state} = PI_DATA_AFTER_STATE;
3737     !!!next-input-character;
3738     redo A;
3739     } else {
3740     !!!parse-error (type => 'no s after target', ## TODO: type
3741     line => $self->{line_prev},
3742     column => $self->{column_prev}
3743     + 1 * ($self->{nc} == -1)); ## XML5: no error
3744     $self->{ct}->{data} .= '?'; ## XML5: not appended
3745     $self->{state} = PI_DATA_STATE;
3746     ## Reprocess.
3747     redo A;
3748     }
3749     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3750 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3751    
3752 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3753 wakaba 1.13 if ($self->{in_subset}) {
3754     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755     } else {
3756     $self->{state} = DATA_STATE;
3757     $self->{s_kwd} = '';
3758     }
3759 wakaba 1.8 !!!next-input-character;
3760     !!!emit ($self->{ct}); # pi
3761     redo A;
3762     } elsif ($self->{nc} == 0x003F) { # ?
3763     $self->{ct}->{data} .= '?';
3764     ## Stay in the state.
3765     !!!next-input-character;
3766     redo A;
3767     } else {
3768     $self->{ct}->{data} .= '?'; ## XML5: not appended
3769     $self->{state} = PI_DATA_STATE;
3770     ## Reprocess.
3771     redo A;
3772     }
3773 wakaba 1.12
3774     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3775     if ($self->{nc} == 0x003C) { # <
3776 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3777 wakaba 1.12 !!!next-input-character;
3778     redo A;
3779     } elsif ($self->{nc} == 0x0025) { # %
3780     ## XML5: Not defined yet.
3781    
3782     ## TODO:
3783 wakaba 1.24
3784     if (not $self->{stop_processing} and
3785     not $self->{document}->xml_standalone) {
3786     !!!parse-error (type => 'stop processing', ## TODO: type
3787     level => $self->{level}->{info});
3788     $self->{stop_processing} = 1;
3789     }
3790    
3791 wakaba 1.12 !!!next-input-character;
3792     redo A;
3793     } elsif ($self->{nc} == 0x005D) { # ]
3794 wakaba 1.13 delete $self->{in_subset};
3795 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3796     !!!next-input-character;
3797     redo A;
3798     } elsif ($is_space->{$self->{nc}}) {
3799     ## Stay in the state.
3800     !!!next-input-character;
3801     redo A;
3802     } elsif ($self->{nc} == -1) {
3803     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3804 wakaba 1.13 delete $self->{in_subset};
3805 wakaba 1.12 $self->{state} = DATA_STATE;
3806     $self->{s_kwd} = '';
3807     ## Reconsume.
3808 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3809 wakaba 1.12 redo A;
3810     } else {
3811     unless ($self->{internal_subset_tainted}) {
3812     ## XML5: No parse error.
3813     !!!parse-error (type => 'string in internal subset');
3814     $self->{internal_subset_tainted} = 1;
3815     }
3816     ## Stay in the state.
3817     !!!next-input-character;
3818     redo A;
3819     }
3820     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3821     if ($self->{nc} == 0x003E) { # >
3822     $self->{state} = DATA_STATE;
3823     $self->{s_kwd} = '';
3824     !!!next-input-character;
3825 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3826 wakaba 1.12 redo A;
3827     } elsif ($self->{nc} == -1) {
3828     !!!parse-error (type => 'unclosed DOCTYPE');
3829     $self->{state} = DATA_STATE;
3830     $self->{s_kwd} = '';
3831     ## Reconsume.
3832 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3833 wakaba 1.12 redo A;
3834     } else {
3835     ## XML5: No parse error and stay in the state.
3836     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3837    
3838 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3839     !!!next-input-character;
3840     redo A;
3841     }
3842     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3843     if ($self->{nc} == 0x003E) { # >
3844     $self->{state} = DATA_STATE;
3845     $self->{s_kwd} = '';
3846     !!!next-input-character;
3847     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3848     redo A;
3849     } elsif ($self->{nc} == -1) {
3850     $self->{state} = DATA_STATE;
3851     $self->{s_kwd} = '';
3852     ## Reconsume.
3853     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3854     redo A;
3855     } else {
3856     ## Stay in the state.
3857     !!!next-input-character;
3858     redo A;
3859     }
3860     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3861     if ($self->{nc} == 0x0021) { # !
3862 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3863 wakaba 1.13 !!!next-input-character;
3864     redo A;
3865     } elsif ($self->{nc} == 0x003F) { # ?
3866     $self->{state} = PI_STATE;
3867     !!!next-input-character;
3868     redo A;
3869     } elsif ($self->{nc} == -1) {
3870     !!!parse-error (type => 'bare stago');
3871     $self->{state} = DATA_STATE;
3872     $self->{s_kwd} = '';
3873     ## Reconsume.
3874     redo A;
3875     } else {
3876     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3877     line => $self->{line_prev},
3878     column => $self->{column_prev});
3879     $self->{state} = BOGUS_COMMENT_STATE;
3880     $self->{ct} = {type => COMMENT_TOKEN,
3881     data => '',
3882     }; ## NOTE: Will be discarded.
3883 wakaba 1.12 !!!next-input-character;
3884     redo A;
3885     }
3886 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3887     ## XML5: "DOCTYPE markup declaration state".
3888    
3889     if ($self->{nc} == 0x002D) { # -
3890     $self->{state} = MD_HYPHEN_STATE;
3891     !!!next-input-character;
3892     redo A;
3893 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3894     $self->{nc} == 0x0065) { # e
3895 wakaba 1.14 $self->{state} = MD_E_STATE;
3896     $self->{kwd} = chr $self->{nc};
3897     !!!next-input-character;
3898     redo A;
3899 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3900     $self->{nc} == 0x0061) { # a
3901 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3902     $self->{kwd} = chr $self->{nc};
3903     !!!next-input-character;
3904     redo A;
3905 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3906     $self->{nc} == 0x006E) { # n
3907 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3908     $self->{kwd} = chr $self->{nc};
3909     !!!next-input-character;
3910     redo A;
3911     } else {
3912     #
3913     }
3914    
3915     ## XML5: No parse error.
3916     !!!parse-error (type => 'bogus comment',
3917     line => $self->{line_prev},
3918     column => $self->{column_prev} - 1);
3919     ## Reconsume.
3920     $self->{state} = BOGUS_COMMENT_STATE;
3921     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3922     redo A;
3923     } elsif ($self->{state} == MD_E_STATE) {
3924 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3925     $self->{nc} == 0x006E) { # n
3926 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3927     $self->{kwd} .= chr $self->{nc};
3928     !!!next-input-character;
3929     redo A;
3930 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3931     $self->{nc} == 0x006C) { # l
3932 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3933     $self->{state} = MD_ELEMENT_STATE;
3934     $self->{kwd} .= chr $self->{nc};
3935     !!!next-input-character;
3936     redo A;
3937     } else {
3938     ## XML5: No parse error.
3939     !!!parse-error (type => 'bogus comment',
3940     line => $self->{line_prev},
3941     column => $self->{column_prev} - 2
3942     + 1 * ($self->{nc} == -1));
3943     ## Reconsume.
3944     $self->{state} = BOGUS_COMMENT_STATE;
3945     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3946     redo A;
3947     }
3948     } elsif ($self->{state} == MD_ENTITY_STATE) {
3949 wakaba 1.17 if ($self->{nc} == [
3950     undef,
3951     undef,
3952     0x0054, # T
3953     0x0049, # I
3954     0x0054, # T
3955     ]->[length $self->{kwd}] or
3956     $self->{nc} == [
3957     undef,
3958     undef,
3959     0x0074, # t
3960     0x0069, # i
3961     0x0074, # t
3962     ]->[length $self->{kwd}]) {
3963 wakaba 1.14 ## Stay in the state.
3964     $self->{kwd} .= chr $self->{nc};
3965     !!!next-input-character;
3966     redo A;
3967 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3968     ($self->{nc} == 0x0059 or # Y
3969     $self->{nc} == 0x0079)) { # y
3970     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3971     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3972     text => 'ENTITY',
3973     line => $self->{line_prev},
3974     column => $self->{column_prev} - 4);
3975     }
3976     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3977 wakaba 1.14 line => $self->{line_prev},
3978     column => $self->{column_prev} - 6};
3979     $self->{state} = DOCTYPE_MD_STATE;
3980     !!!next-input-character;
3981     redo A;
3982     } else {
3983     !!!parse-error (type => 'bogus comment',
3984     line => $self->{line_prev},
3985     column => $self->{column_prev} - 1
3986     - (length $self->{kwd})
3987     + 1 * ($self->{nc} == -1));
3988     $self->{state} = BOGUS_COMMENT_STATE;
3989     ## Reconsume.
3990     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3991     redo A;
3992     }
3993     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3994 wakaba 1.17 if ($self->{nc} == [
3995     undef,
3996     undef,
3997     0x0045, # E
3998     0x004D, # M
3999     0x0045, # E
4000     0x004E, # N
4001     ]->[length $self->{kwd}] or
4002     $self->{nc} == [
4003     undef,
4004     undef,
4005     0x0065, # e
4006     0x006D, # m
4007     0x0065, # e
4008     0x006E, # n
4009     ]->[length $self->{kwd}]) {
4010 wakaba 1.14 ## Stay in the state.
4011     $self->{kwd} .= chr $self->{nc};
4012     !!!next-input-character;
4013     redo A;
4014 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4015     ($self->{nc} == 0x0054 or # T
4016     $self->{nc} == 0x0074)) { # t
4017     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
4018     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4019     text => 'ELEMENT',
4020     line => $self->{line_prev},
4021     column => $self->{column_prev} - 5);
4022     }
4023 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
4024     line => $self->{line_prev},
4025 wakaba 1.23 column => $self->{column_prev} - 7};
4026 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4027     !!!next-input-character;
4028     redo A;
4029     } else {
4030     !!!parse-error (type => 'bogus comment',
4031     line => $self->{line_prev},
4032     column => $self->{column_prev} - 1
4033     - (length $self->{kwd})
4034     + 1 * ($self->{nc} == -1));
4035     $self->{state} = BOGUS_COMMENT_STATE;
4036     ## Reconsume.
4037     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4038     redo A;
4039     }
4040     } elsif ($self->{state} == MD_ATTLIST_STATE) {
4041 wakaba 1.17 if ($self->{nc} == [
4042     undef,
4043     0x0054, # T
4044     0x0054, # T
4045     0x004C, # L
4046     0x0049, # I
4047     0x0053, # S
4048     ]->[length $self->{kwd}] or
4049     $self->{nc} == [
4050     undef,
4051     0x0074, # t
4052     0x0074, # t
4053     0x006C, # l
4054     0x0069, # i
4055     0x0073, # s
4056     ]->[length $self->{kwd}]) {
4057 wakaba 1.14 ## Stay in the state.
4058     $self->{kwd} .= chr $self->{nc};
4059     !!!next-input-character;
4060     redo A;
4061 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4062     ($self->{nc} == 0x0054 or # T
4063     $self->{nc} == 0x0074)) { # t
4064     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4065     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4066     text => 'ATTLIST',
4067     line => $self->{line_prev},
4068     column => $self->{column_prev} - 5);
4069     }
4070 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4071 wakaba 1.15 attrdefs => [],
4072 wakaba 1.14 line => $self->{line_prev},
4073 wakaba 1.23 column => $self->{column_prev} - 7};
4074 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4075     !!!next-input-character;
4076     redo A;
4077     } else {
4078     !!!parse-error (type => 'bogus comment',
4079     line => $self->{line_prev},
4080     column => $self->{column_prev} - 1
4081     - (length $self->{kwd})
4082     + 1 * ($self->{nc} == -1));
4083     $self->{state} = BOGUS_COMMENT_STATE;
4084     ## Reconsume.
4085     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4086     redo A;
4087     }
4088     } elsif ($self->{state} == MD_NOTATION_STATE) {
4089 wakaba 1.17 if ($self->{nc} == [
4090     undef,
4091     0x004F, # O
4092     0x0054, # T
4093     0x0041, # A
4094     0x0054, # T
4095     0x0049, # I
4096     0x004F, # O
4097     ]->[length $self->{kwd}] or
4098     $self->{nc} == [
4099     undef,
4100     0x006F, # o
4101     0x0074, # t
4102     0x0061, # a
4103     0x0074, # t
4104     0x0069, # i
4105     0x006F, # o
4106     ]->[length $self->{kwd}]) {
4107 wakaba 1.14 ## Stay in the state.
4108     $self->{kwd} .= chr $self->{nc};
4109     !!!next-input-character;
4110     redo A;
4111 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4112     ($self->{nc} == 0x004E or # N
4113     $self->{nc} == 0x006E)) { # n
4114     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4115     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4116     text => 'NOTATION',
4117     line => $self->{line_prev},
4118     column => $self->{column_prev} - 6);
4119     }
4120 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4121     line => $self->{line_prev},
4122 wakaba 1.23 column => $self->{column_prev} - 8};
4123 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4124     !!!next-input-character;
4125     redo A;
4126     } else {
4127     !!!parse-error (type => 'bogus comment',
4128     line => $self->{line_prev},
4129     column => $self->{column_prev} - 1
4130     - (length $self->{kwd})
4131     + 1 * ($self->{nc} == -1));
4132     $self->{state} = BOGUS_COMMENT_STATE;
4133     ## Reconsume.
4134     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4135     redo A;
4136     }
4137     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4138     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4139     ## "DOCTYPE NOTATION state".
4140    
4141     if ($is_space->{$self->{nc}}) {
4142     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4143     $self->{state} = BEFORE_MD_NAME_STATE;
4144     !!!next-input-character;
4145     redo A;
4146     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4147     $self->{nc} == 0x0025) { # %
4148     ## XML5: Switch to the "DOCTYPE bogus comment state".
4149     !!!parse-error (type => 'no space before md name'); ## TODO: type
4150     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4151     !!!next-input-character;
4152     redo A;
4153     } elsif ($self->{nc} == -1) {
4154     !!!parse-error (type => 'unclosed md'); ## TODO: type
4155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4156     ## Reconsume.
4157     redo A;
4158     } elsif ($self->{nc} == 0x003E) { # >
4159     ## XML5: Switch to the "DOCTYPE bogus comment state".
4160     !!!parse-error (type => 'no md name'); ## TODO: type
4161     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4162     !!!next-input-character;
4163     redo A;
4164     } else {
4165     ## XML5: Switch to the "DOCTYPE bogus comment state".
4166     !!!parse-error (type => 'no space before md name'); ## TODO: type
4167     $self->{state} = BEFORE_MD_NAME_STATE;
4168     redo A;
4169     }
4170     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4171     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4172     ## before state", "DOCTYPE ATTLIST name before state".
4173    
4174     if ($is_space->{$self->{nc}}) {
4175     ## Stay in the state.
4176     !!!next-input-character;
4177     redo A;
4178     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4179     $self->{nc} == 0x0025) { # %
4180     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4181     !!!next-input-character;
4182     redo A;
4183     } elsif ($self->{nc} == 0x003E) { # >
4184     ## XML5: Same as "Anything else".
4185     !!!parse-error (type => 'no md name'); ## TODO: type
4186     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4187     !!!next-input-character;
4188     redo A;
4189     } elsif ($self->{nc} == -1) {
4190     !!!parse-error (type => 'unclosed md'); ## TODO: type
4191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4192     ## Reconsume.
4193     redo A;
4194     } else {
4195     ## XML5: [ATTLIST] Not defined yet.
4196     $self->{ct}->{name} .= chr $self->{nc};
4197     $self->{state} = MD_NAME_STATE;
4198     !!!next-input-character;
4199     redo A;
4200     }
4201     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4202     if ($is_space->{$self->{nc}}) {
4203     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4204     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4205     $self->{state} = BEFORE_MD_NAME_STATE;
4206     !!!next-input-character;
4207     redo A;
4208     } elsif ($self->{nc} == 0x003E) { # >
4209     ## XML5: Same as "Anything else".
4210     !!!parse-error (type => 'no md name'); ## TODO: type
4211     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4212     !!!next-input-character;
4213     redo A;
4214     } elsif ($self->{nc} == -1) {
4215     !!!parse-error (type => 'unclosed md');
4216     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4217     ## Reconsume.
4218     redo A;
4219     } else {
4220     ## XML5: No parse error.
4221     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4222     $self->{state} = BOGUS_COMMENT_STATE;
4223     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4224     ## Reconsume.
4225     redo A;
4226     }
4227     } elsif ($self->{state} == MD_NAME_STATE) {
4228     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4229    
4230     if ($is_space->{$self->{nc}}) {
4231 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4232     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4233     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4234 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4235 wakaba 1.16 } else { # ENTITY/NOTATION
4236     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4237     }
4238 wakaba 1.14 !!!next-input-character;
4239     redo A;
4240     } elsif ($self->{nc} == 0x003E) { # >
4241     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4242     #
4243     } else {
4244 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4245 wakaba 1.14 }
4246     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247     !!!next-input-character;
4248     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4249     redo A;
4250     } elsif ($self->{nc} == -1) {
4251     ## XML5: [ATTLIST] No parse error.
4252     !!!parse-error (type => 'unclosed md');
4253     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254     ## Reconsume.
4255     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4256     redo A;
4257     } else {
4258     ## XML5: [ATTLIST] Not defined yet.
4259     $self->{ct}->{name} .= chr $self->{nc};
4260     ## Stay in the state.
4261     !!!next-input-character;
4262     redo A;
4263     }
4264     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4265     if ($is_space->{$self->{nc}}) {
4266     ## Stay in the state.
4267     !!!next-input-character;
4268     redo A;
4269     } elsif ($self->{nc} == 0x003E) { # >
4270     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4271     !!!next-input-character;
4272     !!!emit ($self->{ct}); # ATTLIST
4273     redo A;
4274     } elsif ($self->{nc} == -1) {
4275     ## XML5: No parse error.
4276     !!!parse-error (type => 'unclosed md'); ## TODO: type
4277     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4278 wakaba 1.15 !!!emit ($self->{ct});
4279     redo A;
4280     } else {
4281     ## XML5: Not defined yet.
4282     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4283     tokens => [],
4284     line => $self->{line}, column => $self->{column}};
4285     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4286     !!!next-input-character;
4287     redo A;
4288     }
4289     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4290     if ($is_space->{$self->{nc}}) {
4291     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4292     !!!next-input-character;
4293     redo A;
4294     } elsif ($self->{nc} == 0x003E) { # >
4295     ## XML5: Same as "anything else".
4296     !!!parse-error (type => 'no attr type'); ## TODO: type
4297     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298     !!!next-input-character;
4299     !!!emit ($self->{ct}); # ATTLIST
4300     redo A;
4301     } elsif ($self->{nc} == 0x0028) { # (
4302     ## XML5: Same as "anything else".
4303     !!!parse-error (type => 'no space before paren'); ## TODO: type
4304     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4305     !!!next-input-character;
4306     redo A;
4307     } elsif ($self->{nc} == -1) {
4308     ## XML5: No parse error.
4309     !!!parse-error (type => 'unclosed md'); ## TODO: type
4310     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4311     !!!next-input-character;
4312     !!!emit ($self->{ct}); # ATTLIST
4313     redo A;
4314     } else {
4315     ## XML5: Not defined yet.
4316     $self->{ca}->{name} .= chr $self->{nc};
4317     ## Stay in the state.
4318     !!!next-input-character;
4319     redo A;
4320     }
4321     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4322     if ($is_space->{$self->{nc}}) {
4323     ## Stay in the state.
4324     !!!next-input-character;
4325     redo A;
4326     } elsif ($self->{nc} == 0x003E) { # >
4327     ## XML5: Same as "anything else".
4328     !!!parse-error (type => 'no attr type'); ## TODO: type
4329     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4330     !!!next-input-character;
4331     !!!emit ($self->{ct}); # ATTLIST
4332     redo A;
4333     } elsif ($self->{nc} == 0x0028) { # (
4334     ## XML5: Same as "anything else".
4335     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4336     !!!next-input-character;
4337     redo A;
4338     } elsif ($self->{nc} == -1) {
4339     ## XML5: No parse error.
4340     !!!parse-error (type => 'unclosed md'); ## TODO: type
4341     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4342     !!!next-input-character;
4343     !!!emit ($self->{ct});
4344 wakaba 1.14 redo A;
4345     } else {
4346     ## XML5: Not defined yet.
4347 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4348     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4349     !!!next-input-character;
4350     redo A;
4351     }
4352     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4353     if ($is_space->{$self->{nc}}) {
4354     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4355     !!!next-input-character;
4356     redo A;
4357     } elsif ($self->{nc} == 0x0023) { # #
4358     ## XML5: Same as "anything else".
4359     !!!parse-error (type => 'no space before default value'); ## TODO: type
4360     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4361     !!!next-input-character;
4362     redo A;
4363     } elsif ($self->{nc} == 0x0022) { # "
4364     ## XML5: Same as "anything else".
4365     !!!parse-error (type => 'no space before default value'); ## TODO: type
4366     $self->{ca}->{value} = '';
4367     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4368     !!!next-input-character;
4369     redo A;
4370     } elsif ($self->{nc} == 0x0027) { # '
4371     ## XML5: Same as "anything else".
4372     !!!parse-error (type => 'no space before default value'); ## TODO: type
4373     $self->{ca}->{value} = '';
4374     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4375     !!!next-input-character;
4376     redo A;
4377     } elsif ($self->{nc} == 0x003E) { # >
4378     ## XML5: Same as "anything else".
4379     !!!parse-error (type => 'no attr default'); ## TODO: type
4380     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4381     !!!next-input-character;
4382     !!!emit ($self->{ct}); # ATTLIST
4383     redo A;
4384     } elsif ($self->{nc} == 0x0028) { # (
4385     ## XML5: Same as "anything else".
4386     !!!parse-error (type => 'no space before paren'); ## TODO: type
4387     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4388     !!!next-input-character;
4389     redo A;
4390     } elsif ($self->{nc} == -1) {
4391     ## XML5: No parse error.
4392     !!!parse-error (type => 'unclosed md'); ## TODO: type
4393     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4394     !!!next-input-character;
4395     !!!emit ($self->{ct});
4396     redo A;
4397     } else {
4398     ## XML5: Not defined yet.
4399     $self->{ca}->{type} .= chr $self->{nc};
4400     ## Stay in the state.
4401     !!!next-input-character;
4402     redo A;
4403     }
4404     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4405     if ($is_space->{$self->{nc}}) {
4406     ## Stay in the state.
4407     !!!next-input-character;
4408     redo A;
4409     } elsif ($self->{nc} == 0x0028) { # (
4410     ## XML5: Same as "anything else".
4411     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4412     !!!next-input-character;
4413     redo A;
4414     } elsif ($self->{nc} == 0x0023) { # #
4415     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4416     !!!next-input-character;
4417     redo A;
4418     } elsif ($self->{nc} == 0x0022) { # "
4419     ## XML5: Same as "anything else".
4420     $self->{ca}->{value} = '';
4421     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4422     !!!next-input-character;
4423     redo A;
4424     } elsif ($self->{nc} == 0x0027) { # '
4425     ## XML5: Same as "anything else".
4426     $self->{ca}->{value} = '';
4427     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4428     !!!next-input-character;
4429     redo A;
4430     } elsif ($self->{nc} == 0x003E) { # >
4431     ## XML5: Same as "anything else".
4432     !!!parse-error (type => 'no attr default'); ## TODO: type
4433     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434     !!!next-input-character;
4435     !!!emit ($self->{ct}); # ATTLIST
4436     redo A;
4437     } elsif ($self->{nc} == -1) {
4438     ## XML5: No parse error.
4439     !!!parse-error (type => 'unclosed md'); ## TODO: type
4440     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4441     !!!next-input-character;
4442     !!!emit ($self->{ct});
4443     redo A;
4444     } else {
4445     ## XML5: Switch to the "DOCTYPE bogus comment state".
4446     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4447     $self->{ca}->{value} = '';
4448     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4449     ## Reconsume.
4450     redo A;
4451     }
4452     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4453     if ($is_space->{$self->{nc}}) {
4454     ## Stay in the state.
4455     !!!next-input-character;
4456     redo A;
4457     } elsif ($self->{nc} == 0x007C) { # |
4458     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4459     ## Stay in the state.
4460     !!!next-input-character;
4461     redo A;
4462     } elsif ($self->{nc} == 0x0029) { # )
4463     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4464     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4465     !!!next-input-character;
4466     redo A;
4467     } elsif ($self->{nc} == 0x003E) { # >
4468     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4469     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4470     !!!next-input-character;
4471     !!!emit ($self->{ct}); # ATTLIST
4472     redo A;
4473     } elsif ($self->{nc} == -1) {
4474     ## XML5: No parse error.
4475     !!!parse-error (type => 'unclosed md'); ## TODO: type
4476     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4477     !!!next-input-character;
4478     !!!emit ($self->{ct});
4479     redo A;
4480     } else {
4481     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4482     $self->{state} = ALLOWED_TOKEN_STATE;
4483     !!!next-input-character;
4484     redo A;
4485     }
4486     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4487     if ($is_space->{$self->{nc}}) {
4488     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4489     !!!next-input-character;
4490     redo A;
4491     } elsif ($self->{nc} == 0x007C) { # |
4492     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4493     !!!next-input-character;
4494     redo A;
4495     } elsif ($self->{nc} == 0x0029) { # )
4496     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4497     !!!next-input-character;
4498     redo A;
4499     } elsif ($self->{nc} == 0x003E) { # >
4500     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4501     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4502     !!!next-input-character;
4503     !!!emit ($self->{ct}); # ATTLIST
4504     redo A;
4505     } elsif ($self->{nc} == -1) {
4506     ## XML5: No parse error.
4507     !!!parse-error (type => 'unclosed md'); ## TODO: type
4508     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4509     !!!next-input-character;
4510     !!!emit ($self->{ct});
4511     redo A;
4512     } else {
4513     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4514     ## Stay in the state.
4515     !!!next-input-character;
4516     redo A;
4517     }
4518     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4519     if ($is_space->{$self->{nc}}) {
4520     ## Stay in the state.
4521     !!!next-input-character;
4522     redo A;
4523     } elsif ($self->{nc} == 0x007C) { # |
4524     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4525     !!!next-input-character;
4526     redo A;
4527     } elsif ($self->{nc} == 0x0029) { # )
4528     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4529     !!!next-input-character;
4530     redo A;
4531     } elsif ($self->{nc} == 0x003E) { # >
4532     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4533     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4534     !!!next-input-character;
4535     !!!emit ($self->{ct}); # ATTLIST
4536     redo A;
4537     } elsif ($self->{nc} == -1) {
4538     ## XML5: No parse error.
4539     !!!parse-error (type => 'unclosed md'); ## TODO: type
4540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4541     !!!next-input-character;
4542     !!!emit ($self->{ct});
4543     redo A;
4544     } else {
4545     !!!parse-error (type => 'space in allowed token', ## TODO: type
4546     line => $self->{line_prev},
4547     column => $self->{column_prev});
4548     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4549     $self->{state} = ALLOWED_TOKEN_STATE;
4550     !!!next-input-character;
4551     redo A;
4552     }
4553     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4554     if ($is_space->{$self->{nc}}) {
4555     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4556     !!!next-input-character;
4557     redo A;
4558     } elsif ($self->{nc} == 0x0023) { # #
4559     !!!parse-error (type => 'no space before default value'); ## TODO: type
4560     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4561     !!!next-input-character;
4562     redo A;
4563     } elsif ($self->{nc} == 0x0022) { # "
4564     !!!parse-error (type => 'no space before default value'); ## TODO: type
4565     $self->{ca}->{value} = '';
4566     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4567     !!!next-input-character;
4568     redo A;
4569     } elsif ($self->{nc} == 0x0027) { # '
4570     !!!parse-error (type => 'no space before default value'); ## TODO: type
4571     $self->{ca}->{value} = '';
4572     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4573     !!!next-input-character;
4574     redo A;
4575     } elsif ($self->{nc} == 0x003E) { # >
4576     !!!parse-error (type => 'no attr default'); ## TODO: type
4577     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4578     !!!next-input-character;
4579     !!!emit ($self->{ct}); # ATTLIST
4580     redo A;
4581     } elsif ($self->{nc} == -1) {
4582     !!!parse-error (type => 'unclosed md'); ## TODO: type
4583     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4584     !!!next-input-character;
4585     !!!emit ($self->{ct});
4586     redo A;
4587     } else {
4588     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4589     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4590     ## Reconsume.
4591     redo A;
4592     }
4593     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4594     if ($is_space->{$self->{nc}}) {
4595     ## Stay in the state.
4596     !!!next-input-character;
4597     redo A;
4598     } elsif ($self->{nc} == 0x0023) { # #
4599     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4600     !!!next-input-character;
4601     redo A;
4602     } elsif ($self->{nc} == 0x0022) { # "
4603     $self->{ca}->{value} = '';
4604     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4605     !!!next-input-character;
4606     redo A;
4607     } elsif ($self->{nc} == 0x0027) { # '
4608     $self->{ca}->{value} = '';
4609     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4610     !!!next-input-character;
4611     redo A;
4612     } elsif ($self->{nc} == 0x003E) { # >
4613     !!!parse-error (type => 'no attr default'); ## TODO: type
4614     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4615     !!!next-input-character;
4616     !!!emit ($self->{ct}); # ATTLIST
4617     redo A;
4618     } elsif ($self->{nc} == -1) {
4619     !!!parse-error (type => 'unclosed md'); ## TODO: type
4620     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4621     !!!next-input-character;
4622     !!!emit ($self->{ct});
4623     redo A;
4624     } else {
4625     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4626     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4627     ## Reconsume.
4628     redo A;
4629     }
4630     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4631     if ($is_space->{$self->{nc}}) {
4632     ## XML5: No parse error.
4633     !!!parse-error (type => 'no default type'); ## TODO: type
4634 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4635 wakaba 1.14 ## Reconsume.
4636     redo A;
4637 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4638     ## XML5: Same as "anything else".
4639     $self->{ca}->{value} = '';
4640     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4641     !!!next-input-character;
4642     redo A;
4643     } elsif ($self->{nc} == 0x0027) { # '
4644     ## XML5: Same as "anything else".
4645     $self->{ca}->{value} = '';
4646     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4647     !!!next-input-character;
4648     redo A;
4649     } elsif ($self->{nc} == 0x003E) { # >
4650     ## XML5: Same as "anything else".
4651     !!!parse-error (type => 'no attr default'); ## TODO: type
4652     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653     !!!next-input-character;
4654     !!!emit ($self->{ct}); # ATTLIST
4655     redo A;
4656     } elsif ($self->{nc} == -1) {
4657     ## XML5: No parse error.
4658     !!!parse-error (type => 'unclosed md'); ## TODO: type
4659     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4660     !!!next-input-character;
4661     !!!emit ($self->{ct});
4662     redo A;
4663     } else {
4664     $self->{ca}->{default} = chr $self->{nc};
4665     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4666     !!!next-input-character;
4667     redo A;
4668 wakaba 1.14 }
4669 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4670     if ($is_space->{$self->{nc}}) {
4671     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4672     !!!next-input-character;
4673     redo A;
4674     } elsif ($self->{nc} == 0x0022) { # "
4675     ## XML5: Same as "anything else".
4676     !!!parse-error (type => 'no space before default value'); ## TODO: type
4677     $self->{ca}->{value} = '';
4678     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4679     !!!next-input-character;
4680     redo A;
4681     } elsif ($self->{nc} == 0x0027) { # '
4682     ## XML5: Same as "anything else".
4683     !!!parse-error (type => 'no space before default value'); ## TODO: type
4684     $self->{ca}->{value} = '';
4685     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4686     !!!next-input-character;
4687     redo A;
4688     } elsif ($self->{nc} == 0x003E) { # >
4689     ## XML5: Same as "anything else".
4690     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4691     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4692     !!!next-input-character;
4693     !!!emit ($self->{ct}); # ATTLIST
4694     redo A;
4695     } elsif ($self->{nc} == -1) {
4696     ## XML5: No parse error.
4697     !!!parse-error (type => 'unclosed md'); ## TODO: type
4698     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4699     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4700     !!!next-input-character;
4701     !!!emit ($self->{ct});
4702     redo A;
4703     } else {
4704     $self->{ca}->{default} .= chr $self->{nc};
4705     ## Stay in the state.
4706     !!!next-input-character;
4707     redo A;
4708     }
4709     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4710     if ($is_space->{$self->{nc}}) {
4711     ## Stay in the state.
4712     !!!next-input-character;
4713     redo A;
4714     } elsif ($self->{nc} == 0x0022) { # "
4715     $self->{ca}->{value} = '';
4716     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4717     !!!next-input-character;
4718     redo A;
4719     } elsif ($self->{nc} == 0x0027) { # '
4720     $self->{ca}->{value} = '';
4721     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4722     !!!next-input-character;
4723     redo A;
4724     } elsif ($self->{nc} == 0x003E) { # >
4725     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4726     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4727     !!!next-input-character;
4728     !!!emit ($self->{ct}); # ATTLIST
4729     redo A;
4730     } elsif ($self->{nc} == -1) {
4731     ## XML5: No parse error.
4732     !!!parse-error (type => 'unclosed md'); ## TODO: type
4733     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4734     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4735     !!!next-input-character;
4736     !!!emit ($self->{ct});
4737     redo A;
4738     } else {
4739     ## XML5: Not defined yet.
4740     if ($self->{ca}->{default} eq 'FIXED') {
4741     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4742     } else {
4743     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4744     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4745     }
4746     ## Reconsume.
4747     redo A;
4748     }
4749     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4750     if ($is_space->{$self->{nc}} or
4751     $self->{nc} == -1 or
4752     $self->{nc} == 0x003E) { # >
4753     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4754     ## Reconsume.
4755     redo A;
4756     } else {
4757     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4758     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4759     ## Reconsume.
4760     redo A;
4761 wakaba 1.16 }
4762 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4763     ## ASCII case-insensitive
4764     if ($self->{nc} == [
4765     undef,
4766     0x0044, # D
4767     0x0041, # A
4768     0x0054, # T
4769     ]->[length $self->{kwd}] or
4770     $self->{nc} == [
4771     undef,
4772     0x0064, # d
4773     0x0061, # a
4774     0x0074, # t
4775     ]->[length $self->{kwd}]) {
4776     !!!cp (172.2);
4777     ## Stay in the state.
4778     $self->{kwd} .= chr $self->{nc};
4779     !!!next-input-character;
4780     redo A;
4781     } elsif ((length $self->{kwd}) == 4 and
4782     ($self->{nc} == 0x0041 or # A
4783     $self->{nc} == 0x0061)) { # a
4784     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4785     !!!cp (172.3);
4786     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4787     text => 'NDATA',
4788     line => $self->{line_prev},
4789     column => $self->{column_prev} - 4);
4790     } else {
4791     !!!cp (172.4);
4792     }
4793     $self->{state} = AFTER_NDATA_STATE;
4794     !!!next-input-character;
4795     redo A;
4796     } else {
4797     !!!parse-error (type => 'string after literal', ## TODO: type
4798     line => $self->{line_prev},
4799     column => $self->{column_prev} + 1
4800     - length $self->{kwd});
4801     !!!cp (172.5);
4802     $self->{state} = BOGUS_MD_STATE;
4803     ## Reconsume.
4804     redo A;
4805     }
4806     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4807     if ($is_space->{$self->{nc}}) {
4808     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4809     !!!next-input-character;
4810     redo A;
4811     } elsif ($self->{nc} == 0x003E) { # >
4812     !!!parse-error (type => 'no notation name'); ## TODO: type
4813     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4814     !!!next-input-character;
4815     !!!emit ($self->{ct}); # ENTITY
4816     redo A;
4817     } elsif ($self->{nc} == -1) {
4818     !!!parse-error (type => 'unclosed md'); ## TODO: type
4819     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4820     !!!next-input-character;
4821     !!!emit ($self->{ct}); # ENTITY
4822     redo A;
4823     } else {
4824     !!!parse-error (type => 'string after literal', ## TODO: type
4825     line => $self->{line_prev},
4826     column => $self->{column_prev} + 1
4827     - length $self->{kwd});
4828     $self->{state} = BOGUS_MD_STATE;
4829     ## Reconsume.
4830     redo A;
4831     }
4832     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4833     if ($is_space->{$self->{nc}}) {
4834     ## Stay in the state.
4835     !!!next-input-character;
4836     redo A;
4837     } elsif ($self->{nc} == 0x003E) { # >
4838     !!!parse-error (type => 'no notation name'); ## TODO: type
4839     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4840     !!!next-input-character;
4841     !!!emit ($self->{ct}); # ENTITY
4842     redo A;
4843     } elsif ($self->{nc} == -1) {
4844     !!!parse-error (type => 'unclosed md'); ## TODO: type
4845     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4846     !!!next-input-character;
4847     !!!emit ($self->{ct}); # ENTITY
4848     redo A;
4849     } else {
4850     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4851     $self->{state} = NOTATION_NAME_STATE;
4852     !!!next-input-character;
4853     redo A;
4854     }
4855     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4856     if ($is_space->{$self->{nc}}) {
4857 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4858 wakaba 1.18 !!!next-input-character;
4859     redo A;
4860     } elsif ($self->{nc} == 0x003E) { # >
4861     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4862     !!!next-input-character;
4863     !!!emit ($self->{ct}); # ENTITY
4864     redo A;
4865     } elsif ($self->{nc} == -1) {
4866     !!!parse-error (type => 'unclosed md'); ## TODO: type
4867     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4868     !!!next-input-character;
4869     !!!emit ($self->{ct}); # ENTITY
4870     redo A;
4871     } else {
4872     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4873     ## Stay in the state.
4874     !!!next-input-character;
4875     redo A;
4876     }
4877 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4878     if ($self->{nc} == 0x0022) { # "
4879 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4880 wakaba 1.19 !!!next-input-character;
4881     redo A;
4882     } elsif ($self->{nc} == 0x0026) { # &
4883     $self->{prev_state} = $self->{state};
4884     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4885     $self->{entity_add} = 0x0022; # "
4886     !!!next-input-character;
4887     redo A;
4888     ## TODO: %
4889     } elsif ($self->{nc} == -1) {
4890     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4891     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4892     ## Reconsume.
4893     !!!emit ($self->{ct}); # ENTITY
4894     redo A;
4895     } else {
4896     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4897     !!!next-input-character;
4898     redo A;
4899     }
4900     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4901     if ($self->{nc} == 0x0027) { # '
4902 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4903 wakaba 1.19 !!!next-input-character;
4904     redo A;
4905     } elsif ($self->{nc} == 0x0026) { # &
4906     $self->{prev_state} = $self->{state};
4907     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4908     $self->{entity_add} = 0x0027; # '
4909     !!!next-input-character;
4910     redo A;
4911     ## TODO: %
4912     } elsif ($self->{nc} == -1) {
4913     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4914     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4915     ## Reconsume.
4916     !!!emit ($self->{ct}); # ENTITY
4917     redo A;
4918     } else {
4919     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4920     !!!next-input-character;
4921     redo A;
4922     }
4923     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4924     if ($is_space->{$self->{nc}} or
4925     {
4926     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4927     $self->{entity_add} => 1,
4928     }->{$self->{nc}}) {
4929 wakaba 1.22 !!!parse-error (type => 'bare ero',
4930     line => $self->{line_prev},
4931     column => $self->{column_prev}
4932     + ($self->{nc} == -1 ? 1 : 0));
4933 wakaba 1.19 ## Don't consume
4934     ## Return nothing.
4935     #
4936     } elsif ($self->{nc} == 0x0023) { # #
4937     $self->{ca} = $self->{ct};
4938     $self->{state} = ENTITY_HASH_STATE;
4939     $self->{kwd} = '#';
4940     !!!next-input-character;
4941     redo A;
4942     } else {
4943     #
4944     }
4945    
4946     $self->{ct}->{value} .= '&';
4947     $self->{state} = $self->{prev_state};
4948     ## Reconsume.
4949     redo A;
4950 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4951     if ($is_space->{$self->{nc}}) {
4952     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4953     !!!next-input-character;
4954     redo A;
4955     } elsif ($self->{nc} == 0x0028) { # (
4956     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4957     $self->{ct}->{content} = ['('];
4958     $self->{group_depth} = 1;
4959     !!!next-input-character;
4960     redo A;
4961     } elsif ($self->{nc} == 0x003E) { # >
4962     !!!parse-error (type => 'no md def'); ## TODO: type
4963     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4964     !!!next-input-character;
4965     !!!emit ($self->{ct}); # ELEMENT
4966     redo A;
4967     } elsif ($self->{nc} == -1) {
4968     !!!parse-error (type => 'unclosed md'); ## TODO: type
4969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4970     !!!next-input-character;
4971     !!!emit ($self->{ct}); # ELEMENT
4972     redo A;
4973     } else {
4974     $self->{ct}->{content} = [chr $self->{nc}];
4975     $self->{state} = CONTENT_KEYWORD_STATE;
4976     !!!next-input-character;
4977     redo A;
4978     }
4979     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4980     if ($is_space->{$self->{nc}}) {
4981     $self->{state} = AFTER_MD_DEF_STATE;
4982     !!!next-input-character;
4983     redo A;
4984     } elsif ($self->{nc} == 0x003E) { # >
4985     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4986     !!!next-input-character;
4987     !!!emit ($self->{ct}); # ELEMENT
4988     redo A;
4989     } elsif ($self->{nc} == -1) {
4990     !!!parse-error (type => 'unclosed md'); ## TODO: type
4991     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4992     !!!next-input-character;
4993     !!!emit ($self->{ct}); # ELEMENT
4994     redo A;
4995     } else {
4996     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4997     ## Stay in the state.
4998     !!!next-input-character;
4999     redo A;
5000     }
5001     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
5002     if ($is_space->{$self->{nc}}) {
5003     ## Stay in the state.
5004     !!!next-input-character;
5005     redo A;
5006     } elsif ($self->{nc} == 0x0028) { # (
5007     $self->{group_depth}++;
5008     push @{$self->{ct}->{content}}, chr $self->{nc};
5009     ## Stay in the state.
5010     !!!next-input-character;
5011     redo A;
5012     } elsif ($self->{nc} == 0x007C or # |
5013     $self->{nc} == 0x002C) { # ,
5014     !!!parse-error (type => 'empty element name'); ## TODO: type
5015     ## Stay in the state.
5016     !!!next-input-character;
5017     redo A;
5018     } elsif ($self->{nc} == 0x0029) { # )
5019     !!!parse-error (type => 'empty element name'); ## TODO: type
5020     push @{$self->{ct}->{content}}, chr $self->{nc};
5021     $self->{group_depth}--;
5022     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5023     !!!next-input-character;
5024     redo A;
5025     } elsif ($self->{nc} == 0x003E) { # >
5026     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5027     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5028     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5029     !!!next-input-character;
5030     !!!emit ($self->{ct}); # ELEMENT
5031     redo A;
5032     } elsif ($self->{nc} == -1) {
5033     !!!parse-error (type => 'unclosed md'); ## TODO: type
5034     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5035     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5036     !!!next-input-character;
5037     !!!emit ($self->{ct}); # ELEMENT
5038     redo A;
5039     } else {
5040     push @{$self->{ct}->{content}}, chr $self->{nc};
5041     $self->{state} = CM_ELEMENT_NAME_STATE;
5042     !!!next-input-character;
5043     redo A;
5044     }
5045     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
5046     if ($is_space->{$self->{nc}}) {
5047     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5048     !!!next-input-character;
5049     redo A;
5050     } elsif ($self->{nc} == 0x002A or # *
5051     $self->{nc} == 0x002B or # +
5052     $self->{nc} == 0x003F) { # ?
5053     push @{$self->{ct}->{content}}, chr $self->{nc};
5054     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5055     !!!next-input-character;
5056     redo A;
5057     } elsif ($self->{nc} == 0x007C or # |
5058     $self->{nc} == 0x002C) { # ,
5059     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5060     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5061     !!!next-input-character;
5062     redo A;
5063     } elsif ($self->{nc} == 0x0029) { # )
5064     $self->{group_depth}--;
5065     push @{$self->{ct}->{content}}, chr $self->{nc};
5066     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5067     !!!next-input-character;
5068     redo A;
5069     } elsif ($self->{nc} == 0x003E) { # >
5070     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5071     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5072     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5073     !!!next-input-character;
5074     !!!emit ($self->{ct}); # ELEMENT
5075     redo A;
5076     } elsif ($self->{nc} == -1) {
5077     !!!parse-error (type => 'unclosed md'); ## TODO: type
5078     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5079     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5080     !!!next-input-character;
5081     !!!emit ($self->{ct}); # ELEMENT
5082     redo A;
5083     } else {
5084     $self->{ct}->{content}->[-1] .= chr $self->{nc};
5085     ## Stay in the state.
5086     !!!next-input-character;
5087     redo A;
5088     }
5089     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5090     if ($is_space->{$self->{nc}}) {
5091     ## Stay in the state.
5092     !!!next-input-character;
5093     redo A;
5094     } elsif ($self->{nc} == 0x007C or # |
5095     $self->{nc} == 0x002C) { # ,
5096     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5097     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5098     !!!next-input-character;
5099     redo A;
5100     } elsif ($self->{nc} == 0x0029) { # )
5101     $self->{group_depth}--;
5102     push @{$self->{ct}->{content}}, chr $self->{nc};
5103     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5104     !!!next-input-character;
5105     redo A;
5106     } elsif ($self->{nc} == 0x003E) { # >
5107     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5108     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5110     !!!next-input-character;
5111     !!!emit ($self->{ct}); # ELEMENT
5112     redo A;
5113     } elsif ($self->{nc} == -1) {
5114     !!!parse-error (type => 'unclosed md'); ## TODO: type
5115     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5116     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5117     !!!next-input-character;
5118     !!!emit ($self->{ct}); # ELEMENT
5119     redo A;
5120     } else {
5121     !!!parse-error (type => 'after element name'); ## TODO: type
5122     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5123     $self->{state} = BOGUS_MD_STATE;
5124     !!!next-input-character;
5125     redo A;
5126     }
5127     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5128     if ($is_space->{$self->{nc}}) {
5129     if ($self->{group_depth}) {
5130     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5131     } else {
5132     $self->{state} = AFTER_MD_DEF_STATE;
5133     }
5134     !!!next-input-character;
5135     redo A;
5136     } elsif ($self->{nc} == 0x002A or # *
5137     $self->{nc} == 0x002B or # +
5138     $self->{nc} == 0x003F) { # ?
5139     push @{$self->{ct}->{content}}, chr $self->{nc};
5140     if ($self->{group_depth}) {
5141     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5142     } else {
5143     $self->{state} = AFTER_MD_DEF_STATE;
5144     }
5145     !!!next-input-character;
5146     redo A;
5147     } elsif ($self->{nc} == 0x0029) { # )
5148     if ($self->{group_depth}) {
5149     $self->{group_depth}--;
5150     push @{$self->{ct}->{content}}, chr $self->{nc};
5151     ## Stay in the state.
5152     !!!next-input-character;
5153     redo A;
5154     } else {
5155     !!!parse-error (type => 'string after md def'); ## TODO: type
5156     $self->{state} = BOGUS_MD_STATE;
5157     ## Reconsume.
5158     redo A;
5159     }
5160     } elsif ($self->{nc} == 0x003E) { # >
5161     if ($self->{group_depth}) {
5162     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5163     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5164     }
5165     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5166     !!!next-input-character;
5167     !!!emit ($self->{ct}); # ELEMENT
5168     redo A;
5169     } elsif ($self->{nc} == -1) {
5170     !!!parse-error (type => 'unclosed md'); ## TODO: type
5171     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5172     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5173     !!!next-input-character;
5174     !!!emit ($self->{ct}); # ELEMENT
5175     redo A;
5176     } else {
5177     if ($self->{group_depth}) {
5178     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5179     } else {
5180     !!!parse-error (type => 'string after md def'); ## TODO: type
5181     $self->{state} = BOGUS_MD_STATE;
5182     }
5183     ## Reconsume.
5184     redo A;
5185     }
5186     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5187 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5188     ## Stay in the state.
5189     !!!next-input-character;
5190     redo A;
5191     } elsif ($self->{nc} == 0x003E) { # >
5192     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5193     !!!next-input-character;
5194 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5195 wakaba 1.18 redo A;
5196     } elsif ($self->{nc} == -1) {
5197     !!!parse-error (type => 'unclosed md'); ## TODO: type
5198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5199     !!!next-input-character;
5200 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5201 wakaba 1.18 redo A;
5202     } else {
5203 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5204 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5205     ## Reconsume.
5206     redo A;
5207     }
5208 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5209     if ($self->{nc} == 0x003E) { # >
5210     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5211     !!!next-input-character;
5212     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5213     redo A;
5214     } elsif ($self->{nc} == -1) {
5215     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5216     ## Reconsume.
5217     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5218     redo A;
5219     } else {
5220     ## Stay in the state.
5221     !!!next-input-character;
5222     redo A;
5223     }
5224 wakaba 1.1 } else {
5225     die "$0: $self->{state}: Unknown state";
5226     }
5227     } # A
5228    
5229     die "$0: _get_next_token: unexpected case";
5230     } # _get_next_token
5231    
5232     1;
5233 wakaba 1.32 ## $Date: 2009/09/05 09:26:55 $
5234 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24