/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.29 - (hide annotations) (download) (as text)
Sun Aug 16 04:06:34 2009 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.28: +20 -5 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	16 Aug 2009 04:05:04 -0000
	* tree-test-1.dat, tree-test-3.dat, tree-test-flow.dat,
	tree-test-foreign.dat, tree-test-form.dat, tree-test-phrasing.dat,
	tokenizer-test-1.test, tokenizer-test-2.dat, tokenizer-test-3.dat:
	DOCTYPE names are now normalized to its lowercased form (HTML5
	revision 2502).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	16 Aug 2009 04:06:26 -0000
2009-08-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Lowercase-fold doctype names (HTML5 revision
	2501, cf. HTML5 revision 3571).

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.29 our $VERSION=do{my @r=(q$Revision: 1.28 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205    
206     my $charref_map = {
207     0x0D => 0x000A,
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if state machine in the spec
251    
252     sub _initialize_tokenizer ($) {
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing};
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character
275     #$self->{next_nc}
276     !!!next-input-character;
277     $self->{token} = [];
278     # $self->{escape}
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
301     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
302     ## while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
315     my $is_space = {
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec not explicitly
736     ## states that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951     0x003D => 1, # =
952     }->{$self->{nc}}) {
953     !!!cp (55);
954 wakaba 1.11 ## XML5: Not a parse error.
955 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
956     } else {
957     !!!cp (56);
958 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
959 wakaba 1.1 }
960     $self->{ca}
961     = {name => chr ($self->{nc}),
962     value => '',
963     line => $self->{line}, column => $self->{column}};
964     $self->{state} = ATTRIBUTE_NAME_STATE;
965     !!!next-input-character;
966     redo A;
967     }
968     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 wakaba 1.11 ## XML5: "Tag attribute name state".
970    
971 wakaba 1.1 my $before_leave = sub {
972     if (exists $self->{ct}->{attributes} # start tag or end tag
973     ->{$self->{ca}->{name}}) { # MUST
974     !!!cp (57);
975     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976     ## Discard $self->{ca} # MUST
977     } else {
978     !!!cp (58);
979     $self->{ct}->{attributes}->{$self->{ca}->{name}}
980     = $self->{ca};
981 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 wakaba 1.1 }
983     }; # $before_leave
984    
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (59);
987     $before_leave->();
988     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003D) { # =
992     !!!cp (60);
993     $before_leave->();
994     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003E) { # >
998 wakaba 1.11 if ($self->{is_xml}) {
999     !!!cp (60.1);
1000     ## XML5: Not a parse error.
1001     !!!parse-error (type => 'no attr value'); ## TODO: type
1002     } else {
1003     !!!cp (60.2);
1004     }
1005    
1006 wakaba 1.1 $before_leave->();
1007     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008     !!!cp (61);
1009     $self->{last_stag_name} = $self->{ct}->{tag_name};
1010     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011     !!!cp (62);
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013     if ($self->{ct}->{attributes}) {
1014     !!!parse-error (type => 'end tag attribute');
1015     }
1016     } else {
1017     die "$0: $self->{ct}->{type}: Unknown token type";
1018     }
1019     $self->{state} = DATA_STATE;
1020 wakaba 1.5 $self->{s_kwd} = '';
1021 wakaba 1.1 !!!next-input-character;
1022    
1023     !!!emit ($self->{ct}); # start tag or end tag
1024    
1025     redo A;
1026     } elsif (0x0041 <= $self->{nc} and
1027     $self->{nc} <= 0x005A) { # A..Z
1028     !!!cp (63);
1029 wakaba 1.4 $self->{ca}->{name}
1030     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 wakaba 1.1 ## Stay in the state
1032     !!!next-input-character;
1033     redo A;
1034     } elsif ($self->{nc} == 0x002F) { # /
1035 wakaba 1.11 if ($self->{is_xml}) {
1036     !!!cp (64);
1037     ## XML5: Not a parse error.
1038     !!!parse-error (type => 'no attr value'); ## TODO: type
1039     } else {
1040     !!!cp (64.1);
1041     }
1042    
1043 wakaba 1.1 $before_leave->();
1044     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045     !!!next-input-character;
1046     redo A;
1047     } elsif ($self->{nc} == -1) {
1048     !!!parse-error (type => 'unclosed tag');
1049     $before_leave->();
1050     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051     !!!cp (66);
1052     $self->{last_stag_name} = $self->{ct}->{tag_name};
1053     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055     if ($self->{ct}->{attributes}) {
1056     !!!cp (67);
1057     !!!parse-error (type => 'end tag attribute');
1058     } else {
1059     ## NOTE: This state should never be reached.
1060     !!!cp (68);
1061     }
1062     } else {
1063     die "$0: $self->{ct}->{type}: Unknown token type";
1064     }
1065     $self->{state} = DATA_STATE;
1066 wakaba 1.5 $self->{s_kwd} = '';
1067 wakaba 1.1 # reconsume
1068    
1069     !!!emit ($self->{ct}); # start tag or end tag
1070    
1071     redo A;
1072     } else {
1073     if ($self->{nc} == 0x0022 or # "
1074     $self->{nc} == 0x0027) { # '
1075     !!!cp (69);
1076 wakaba 1.11 ## XML5: Not a parse error.
1077 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1078     } else {
1079     !!!cp (70);
1080     }
1081     $self->{ca}->{name} .= chr ($self->{nc});
1082     ## Stay in the state
1083     !!!next-input-character;
1084     redo A;
1085     }
1086     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 wakaba 1.11 ## XML5: "Tag attribute name after state".
1088    
1089 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1090     !!!cp (71);
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003D) { # =
1095     !!!cp (72);
1096     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003E) { # >
1100 wakaba 1.11 if ($self->{is_xml}) {
1101     !!!cp (72.1);
1102     ## XML5: Not a parse error.
1103     !!!parse-error (type => 'no attr value'); ## TODO: type
1104     } else {
1105     !!!cp (72.2);
1106     }
1107    
1108 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109     !!!cp (73);
1110     $self->{last_stag_name} = $self->{ct}->{tag_name};
1111     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     !!!cp (74);
1115     !!!parse-error (type => 'end tag attribute');
1116     } else {
1117     ## NOTE: This state should never be reached.
1118     !!!cp (75);
1119     }
1120     } else {
1121     die "$0: $self->{ct}->{type}: Unknown token type";
1122     }
1123     $self->{state} = DATA_STATE;
1124 wakaba 1.5 $self->{s_kwd} = '';
1125 wakaba 1.1 !!!next-input-character;
1126    
1127     !!!emit ($self->{ct}); # start tag or end tag
1128    
1129     redo A;
1130     } elsif (0x0041 <= $self->{nc} and
1131     $self->{nc} <= 0x005A) { # A..Z
1132     !!!cp (76);
1133     $self->{ca}
1134 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 wakaba 1.1 value => '',
1136     line => $self->{line}, column => $self->{column}};
1137     $self->{state} = ATTRIBUTE_NAME_STATE;
1138     !!!next-input-character;
1139     redo A;
1140     } elsif ($self->{nc} == 0x002F) { # /
1141 wakaba 1.11 if ($self->{is_xml}) {
1142     !!!cp (77);
1143     ## XML5: Not a parse error.
1144     !!!parse-error (type => 'no attr value'); ## TODO: type
1145     } else {
1146     !!!cp (77.1);
1147     }
1148    
1149 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150     !!!next-input-character;
1151     redo A;
1152     } elsif ($self->{nc} == -1) {
1153     !!!parse-error (type => 'unclosed tag');
1154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155     !!!cp (79);
1156     $self->{last_stag_name} = $self->{ct}->{tag_name};
1157     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159     if ($self->{ct}->{attributes}) {
1160     !!!cp (80);
1161     !!!parse-error (type => 'end tag attribute');
1162     } else {
1163     ## NOTE: This state should never be reached.
1164     !!!cp (81);
1165     }
1166     } else {
1167     die "$0: $self->{ct}->{type}: Unknown token type";
1168     }
1169 wakaba 1.5 $self->{s_kwd} = '';
1170 wakaba 1.1 $self->{state} = DATA_STATE;
1171     # reconsume
1172    
1173     !!!emit ($self->{ct}); # start tag or end tag
1174    
1175     redo A;
1176     } else {
1177 wakaba 1.11 if ($self->{is_xml}) {
1178     !!!cp (78.1);
1179     ## XML5: Not a parse error.
1180     !!!parse-error (type => 'no attr value'); ## TODO: type
1181     } else {
1182     !!!cp (78.2);
1183     }
1184    
1185 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1186     $self->{nc} == 0x0027) { # '
1187     !!!cp (78);
1188 wakaba 1.11 ## XML5: Not a parse error.
1189 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1190     } else {
1191     !!!cp (82);
1192     }
1193     $self->{ca}
1194     = {name => chr ($self->{nc}),
1195     value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198     !!!next-input-character;
1199     redo A;
1200     }
1201     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 wakaba 1.11 ## XML5: "Tag attribute value before state".
1203    
1204 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1205     !!!cp (83);
1206     ## Stay in the state
1207     !!!next-input-character;
1208     redo A;
1209     } elsif ($self->{nc} == 0x0022) { # "
1210     !!!cp (84);
1211     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x0026) { # &
1215     !!!cp (85);
1216     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217     ## reconsume
1218     redo A;
1219     } elsif ($self->{nc} == 0x0027) { # '
1220     !!!cp (86);
1221     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222     !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{nc} == 0x003E) { # >
1225     !!!parse-error (type => 'empty unquoted attribute value');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227     !!!cp (87);
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232     !!!cp (88);
1233     !!!parse-error (type => 'end tag attribute');
1234     } else {
1235     ## NOTE: This state should never be reached.
1236     !!!cp (89);
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 !!!next-input-character;
1244    
1245     !!!emit ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } elsif ($self->{nc} == -1) {
1249     !!!parse-error (type => 'unclosed tag');
1250     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251     !!!cp (90);
1252     $self->{last_stag_name} = $self->{ct}->{tag_name};
1253     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255     if ($self->{ct}->{attributes}) {
1256     !!!cp (91);
1257     !!!parse-error (type => 'end tag attribute');
1258     } else {
1259     ## NOTE: This state should never be reached.
1260     !!!cp (92);
1261     }
1262     } else {
1263     die "$0: $self->{ct}->{type}: Unknown token type";
1264     }
1265     $self->{state} = DATA_STATE;
1266 wakaba 1.5 $self->{s_kwd} = '';
1267 wakaba 1.1 ## reconsume
1268    
1269     !!!emit ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } else {
1273 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274 wakaba 1.1 !!!cp (93);
1275 wakaba 1.11 ## XML5: Not a parse error.
1276 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1277 wakaba 1.11 } elsif ($self->{is_xml}) {
1278     !!!cp (93.1);
1279     ## XML5: No parse error.
1280     !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 wakaba 1.1 } else {
1282     !!!cp (94);
1283     }
1284     $self->{ca}->{value} .= chr ($self->{nc});
1285     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286     !!!next-input-character;
1287     redo A;
1288     }
1289     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291     ## ATTLIST attribute value double quoted state".
1292 wakaba 1.11
1293 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1294 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295     !!!cp (95.1);
1296     ## XML5: "DOCTYPE ATTLIST name after state".
1297     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299     } else {
1300     !!!cp (95);
1301     ## XML5: "Tag attribute name before state".
1302     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     }
1304 wakaba 1.1 !!!next-input-character;
1305     redo A;
1306     } elsif ($self->{nc} == 0x0026) { # &
1307     !!!cp (96);
1308 wakaba 1.11 ## XML5: Not defined yet.
1309    
1310 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1311     ## "entity in attribute value state". In this implementation, the
1312     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313     ## implementation of the "consume a character reference" algorithm.
1314     $self->{prev_state} = $self->{state};
1315     $self->{entity_add} = 0x0022; # "
1316     $self->{state} = ENTITY_STATE;
1317     !!!next-input-character;
1318     redo A;
1319 wakaba 1.25 } elsif ($self->{is_xml} and
1320     $is_space->{$self->{nc}}) {
1321     !!!cp (97.1);
1322     $self->{ca}->{value} .= ' ';
1323     ## Stay in the state.
1324     !!!next-input-character;
1325     redo A;
1326 wakaba 1.1 } elsif ($self->{nc} == -1) {
1327     !!!parse-error (type => 'unclosed attribute value');
1328     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329     !!!cp (97);
1330     $self->{last_stag_name} = $self->{ct}->{tag_name};
1331 wakaba 1.15
1332     $self->{state} = DATA_STATE;
1333     $self->{s_kwd} = '';
1334     ## reconsume
1335     !!!emit ($self->{ct}); # start tag
1336     redo A;
1337 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339     if ($self->{ct}->{attributes}) {
1340     !!!cp (98);
1341     !!!parse-error (type => 'end tag attribute');
1342     } else {
1343     ## NOTE: This state should never be reached.
1344     !!!cp (99);
1345     }
1346 wakaba 1.15
1347     $self->{state} = DATA_STATE;
1348     $self->{s_kwd} = '';
1349     ## reconsume
1350     !!!emit ($self->{ct}); # end tag
1351     redo A;
1352     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353     ## XML5: No parse error above; not defined yet.
1354     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356     ## Reconsume.
1357     !!!emit ($self->{ct}); # ATTLIST
1358     redo A;
1359 wakaba 1.1 } else {
1360     die "$0: $self->{ct}->{type}: Unknown token type";
1361     }
1362     } else {
1363 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1364 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365     !!!cp (100);
1366     ## XML5: Not a parse error.
1367     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368     } else {
1369     !!!cp (100.1);
1370     }
1371 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1372     $self->{read_until}->($self->{ca}->{value},
1373 wakaba 1.25 qq["&<\x09\x0C\x20],
1374 wakaba 1.1 length $self->{ca}->{value});
1375    
1376     ## Stay in the state
1377     !!!next-input-character;
1378     redo A;
1379     }
1380     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1381 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382     ## ATTLIST attribute value single quoted state".
1383 wakaba 1.11
1384 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1385 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386     !!!cp (101.1);
1387     ## XML5: "DOCTYPE ATTLIST name after state".
1388     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390     } else {
1391     !!!cp (101);
1392     ## XML5: "Before attribute name state" (sic).
1393     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394     }
1395 wakaba 1.1 !!!next-input-character;
1396     redo A;
1397     } elsif ($self->{nc} == 0x0026) { # &
1398     !!!cp (102);
1399 wakaba 1.11 ## XML5: Not defined yet.
1400    
1401 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1402     ## "entity in attribute value state". In this implementation, the
1403     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404     ## implementation of the "consume a character reference" algorithm.
1405     $self->{entity_add} = 0x0027; # '
1406     $self->{prev_state} = $self->{state};
1407     $self->{state} = ENTITY_STATE;
1408     !!!next-input-character;
1409     redo A;
1410 wakaba 1.25 } elsif ($self->{is_xml} and
1411     $is_space->{$self->{nc}}) {
1412     !!!cp (103.1);
1413     $self->{ca}->{value} .= ' ';
1414     ## Stay in the state.
1415     !!!next-input-character;
1416     redo A;
1417 wakaba 1.1 } elsif ($self->{nc} == -1) {
1418     !!!parse-error (type => 'unclosed attribute value');
1419     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420     !!!cp (103);
1421     $self->{last_stag_name} = $self->{ct}->{tag_name};
1422 wakaba 1.15
1423     $self->{state} = DATA_STATE;
1424     $self->{s_kwd} = '';
1425     ## reconsume
1426     !!!emit ($self->{ct}); # start tag
1427     redo A;
1428 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430     if ($self->{ct}->{attributes}) {
1431     !!!cp (104);
1432     !!!parse-error (type => 'end tag attribute');
1433     } else {
1434     ## NOTE: This state should never be reached.
1435     !!!cp (105);
1436     }
1437 wakaba 1.15
1438     $self->{state} = DATA_STATE;
1439     $self->{s_kwd} = '';
1440     ## reconsume
1441     !!!emit ($self->{ct}); # end tag
1442     redo A;
1443     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444     ## XML5: No parse error above; not defined yet.
1445     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447     ## Reconsume.
1448     !!!emit ($self->{ct}); # ATTLIST
1449     redo A;
1450 wakaba 1.1 } else {
1451     die "$0: $self->{ct}->{type}: Unknown token type";
1452     }
1453     } else {
1454 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1455 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456     !!!cp (106);
1457     ## XML5: Not a parse error.
1458     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459     } else {
1460     !!!cp (106.1);
1461     }
1462 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1463     $self->{read_until}->($self->{ca}->{value},
1464 wakaba 1.25 qq['&<\x09\x0C\x20],
1465 wakaba 1.1 length $self->{ca}->{value});
1466    
1467     ## Stay in the state
1468     !!!next-input-character;
1469     redo A;
1470     }
1471     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1472 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1473    
1474 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1475 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476     !!!cp (107.1);
1477     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479     } else {
1480     !!!cp (107);
1481     ## XML5: "Tag attribute name before state".
1482     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483     }
1484 wakaba 1.1 !!!next-input-character;
1485     redo A;
1486     } elsif ($self->{nc} == 0x0026) { # &
1487     !!!cp (108);
1488 wakaba 1.11
1489     ## XML5: Not defined yet.
1490    
1491 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1492     ## "entity in attribute value state". In this implementation, the
1493     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494     ## implementation of the "consume a character reference" algorithm.
1495     $self->{entity_add} = -1;
1496     $self->{prev_state} = $self->{state};
1497     $self->{state} = ENTITY_STATE;
1498     !!!next-input-character;
1499     redo A;
1500     } elsif ($self->{nc} == 0x003E) { # >
1501     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502     !!!cp (109);
1503     $self->{last_stag_name} = $self->{ct}->{tag_name};
1504 wakaba 1.15
1505     $self->{state} = DATA_STATE;
1506     $self->{s_kwd} = '';
1507     !!!next-input-character;
1508     !!!emit ($self->{ct}); # start tag
1509     redo A;
1510 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512     if ($self->{ct}->{attributes}) {
1513     !!!cp (110);
1514     !!!parse-error (type => 'end tag attribute');
1515     } else {
1516     ## NOTE: This state should never be reached.
1517     !!!cp (111);
1518     }
1519 wakaba 1.15
1520     $self->{state} = DATA_STATE;
1521     $self->{s_kwd} = '';
1522     !!!next-input-character;
1523     !!!emit ($self->{ct}); # end tag
1524     redo A;
1525     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528     !!!next-input-character;
1529     !!!emit ($self->{ct}); # ATTLIST
1530     redo A;
1531 wakaba 1.1 } else {
1532     die "$0: $self->{ct}->{type}: Unknown token type";
1533     }
1534     } elsif ($self->{nc} == -1) {
1535     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536     !!!cp (112);
1537 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1538 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539 wakaba 1.15
1540     $self->{state} = DATA_STATE;
1541     $self->{s_kwd} = '';
1542     ## reconsume
1543     !!!emit ($self->{ct}); # start tag
1544     redo A;
1545 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1547 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548     if ($self->{ct}->{attributes}) {
1549     !!!cp (113);
1550     !!!parse-error (type => 'end tag attribute');
1551     } else {
1552     ## NOTE: This state should never be reached.
1553     !!!cp (114);
1554     }
1555 wakaba 1.15
1556     $self->{state} = DATA_STATE;
1557     $self->{s_kwd} = '';
1558     ## reconsume
1559     !!!emit ($self->{ct}); # end tag
1560     redo A;
1561     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562     !!!parse-error (type => 'unclosed md'); ## TODO: type
1563     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565     ## Reconsume.
1566     !!!emit ($self->{ct}); # ATTLIST
1567     redo A;
1568 wakaba 1.1 } else {
1569     die "$0: $self->{ct}->{type}: Unknown token type";
1570     }
1571     } else {
1572     if ({
1573     0x0022 => 1, # "
1574     0x0027 => 1, # '
1575     0x003D => 1, # =
1576 wakaba 1.26 0x003C => 1, # <
1577 wakaba 1.1 }->{$self->{nc}}) {
1578     !!!cp (115);
1579 wakaba 1.11 ## XML5: Not a parse error.
1580 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1581     } else {
1582     !!!cp (116);
1583     }
1584     $self->{ca}->{value} .= chr ($self->{nc});
1585     $self->{read_until}->($self->{ca}->{value},
1586 wakaba 1.25 qq["'=& \x09\x0C>],
1587 wakaba 1.1 length $self->{ca}->{value});
1588    
1589     ## Stay in the state
1590     !!!next-input-character;
1591     redo A;
1592     }
1593     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1594     if ($is_space->{$self->{nc}}) {
1595     !!!cp (118);
1596     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1597     !!!next-input-character;
1598     redo A;
1599     } elsif ($self->{nc} == 0x003E) { # >
1600     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601     !!!cp (119);
1602     $self->{last_stag_name} = $self->{ct}->{tag_name};
1603     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605     if ($self->{ct}->{attributes}) {
1606     !!!cp (120);
1607     !!!parse-error (type => 'end tag attribute');
1608     } else {
1609     ## NOTE: This state should never be reached.
1610     !!!cp (121);
1611     }
1612     } else {
1613     die "$0: $self->{ct}->{type}: Unknown token type";
1614     }
1615     $self->{state} = DATA_STATE;
1616 wakaba 1.5 $self->{s_kwd} = '';
1617 wakaba 1.1 !!!next-input-character;
1618    
1619     !!!emit ($self->{ct}); # start tag or end tag
1620    
1621     redo A;
1622     } elsif ($self->{nc} == 0x002F) { # /
1623     !!!cp (122);
1624     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625     !!!next-input-character;
1626     redo A;
1627     } elsif ($self->{nc} == -1) {
1628     !!!parse-error (type => 'unclosed tag');
1629     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1630     !!!cp (122.3);
1631     $self->{last_stag_name} = $self->{ct}->{tag_name};
1632     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1633     if ($self->{ct}->{attributes}) {
1634     !!!cp (122.1);
1635     !!!parse-error (type => 'end tag attribute');
1636     } else {
1637     ## NOTE: This state should never be reached.
1638     !!!cp (122.2);
1639     }
1640     } else {
1641     die "$0: $self->{ct}->{type}: Unknown token type";
1642     }
1643     $self->{state} = DATA_STATE;
1644 wakaba 1.5 $self->{s_kwd} = '';
1645 wakaba 1.1 ## Reconsume.
1646     !!!emit ($self->{ct}); # start tag or end tag
1647     redo A;
1648     } else {
1649     !!!cp ('124.1');
1650     !!!parse-error (type => 'no space between attributes');
1651     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1652     ## reconsume
1653     redo A;
1654     }
1655     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1656 wakaba 1.11 ## XML5: "Empty tag state".
1657    
1658 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1659     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     !!!cp ('124.2');
1661     !!!parse-error (type => 'nestc', token => $self->{ct});
1662     ## TODO: Different type than slash in start tag
1663     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664     if ($self->{ct}->{attributes}) {
1665     !!!cp ('124.4');
1666     !!!parse-error (type => 'end tag attribute');
1667     } else {
1668     !!!cp ('124.5');
1669     }
1670     ## TODO: Test |<title></title/>|
1671     } else {
1672     !!!cp ('124.3');
1673     $self->{self_closing} = 1;
1674     }
1675    
1676     $self->{state} = DATA_STATE;
1677 wakaba 1.5 $self->{s_kwd} = '';
1678 wakaba 1.1 !!!next-input-character;
1679    
1680     !!!emit ($self->{ct}); # start tag or end tag
1681    
1682     redo A;
1683     } elsif ($self->{nc} == -1) {
1684     !!!parse-error (type => 'unclosed tag');
1685     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1686     !!!cp (124.7);
1687     $self->{last_stag_name} = $self->{ct}->{tag_name};
1688     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1689     if ($self->{ct}->{attributes}) {
1690     !!!cp (124.5);
1691     !!!parse-error (type => 'end tag attribute');
1692     } else {
1693     ## NOTE: This state should never be reached.
1694     !!!cp (124.6);
1695     }
1696     } else {
1697     die "$0: $self->{ct}->{type}: Unknown token type";
1698     }
1699 wakaba 1.11 ## XML5: "Tag attribute name before state".
1700 wakaba 1.1 $self->{state} = DATA_STATE;
1701 wakaba 1.5 $self->{s_kwd} = '';
1702 wakaba 1.1 ## Reconsume.
1703     !!!emit ($self->{ct}); # start tag or end tag
1704     redo A;
1705     } else {
1706     !!!cp ('124.4');
1707     !!!parse-error (type => 'nestc');
1708     ## TODO: This error type is wrong.
1709     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1710     ## Reconsume.
1711     redo A;
1712     }
1713     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1714 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1715    
1716 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1717     ## consumes characters one-by-one basis.
1718    
1719     if ($self->{nc} == 0x003E) { # >
1720 wakaba 1.13 if ($self->{in_subset}) {
1721     !!!cp (123);
1722     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1723     } else {
1724     !!!cp (124);
1725     $self->{state} = DATA_STATE;
1726     $self->{s_kwd} = '';
1727     }
1728 wakaba 1.1 !!!next-input-character;
1729    
1730     !!!emit ($self->{ct}); # comment
1731     redo A;
1732     } elsif ($self->{nc} == -1) {
1733 wakaba 1.13 if ($self->{in_subset}) {
1734     !!!cp (125.1);
1735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1736     } else {
1737     !!!cp (125);
1738     $self->{state} = DATA_STATE;
1739     $self->{s_kwd} = '';
1740     }
1741 wakaba 1.1 ## reconsume
1742    
1743     !!!emit ($self->{ct}); # comment
1744     redo A;
1745     } else {
1746     !!!cp (126);
1747     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748     $self->{read_until}->($self->{ct}->{data},
1749     q[>],
1750     length $self->{ct}->{data});
1751    
1752     ## Stay in the state.
1753     !!!next-input-character;
1754     redo A;
1755     }
1756     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1757 wakaba 1.14 ## XML5: "Markup declaration state".
1758 wakaba 1.1
1759     if ($self->{nc} == 0x002D) { # -
1760     !!!cp (133);
1761     $self->{state} = MD_HYPHEN_STATE;
1762     !!!next-input-character;
1763     redo A;
1764     } elsif ($self->{nc} == 0x0044 or # D
1765     $self->{nc} == 0x0064) { # d
1766     ## ASCII case-insensitive.
1767     !!!cp (130);
1768     $self->{state} = MD_DOCTYPE_STATE;
1769 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1770 wakaba 1.1 !!!next-input-character;
1771     redo A;
1772 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1773     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1774     $self->{is_xml}) and
1775 wakaba 1.1 $self->{nc} == 0x005B) { # [
1776     !!!cp (135.4);
1777     $self->{state} = MD_CDATA_STATE;
1778 wakaba 1.12 $self->{kwd} = '[';
1779 wakaba 1.1 !!!next-input-character;
1780     redo A;
1781     } else {
1782     !!!cp (136);
1783     }
1784    
1785     !!!parse-error (type => 'bogus comment',
1786     line => $self->{line_prev},
1787     column => $self->{column_prev} - 1);
1788     ## Reconsume.
1789     $self->{state} = BOGUS_COMMENT_STATE;
1790     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1791     line => $self->{line_prev},
1792     column => $self->{column_prev} - 1,
1793     };
1794     redo A;
1795     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1796     if ($self->{nc} == 0x002D) { # -
1797     !!!cp (127);
1798     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799     line => $self->{line_prev},
1800     column => $self->{column_prev} - 2,
1801     };
1802 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1803 wakaba 1.1 !!!next-input-character;
1804     redo A;
1805     } else {
1806     !!!cp (128);
1807     !!!parse-error (type => 'bogus comment',
1808     line => $self->{line_prev},
1809     column => $self->{column_prev} - 2);
1810     $self->{state} = BOGUS_COMMENT_STATE;
1811     ## Reconsume.
1812     $self->{ct} = {type => COMMENT_TOKEN,
1813     data => '-',
1814     line => $self->{line_prev},
1815     column => $self->{column_prev} - 2,
1816     };
1817     redo A;
1818     }
1819     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1820     ## ASCII case-insensitive.
1821     if ($self->{nc} == [
1822     undef,
1823     0x004F, # O
1824     0x0043, # C
1825     0x0054, # T
1826     0x0059, # Y
1827     0x0050, # P
1828 wakaba 1.12 ]->[length $self->{kwd}] or
1829 wakaba 1.1 $self->{nc} == [
1830     undef,
1831     0x006F, # o
1832     0x0063, # c
1833     0x0074, # t
1834     0x0079, # y
1835     0x0070, # p
1836 wakaba 1.12 ]->[length $self->{kwd}]) {
1837 wakaba 1.1 !!!cp (131);
1838     ## Stay in the state.
1839 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1840 wakaba 1.1 !!!next-input-character;
1841     redo A;
1842 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1843 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1844     $self->{nc} == 0x0065)) { # e
1845 wakaba 1.12 if ($self->{is_xml} and
1846     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1847 wakaba 1.10 !!!cp (129);
1848     ## XML5: case-sensitive.
1849     !!!parse-error (type => 'lowercase keyword', ## TODO
1850     text => 'DOCTYPE',
1851     line => $self->{line_prev},
1852     column => $self->{column_prev} - 5);
1853     } else {
1854     !!!cp (129.1);
1855     }
1856 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1857     $self->{ct} = {type => DOCTYPE_TOKEN,
1858     quirks => 1,
1859     line => $self->{line_prev},
1860     column => $self->{column_prev} - 7,
1861     };
1862     !!!next-input-character;
1863     redo A;
1864     } else {
1865     !!!cp (132);
1866     !!!parse-error (type => 'bogus comment',
1867     line => $self->{line_prev},
1868 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1869 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1870     ## Reconsume.
1871     $self->{ct} = {type => COMMENT_TOKEN,
1872 wakaba 1.12 data => $self->{kwd},
1873 wakaba 1.1 line => $self->{line_prev},
1874 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1875 wakaba 1.1 };
1876     redo A;
1877     }
1878     } elsif ($self->{state} == MD_CDATA_STATE) {
1879     if ($self->{nc} == {
1880     '[' => 0x0043, # C
1881     '[C' => 0x0044, # D
1882     '[CD' => 0x0041, # A
1883     '[CDA' => 0x0054, # T
1884     '[CDAT' => 0x0041, # A
1885 wakaba 1.12 }->{$self->{kwd}}) {
1886 wakaba 1.1 !!!cp (135.1);
1887     ## Stay in the state.
1888 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1889 wakaba 1.1 !!!next-input-character;
1890     redo A;
1891 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1892 wakaba 1.1 $self->{nc} == 0x005B) { # [
1893 wakaba 1.6 if ($self->{is_xml} and
1894     not $self->{tainted} and
1895     @{$self->{open_elements} or []} == 0) {
1896 wakaba 1.8 !!!cp (135.2);
1897 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1898     line => $self->{line_prev},
1899     column => $self->{column_prev} - 7);
1900     $self->{tainted} = 1;
1901 wakaba 1.8 } else {
1902     !!!cp (135.21);
1903 wakaba 1.6 }
1904    
1905 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1906     data => '',
1907     line => $self->{line_prev},
1908     column => $self->{column_prev} - 7};
1909     $self->{state} = CDATA_SECTION_STATE;
1910     !!!next-input-character;
1911     redo A;
1912     } else {
1913     !!!cp (135.3);
1914     !!!parse-error (type => 'bogus comment',
1915     line => $self->{line_prev},
1916 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1917 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1918     ## Reconsume.
1919     $self->{ct} = {type => COMMENT_TOKEN,
1920 wakaba 1.12 data => $self->{kwd},
1921 wakaba 1.1 line => $self->{line_prev},
1922 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1923 wakaba 1.1 };
1924     redo A;
1925     }
1926     } elsif ($self->{state} == COMMENT_START_STATE) {
1927     if ($self->{nc} == 0x002D) { # -
1928     !!!cp (137);
1929     $self->{state} = COMMENT_START_DASH_STATE;
1930     !!!next-input-character;
1931     redo A;
1932     } elsif ($self->{nc} == 0x003E) { # >
1933     !!!parse-error (type => 'bogus comment');
1934 wakaba 1.13 if ($self->{in_subset}) {
1935     !!!cp (138.1);
1936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937     } else {
1938     !!!cp (138);
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     }
1942 wakaba 1.1 !!!next-input-character;
1943    
1944     !!!emit ($self->{ct}); # comment
1945    
1946     redo A;
1947     } elsif ($self->{nc} == -1) {
1948     !!!parse-error (type => 'unclosed comment');
1949 wakaba 1.13 if ($self->{in_subset}) {
1950     !!!cp (139.1);
1951     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1952     } else {
1953     !!!cp (139);
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     }
1957 wakaba 1.1 ## reconsume
1958    
1959     !!!emit ($self->{ct}); # comment
1960    
1961     redo A;
1962     } else {
1963     !!!cp (140);
1964     $self->{ct}->{data} # comment
1965     .= chr ($self->{nc});
1966     $self->{state} = COMMENT_STATE;
1967     !!!next-input-character;
1968     redo A;
1969     }
1970     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1971     if ($self->{nc} == 0x002D) { # -
1972     !!!cp (141);
1973     $self->{state} = COMMENT_END_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     } elsif ($self->{nc} == 0x003E) { # >
1977     !!!parse-error (type => 'bogus comment');
1978 wakaba 1.13 if ($self->{in_subset}) {
1979     !!!cp (142.1);
1980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981     } else {
1982     !!!cp (142);
1983     $self->{state} = DATA_STATE;
1984     $self->{s_kwd} = '';
1985     }
1986 wakaba 1.1 !!!next-input-character;
1987    
1988     !!!emit ($self->{ct}); # comment
1989    
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (143.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (143);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (144);
2008     $self->{ct}->{data} # comment
2009     .= '-' . chr ($self->{nc});
2010     $self->{state} = COMMENT_STATE;
2011     !!!next-input-character;
2012     redo A;
2013     }
2014     } elsif ($self->{state} == COMMENT_STATE) {
2015 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2016    
2017 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2018     !!!cp (145);
2019     $self->{state} = COMMENT_END_DASH_STATE;
2020     !!!next-input-character;
2021     redo A;
2022     } elsif ($self->{nc} == -1) {
2023     !!!parse-error (type => 'unclosed comment');
2024 wakaba 1.13 if ($self->{in_subset}) {
2025     !!!cp (146.1);
2026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2027     } else {
2028     !!!cp (146);
2029     $self->{state} = DATA_STATE;
2030     $self->{s_kwd} = '';
2031     }
2032 wakaba 1.1 ## reconsume
2033    
2034     !!!emit ($self->{ct}); # comment
2035    
2036     redo A;
2037     } else {
2038     !!!cp (147);
2039     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2040     $self->{read_until}->($self->{ct}->{data},
2041     q[-],
2042     length $self->{ct}->{data});
2043    
2044     ## Stay in the state
2045     !!!next-input-character;
2046     redo A;
2047     }
2048     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2049 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2050 wakaba 1.10
2051 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2052     !!!cp (148);
2053     $self->{state} = COMMENT_END_STATE;
2054     !!!next-input-character;
2055     redo A;
2056     } elsif ($self->{nc} == -1) {
2057     !!!parse-error (type => 'unclosed comment');
2058 wakaba 1.13 if ($self->{in_subset}) {
2059     !!!cp (149.1);
2060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061     } else {
2062     !!!cp (149);
2063     $self->{state} = DATA_STATE;
2064     $self->{s_kwd} = '';
2065     }
2066 wakaba 1.1 ## reconsume
2067    
2068     !!!emit ($self->{ct}); # comment
2069    
2070     redo A;
2071     } else {
2072     !!!cp (150);
2073     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2074     $self->{state} = COMMENT_STATE;
2075     !!!next-input-character;
2076     redo A;
2077     }
2078     } elsif ($self->{state} == COMMENT_END_STATE) {
2079 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2080    
2081 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2082 wakaba 1.13 if ($self->{in_subset}) {
2083     !!!cp (151.1);
2084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085     } else {
2086     !!!cp (151);
2087     $self->{state} = DATA_STATE;
2088     $self->{s_kwd} = '';
2089     }
2090 wakaba 1.1 !!!next-input-character;
2091    
2092     !!!emit ($self->{ct}); # comment
2093    
2094     redo A;
2095     } elsif ($self->{nc} == 0x002D) { # -
2096     !!!cp (152);
2097 wakaba 1.10 ## XML5: Not a parse error.
2098 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2099     line => $self->{line_prev},
2100     column => $self->{column_prev});
2101     $self->{ct}->{data} .= '-'; # comment
2102     ## Stay in the state
2103     !!!next-input-character;
2104     redo A;
2105     } elsif ($self->{nc} == -1) {
2106     !!!parse-error (type => 'unclosed comment');
2107 wakaba 1.13 if ($self->{in_subset}) {
2108     !!!cp (153.1);
2109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2110     } else {
2111     !!!cp (153);
2112     $self->{state} = DATA_STATE;
2113     $self->{s_kwd} = '';
2114     }
2115 wakaba 1.1 ## reconsume
2116    
2117     !!!emit ($self->{ct}); # comment
2118    
2119     redo A;
2120     } else {
2121     !!!cp (154);
2122     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2123     $self->{state} = COMMENT_STATE;
2124     !!!next-input-character;
2125     redo A;
2126     }
2127     } elsif ($self->{state} == DOCTYPE_STATE) {
2128     if ($is_space->{$self->{nc}}) {
2129     !!!cp (155);
2130     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2131     !!!next-input-character;
2132     redo A;
2133 wakaba 1.28 } elsif ($self->{nc} == -1) {
2134     !!!cp (155.1);
2135     !!!parse-error (type => 'unclosed DOCTYPE');
2136     $self->{ct}->{quirks} = 1;
2137    
2138     $self->{state} = DATA_STATE;
2139     ## Reconsume.
2140     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2141    
2142     redo A;
2143 wakaba 1.1 } else {
2144     !!!cp (156);
2145 wakaba 1.28 ## XML5: Swith to the bogus comment state.
2146 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2147     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2148     ## reconsume
2149     redo A;
2150     }
2151     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2152 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2153    
2154 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2155     !!!cp (157);
2156     ## Stay in the state
2157     !!!next-input-character;
2158     redo A;
2159     } elsif ($self->{nc} == 0x003E) { # >
2160     !!!cp (158);
2161 wakaba 1.12 ## XML5: No parse error.
2162 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2163     $self->{state} = DATA_STATE;
2164 wakaba 1.5 $self->{s_kwd} = '';
2165 wakaba 1.1 !!!next-input-character;
2166    
2167     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2168    
2169     redo A;
2170 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2171     !!!cp (158.1);
2172     $self->{ct}->{name} # DOCTYPE
2173     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2174     delete $self->{ct}->{quirks};
2175     $self->{state} = DOCTYPE_NAME_STATE;
2176     !!!next-input-character;
2177     redo A;
2178 wakaba 1.1 } elsif ($self->{nc} == -1) {
2179     !!!cp (159);
2180     !!!parse-error (type => 'no DOCTYPE name');
2181     $self->{state} = DATA_STATE;
2182 wakaba 1.5 $self->{s_kwd} = '';
2183 wakaba 1.1 ## reconsume
2184    
2185     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2186    
2187     redo A;
2188 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2189     !!!cp (159.1);
2190     !!!parse-error (type => 'no DOCTYPE name');
2191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2192 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2193     $self->{in_subset} = 1;
2194 wakaba 1.12 !!!next-input-character;
2195 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2196 wakaba 1.12 redo A;
2197 wakaba 1.1 } else {
2198     !!!cp (160);
2199     $self->{ct}->{name} = chr $self->{nc};
2200     delete $self->{ct}->{quirks};
2201     $self->{state} = DOCTYPE_NAME_STATE;
2202     !!!next-input-character;
2203     redo A;
2204     }
2205     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2206 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2207    
2208     ## ISSUE: Redundant "First," in the spec.
2209    
2210 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2211     !!!cp (161);
2212     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2213     !!!next-input-character;
2214     redo A;
2215     } elsif ($self->{nc} == 0x003E) { # >
2216     !!!cp (162);
2217     $self->{state} = DATA_STATE;
2218 wakaba 1.5 $self->{s_kwd} = '';
2219 wakaba 1.1 !!!next-input-character;
2220    
2221     !!!emit ($self->{ct}); # DOCTYPE
2222    
2223     redo A;
2224 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2225     !!!cp (162.1);
2226     $self->{ct}->{name} # DOCTYPE
2227     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2228     delete $self->{ct}->{quirks};
2229     ## Stay in the state.
2230     !!!next-input-character;
2231     redo A;
2232 wakaba 1.1 } elsif ($self->{nc} == -1) {
2233     !!!cp (163);
2234     !!!parse-error (type => 'unclosed DOCTYPE');
2235     $self->{state} = DATA_STATE;
2236 wakaba 1.5 $self->{s_kwd} = '';
2237 wakaba 1.1 ## reconsume
2238    
2239     $self->{ct}->{quirks} = 1;
2240     !!!emit ($self->{ct}); # DOCTYPE
2241    
2242     redo A;
2243 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2244     !!!cp (163.1);
2245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2246 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2247     $self->{in_subset} = 1;
2248 wakaba 1.12 !!!next-input-character;
2249 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2250 wakaba 1.12 redo A;
2251 wakaba 1.1 } else {
2252     !!!cp (164);
2253 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2254     ## Stay in the state.
2255 wakaba 1.1 !!!next-input-character;
2256     redo A;
2257     }
2258     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2259 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2260     ## state", but implemented differently.
2261    
2262 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2263     !!!cp (165);
2264     ## Stay in the state
2265     !!!next-input-character;
2266     redo A;
2267     } elsif ($self->{nc} == 0x003E) { # >
2268 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2269     !!!cp (166);
2270     $self->{state} = DATA_STATE;
2271     $self->{s_kwd} = '';
2272     } else {
2273     !!!cp (166.1);
2274     !!!parse-error (type => 'no md def'); ## TODO: type
2275     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2276     }
2277    
2278 wakaba 1.1 !!!next-input-character;
2279 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2280 wakaba 1.1 redo A;
2281     } elsif ($self->{nc} == -1) {
2282 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2283     !!!cp (167);
2284     !!!parse-error (type => 'unclosed DOCTYPE');
2285     $self->{state} = DATA_STATE;
2286     $self->{s_kwd} = '';
2287     $self->{ct}->{quirks} = 1;
2288     } else {
2289     !!!cp (167.12);
2290     !!!parse-error (type => 'unclosed md'); ## TODO: type
2291     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2292     }
2293    
2294     ## Reconsume.
2295     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2296 wakaba 1.1 redo A;
2297     } elsif ($self->{nc} == 0x0050 or # P
2298     $self->{nc} == 0x0070) { # p
2299 wakaba 1.12 !!!cp (167.1);
2300 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2301 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2302 wakaba 1.1 !!!next-input-character;
2303     redo A;
2304     } elsif ($self->{nc} == 0x0053 or # S
2305     $self->{nc} == 0x0073) { # s
2306 wakaba 1.12 !!!cp (167.2);
2307 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2308 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2309     !!!next-input-character;
2310     redo A;
2311 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2312     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2313     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2314     !!!cp (167.21);
2315     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2316     $self->{ct}->{value} = ''; # ENTITY
2317     !!!next-input-character;
2318     redo A;
2319     } elsif ($self->{nc} == 0x0027 and # '
2320     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2321     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2322     !!!cp (167.22);
2323     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2324     $self->{ct}->{value} = ''; # ENTITY
2325     !!!next-input-character;
2326     redo A;
2327 wakaba 1.16 } elsif ($self->{is_xml} and
2328     $self->{ct}->{type} == DOCTYPE_TOKEN and
2329     $self->{nc} == 0x005B) { # [
2330 wakaba 1.12 !!!cp (167.3);
2331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2332     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2333 wakaba 1.13 $self->{in_subset} = 1;
2334 wakaba 1.1 !!!next-input-character;
2335 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2336 wakaba 1.1 redo A;
2337     } else {
2338 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2339    
2340     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2341     !!!cp (180);
2342     $self->{ct}->{quirks} = 1;
2343     $self->{state} = BOGUS_DOCTYPE_STATE;
2344     } else {
2345     !!!cp (180.1);
2346     $self->{state} = BOGUS_MD_STATE;
2347     }
2348 wakaba 1.1
2349     !!!next-input-character;
2350     redo A;
2351     }
2352     } elsif ($self->{state} == PUBLIC_STATE) {
2353     ## ASCII case-insensitive
2354     if ($self->{nc} == [
2355     undef,
2356     0x0055, # U
2357     0x0042, # B
2358     0x004C, # L
2359     0x0049, # I
2360 wakaba 1.12 ]->[length $self->{kwd}] or
2361 wakaba 1.1 $self->{nc} == [
2362     undef,
2363     0x0075, # u
2364     0x0062, # b
2365     0x006C, # l
2366     0x0069, # i
2367 wakaba 1.12 ]->[length $self->{kwd}]) {
2368 wakaba 1.1 !!!cp (175);
2369     ## Stay in the state.
2370 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2371 wakaba 1.1 !!!next-input-character;
2372     redo A;
2373 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2374 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2375     $self->{nc} == 0x0063)) { # c
2376 wakaba 1.12 if ($self->{is_xml} and
2377     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2378     !!!cp (168.1);
2379     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2380     text => 'PUBLIC',
2381     line => $self->{line_prev},
2382     column => $self->{column_prev} - 4);
2383     } else {
2384     !!!cp (168);
2385     }
2386 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2387     !!!next-input-character;
2388     redo A;
2389     } else {
2390 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2391 wakaba 1.1 line => $self->{line_prev},
2392 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2393 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2394     !!!cp (169);
2395     $self->{ct}->{quirks} = 1;
2396     $self->{state} = BOGUS_DOCTYPE_STATE;
2397     } else {
2398     !!!cp (169.1);
2399     $self->{state} = BOGUS_MD_STATE;
2400     }
2401 wakaba 1.1 ## Reconsume.
2402     redo A;
2403     }
2404     } elsif ($self->{state} == SYSTEM_STATE) {
2405     ## ASCII case-insensitive
2406     if ($self->{nc} == [
2407     undef,
2408     0x0059, # Y
2409     0x0053, # S
2410     0x0054, # T
2411     0x0045, # E
2412 wakaba 1.12 ]->[length $self->{kwd}] or
2413 wakaba 1.1 $self->{nc} == [
2414     undef,
2415     0x0079, # y
2416     0x0073, # s
2417     0x0074, # t
2418     0x0065, # e
2419 wakaba 1.12 ]->[length $self->{kwd}]) {
2420 wakaba 1.1 !!!cp (170);
2421     ## Stay in the state.
2422 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2423 wakaba 1.1 !!!next-input-character;
2424     redo A;
2425 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2426 wakaba 1.1 ($self->{nc} == 0x004D or # M
2427     $self->{nc} == 0x006D)) { # m
2428 wakaba 1.12 if ($self->{is_xml} and
2429     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2430     !!!cp (171.1);
2431     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2432     text => 'SYSTEM',
2433     line => $self->{line_prev},
2434     column => $self->{column_prev} - 4);
2435     } else {
2436     !!!cp (171);
2437     }
2438 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2439     !!!next-input-character;
2440     redo A;
2441     } else {
2442 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2443 wakaba 1.1 line => $self->{line_prev},
2444 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2445 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2446     !!!cp (172);
2447     $self->{ct}->{quirks} = 1;
2448     $self->{state} = BOGUS_DOCTYPE_STATE;
2449     } else {
2450     !!!cp (172.1);
2451     $self->{state} = BOGUS_MD_STATE;
2452     }
2453 wakaba 1.1 ## Reconsume.
2454     redo A;
2455     }
2456     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2457     if ($is_space->{$self->{nc}}) {
2458     !!!cp (181);
2459     ## Stay in the state
2460     !!!next-input-character;
2461     redo A;
2462     } elsif ($self->{nc} eq 0x0022) { # "
2463     !!!cp (182);
2464     $self->{ct}->{pubid} = ''; # DOCTYPE
2465     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2466     !!!next-input-character;
2467     redo A;
2468     } elsif ($self->{nc} eq 0x0027) { # '
2469     !!!cp (183);
2470     $self->{ct}->{pubid} = ''; # DOCTYPE
2471     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2472     !!!next-input-character;
2473     redo A;
2474     } elsif ($self->{nc} eq 0x003E) { # >
2475     !!!parse-error (type => 'no PUBLIC literal');
2476 wakaba 1.16
2477     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2478     !!!cp (184);
2479     $self->{state} = DATA_STATE;
2480     $self->{s_kwd} = '';
2481     $self->{ct}->{quirks} = 1;
2482     } else {
2483     !!!cp (184.1);
2484     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2485     }
2486    
2487 wakaba 1.1 !!!next-input-character;
2488 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2489 wakaba 1.1 redo A;
2490     } elsif ($self->{nc} == -1) {
2491 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2492     !!!cp (185);
2493     !!!parse-error (type => 'unclosed DOCTYPE');
2494     $self->{state} = DATA_STATE;
2495     $self->{s_kwd} = '';
2496     $self->{ct}->{quirks} = 1;
2497     } else {
2498     !!!cp (185.1);
2499     !!!parse-error (type => 'unclosed md'); ## TODO: type
2500     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2501     }
2502    
2503 wakaba 1.1 ## reconsume
2504     !!!emit ($self->{ct}); # DOCTYPE
2505     redo A;
2506 wakaba 1.16 } elsif ($self->{is_xml} and
2507     $self->{ct}->{type} == DOCTYPE_TOKEN and
2508     $self->{nc} == 0x005B) { # [
2509 wakaba 1.12 !!!cp (186.1);
2510     !!!parse-error (type => 'no PUBLIC literal');
2511     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2512     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2513 wakaba 1.13 $self->{in_subset} = 1;
2514 wakaba 1.12 !!!next-input-character;
2515 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2516 wakaba 1.12 redo A;
2517 wakaba 1.1 } else {
2518     !!!parse-error (type => 'string after PUBLIC');
2519    
2520 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2521     !!!cp (186);
2522     $self->{ct}->{quirks} = 1;
2523     $self->{state} = BOGUS_DOCTYPE_STATE;
2524     } else {
2525     !!!cp (186.2);
2526     $self->{state} = BOGUS_MD_STATE;
2527     }
2528    
2529 wakaba 1.1 !!!next-input-character;
2530     redo A;
2531     }
2532     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2533     if ($self->{nc} == 0x0022) { # "
2534     !!!cp (187);
2535     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2536     !!!next-input-character;
2537     redo A;
2538     } elsif ($self->{nc} == 0x003E) { # >
2539     !!!parse-error (type => 'unclosed PUBLIC literal');
2540    
2541 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2542     !!!cp (188);
2543     $self->{state} = DATA_STATE;
2544     $self->{s_kwd} = '';
2545     $self->{ct}->{quirks} = 1;
2546     } else {
2547     !!!cp (188.1);
2548     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2549     }
2550    
2551 wakaba 1.1 !!!next-input-character;
2552 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2553 wakaba 1.1 redo A;
2554     } elsif ($self->{nc} == -1) {
2555     !!!parse-error (type => 'unclosed PUBLIC literal');
2556    
2557 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2558     !!!cp (189);
2559     $self->{state} = DATA_STATE;
2560     $self->{s_kwd} = '';
2561     $self->{ct}->{quirks} = 1;
2562     } else {
2563     !!!cp (189.1);
2564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2565     }
2566    
2567     ## Reconsume.
2568 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2569     redo A;
2570     } else {
2571     !!!cp (190);
2572 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2573 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2574     length $self->{ct}->{pubid});
2575    
2576     ## Stay in the state
2577     !!!next-input-character;
2578     redo A;
2579     }
2580     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2581     if ($self->{nc} == 0x0027) { # '
2582     !!!cp (191);
2583     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2584     !!!next-input-character;
2585     redo A;
2586     } elsif ($self->{nc} == 0x003E) { # >
2587     !!!parse-error (type => 'unclosed PUBLIC literal');
2588    
2589 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2590     !!!cp (192);
2591     $self->{state} = DATA_STATE;
2592     $self->{s_kwd} = '';
2593     $self->{ct}->{quirks} = 1;
2594     } else {
2595     !!!cp (192.1);
2596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597     }
2598    
2599 wakaba 1.1 !!!next-input-character;
2600 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2601 wakaba 1.1 redo A;
2602     } elsif ($self->{nc} == -1) {
2603     !!!parse-error (type => 'unclosed PUBLIC literal');
2604    
2605 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2606     !!!cp (193);
2607     $self->{state} = DATA_STATE;
2608     $self->{s_kwd} = '';
2609     $self->{ct}->{quirks} = 1;
2610     } else {
2611     !!!cp (193.1);
2612     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2613     }
2614    
2615 wakaba 1.1 ## reconsume
2616 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2617 wakaba 1.1 redo A;
2618     } else {
2619     !!!cp (194);
2620 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2621 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2622     length $self->{ct}->{pubid});
2623    
2624     ## Stay in the state
2625     !!!next-input-character;
2626     redo A;
2627     }
2628     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2629     if ($is_space->{$self->{nc}}) {
2630     !!!cp (195);
2631     ## Stay in the state
2632     !!!next-input-character;
2633     redo A;
2634     } elsif ($self->{nc} == 0x0022) { # "
2635     !!!cp (196);
2636 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2637 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2638     !!!next-input-character;
2639     redo A;
2640     } elsif ($self->{nc} == 0x0027) { # '
2641     !!!cp (197);
2642 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2643 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2644     !!!next-input-character;
2645     redo A;
2646     } elsif ($self->{nc} == 0x003E) { # >
2647 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2648     if ($self->{is_xml}) {
2649     !!!cp (198.1);
2650     !!!parse-error (type => 'no SYSTEM literal');
2651     } else {
2652     !!!cp (198);
2653     }
2654     $self->{state} = DATA_STATE;
2655     $self->{s_kwd} = '';
2656 wakaba 1.12 } else {
2657 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2658     !!!cp (198.2);
2659     } else {
2660     !!!cp (198.3);
2661     !!!parse-error (type => 'no SYSTEM literal');
2662     }
2663     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2664 wakaba 1.12 }
2665 wakaba 1.16
2666 wakaba 1.1 !!!next-input-character;
2667 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2668 wakaba 1.1 redo A;
2669     } elsif ($self->{nc} == -1) {
2670 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2671     !!!cp (199);
2672     !!!parse-error (type => 'unclosed DOCTYPE');
2673    
2674     $self->{state} = DATA_STATE;
2675     $self->{s_kwd} = '';
2676     $self->{ct}->{quirks} = 1;
2677     } else {
2678     !!!parse-error (type => 'unclosed md'); ## TODO: type
2679     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2680     }
2681    
2682 wakaba 1.1 ## reconsume
2683 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2684 wakaba 1.1 redo A;
2685 wakaba 1.16 } elsif ($self->{is_xml} and
2686     $self->{ct}->{type} == DOCTYPE_TOKEN and
2687     $self->{nc} == 0x005B) { # [
2688 wakaba 1.12 !!!cp (200.1);
2689     !!!parse-error (type => 'no SYSTEM literal');
2690     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2691     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2692 wakaba 1.13 $self->{in_subset} = 1;
2693 wakaba 1.12 !!!next-input-character;
2694 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2695 wakaba 1.12 redo A;
2696 wakaba 1.1 } else {
2697     !!!parse-error (type => 'string after PUBLIC literal');
2698    
2699 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2700     !!!cp (200);
2701     $self->{ct}->{quirks} = 1;
2702     $self->{state} = BOGUS_DOCTYPE_STATE;
2703     } else {
2704     !!!cp (200.2);
2705     $self->{state} = BOGUS_MD_STATE;
2706     }
2707    
2708 wakaba 1.1 !!!next-input-character;
2709     redo A;
2710     }
2711     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2712     if ($is_space->{$self->{nc}}) {
2713     !!!cp (201);
2714     ## Stay in the state
2715     !!!next-input-character;
2716     redo A;
2717     } elsif ($self->{nc} == 0x0022) { # "
2718     !!!cp (202);
2719     $self->{ct}->{sysid} = ''; # DOCTYPE
2720     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2721     !!!next-input-character;
2722     redo A;
2723     } elsif ($self->{nc} == 0x0027) { # '
2724     !!!cp (203);
2725     $self->{ct}->{sysid} = ''; # DOCTYPE
2726     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2727     !!!next-input-character;
2728     redo A;
2729     } elsif ($self->{nc} == 0x003E) { # >
2730     !!!parse-error (type => 'no SYSTEM literal');
2731     !!!next-input-character;
2732    
2733 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2734     !!!cp (204);
2735     $self->{state} = DATA_STATE;
2736     $self->{s_kwd} = '';
2737     $self->{ct}->{quirks} = 1;
2738     } else {
2739     !!!cp (204.1);
2740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2741     }
2742 wakaba 1.1
2743 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2744 wakaba 1.1 redo A;
2745     } elsif ($self->{nc} == -1) {
2746 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2747     !!!cp (205);
2748     !!!parse-error (type => 'unclosed DOCTYPE');
2749     $self->{state} = DATA_STATE;
2750     $self->{s_kwd} = '';
2751     $self->{ct}->{quirks} = 1;
2752     } else {
2753     !!!cp (205.1);
2754     !!!parse-error (type => 'unclosed md'); ## TODO: type
2755     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2756     }
2757    
2758 wakaba 1.1 ## reconsume
2759 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2760 wakaba 1.1 redo A;
2761 wakaba 1.16 } elsif ($self->{is_xml} and
2762     $self->{ct}->{type} == DOCTYPE_TOKEN and
2763     $self->{nc} == 0x005B) { # [
2764 wakaba 1.12 !!!cp (206.1);
2765     !!!parse-error (type => 'no SYSTEM literal');
2766    
2767     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2768     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2769 wakaba 1.13 $self->{in_subset} = 1;
2770 wakaba 1.12 !!!next-input-character;
2771 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2772 wakaba 1.12 redo A;
2773 wakaba 1.1 } else {
2774     !!!parse-error (type => 'string after SYSTEM');
2775    
2776 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2777     !!!cp (206);
2778     $self->{ct}->{quirks} = 1;
2779     $self->{state} = BOGUS_DOCTYPE_STATE;
2780     } else {
2781     !!!cp (206.2);
2782     $self->{state} = BOGUS_MD_STATE;
2783     }
2784    
2785 wakaba 1.1 !!!next-input-character;
2786     redo A;
2787     }
2788     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2789     if ($self->{nc} == 0x0022) { # "
2790     !!!cp (207);
2791     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2792     !!!next-input-character;
2793     redo A;
2794 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2795 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2796    
2797 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2798     !!!cp (208);
2799     $self->{state} = DATA_STATE;
2800     $self->{s_kwd} = '';
2801     $self->{ct}->{quirks} = 1;
2802     } else {
2803     !!!cp (208.1);
2804     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2805     }
2806    
2807 wakaba 1.1 !!!next-input-character;
2808 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2809 wakaba 1.1 redo A;
2810     } elsif ($self->{nc} == -1) {
2811     !!!parse-error (type => 'unclosed SYSTEM literal');
2812    
2813 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2814     !!!cp (209);
2815     $self->{state} = DATA_STATE;
2816     $self->{s_kwd} = '';
2817     $self->{ct}->{quirks} = 1;
2818     } else {
2819     !!!cp (209.1);
2820     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2821     }
2822    
2823 wakaba 1.1 ## reconsume
2824 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2825 wakaba 1.1 redo A;
2826     } else {
2827     !!!cp (210);
2828 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2829 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2830     length $self->{ct}->{sysid});
2831    
2832     ## Stay in the state
2833     !!!next-input-character;
2834     redo A;
2835     }
2836     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2837     if ($self->{nc} == 0x0027) { # '
2838     !!!cp (211);
2839     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2840     !!!next-input-character;
2841     redo A;
2842 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2843 wakaba 1.1 !!!cp (212);
2844     !!!parse-error (type => 'unclosed SYSTEM literal');
2845    
2846     $self->{state} = DATA_STATE;
2847 wakaba 1.5 $self->{s_kwd} = '';
2848 wakaba 1.1 !!!next-input-character;
2849    
2850     $self->{ct}->{quirks} = 1;
2851     !!!emit ($self->{ct}); # DOCTYPE
2852    
2853     redo A;
2854     } elsif ($self->{nc} == -1) {
2855     !!!parse-error (type => 'unclosed SYSTEM literal');
2856    
2857 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2858     !!!cp (213);
2859     $self->{state} = DATA_STATE;
2860     $self->{s_kwd} = '';
2861     $self->{ct}->{quirks} = 1;
2862     } else {
2863     !!!cp (213.1);
2864     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2865     }
2866    
2867 wakaba 1.1 ## reconsume
2868 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2869 wakaba 1.1 redo A;
2870     } else {
2871     !!!cp (214);
2872 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2873 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2874     length $self->{ct}->{sysid});
2875    
2876     ## Stay in the state
2877     !!!next-input-character;
2878     redo A;
2879     }
2880     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2881     if ($is_space->{$self->{nc}}) {
2882 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2883     !!!cp (215.1);
2884     $self->{state} = BEFORE_NDATA_STATE;
2885     } else {
2886     !!!cp (215);
2887     ## Stay in the state
2888     }
2889 wakaba 1.1 !!!next-input-character;
2890     redo A;
2891     } elsif ($self->{nc} == 0x003E) { # >
2892 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2893     !!!cp (216);
2894     $self->{state} = DATA_STATE;
2895     $self->{s_kwd} = '';
2896     } else {
2897     !!!cp (216.1);
2898     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2899     }
2900    
2901 wakaba 1.1 !!!next-input-character;
2902 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2903 wakaba 1.1 redo A;
2904 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2905     ($self->{nc} == 0x004E or # N
2906     $self->{nc} == 0x006E)) { # n
2907     !!!cp (216.2);
2908     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2909     $self->{state} = NDATA_STATE;
2910     $self->{kwd} = chr $self->{nc};
2911     !!!next-input-character;
2912     redo A;
2913 wakaba 1.1 } elsif ($self->{nc} == -1) {
2914 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2915     !!!cp (217);
2916     !!!parse-error (type => 'unclosed DOCTYPE');
2917     $self->{state} = DATA_STATE;
2918     $self->{s_kwd} = '';
2919     $self->{ct}->{quirks} = 1;
2920     } else {
2921     !!!cp (217.1);
2922     !!!parse-error (type => 'unclosed md'); ## TODO: type
2923     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2924     }
2925    
2926 wakaba 1.1 ## reconsume
2927 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2928 wakaba 1.1 redo A;
2929 wakaba 1.16 } elsif ($self->{is_xml} and
2930     $self->{ct}->{type} == DOCTYPE_TOKEN and
2931     $self->{nc} == 0x005B) { # [
2932 wakaba 1.12 !!!cp (218.1);
2933     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2935 wakaba 1.13 $self->{in_subset} = 1;
2936 wakaba 1.12 !!!next-input-character;
2937 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2938 wakaba 1.12 redo A;
2939 wakaba 1.1 } else {
2940     !!!parse-error (type => 'string after SYSTEM literal');
2941    
2942 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2943     !!!cp (218);
2944     #$self->{ct}->{quirks} = 1;
2945     $self->{state} = BOGUS_DOCTYPE_STATE;
2946     } else {
2947     !!!cp (218.2);
2948     $self->{state} = BOGUS_MD_STATE;
2949     }
2950    
2951 wakaba 1.1 !!!next-input-character;
2952     redo A;
2953     }
2954 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2955     if ($is_space->{$self->{nc}}) {
2956     !!!cp (218.3);
2957     ## Stay in the state.
2958     !!!next-input-character;
2959     redo A;
2960     } elsif ($self->{nc} == 0x003E) { # >
2961     !!!cp (218.4);
2962     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2963     !!!next-input-character;
2964     !!!emit ($self->{ct}); # ENTITY
2965     redo A;
2966     } elsif ($self->{nc} == 0x004E or # N
2967     $self->{nc} == 0x006E) { # n
2968     !!!cp (218.5);
2969     $self->{state} = NDATA_STATE;
2970     $self->{kwd} = chr $self->{nc};
2971     !!!next-input-character;
2972     redo A;
2973     } elsif ($self->{nc} == -1) {
2974     !!!cp (218.6);
2975     !!!parse-error (type => 'unclosed md'); ## TODO: type
2976     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2977     ## reconsume
2978     !!!emit ($self->{ct}); # ENTITY
2979     redo A;
2980     } else {
2981     !!!cp (218.7);
2982     !!!parse-error (type => 'string after SYSTEM literal');
2983     $self->{state} = BOGUS_MD_STATE;
2984     !!!next-input-character;
2985     redo A;
2986     }
2987 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2988     if ($self->{nc} == 0x003E) { # >
2989     !!!cp (219);
2990     $self->{state} = DATA_STATE;
2991 wakaba 1.5 $self->{s_kwd} = '';
2992 wakaba 1.1 !!!next-input-character;
2993    
2994     !!!emit ($self->{ct}); # DOCTYPE
2995    
2996     redo A;
2997 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2998 wakaba 1.13 !!!cp (220.1);
2999     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3000     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3001     $self->{in_subset} = 1;
3002     !!!next-input-character;
3003     !!!emit ($self->{ct}); # DOCTYPE
3004     redo A;
3005 wakaba 1.1 } elsif ($self->{nc} == -1) {
3006     !!!cp (220);
3007     $self->{state} = DATA_STATE;
3008 wakaba 1.5 $self->{s_kwd} = '';
3009 wakaba 1.1 ## reconsume
3010    
3011     !!!emit ($self->{ct}); # DOCTYPE
3012    
3013     redo A;
3014     } else {
3015     !!!cp (221);
3016     my $s = '';
3017 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3018 wakaba 1.1
3019     ## Stay in the state
3020     !!!next-input-character;
3021     redo A;
3022     }
3023     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3024     ## NOTE: "CDATA section state" in the state is jointly implemented
3025     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3026     ## and |CDATA_SECTION_MSE2_STATE|.
3027 wakaba 1.10
3028     ## XML5: "CDATA state".
3029 wakaba 1.1
3030     if ($self->{nc} == 0x005D) { # ]
3031     !!!cp (221.1);
3032     $self->{state} = CDATA_SECTION_MSE1_STATE;
3033     !!!next-input-character;
3034     redo A;
3035     } elsif ($self->{nc} == -1) {
3036 wakaba 1.6 if ($self->{is_xml}) {
3037 wakaba 1.8 !!!cp (221.11);
3038 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3039 wakaba 1.8 } else {
3040     !!!cp (221.12);
3041 wakaba 1.6 }
3042    
3043 wakaba 1.1 $self->{state} = DATA_STATE;
3044 wakaba 1.5 $self->{s_kwd} = '';
3045 wakaba 1.10 ## Reconsume.
3046 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3047     !!!cp (221.2);
3048     !!!emit ($self->{ct}); # character
3049     } else {
3050     !!!cp (221.3);
3051     ## No token to emit. $self->{ct} is discarded.
3052     }
3053     redo A;
3054     } else {
3055     !!!cp (221.4);
3056     $self->{ct}->{data} .= chr $self->{nc};
3057     $self->{read_until}->($self->{ct}->{data},
3058     q<]>,
3059     length $self->{ct}->{data});
3060    
3061     ## Stay in the state.
3062     !!!next-input-character;
3063     redo A;
3064     }
3065    
3066     ## ISSUE: "text tokens" in spec.
3067     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3068 wakaba 1.10 ## XML5: "CDATA bracket state".
3069    
3070 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3071     !!!cp (221.5);
3072     $self->{state} = CDATA_SECTION_MSE2_STATE;
3073     !!!next-input-character;
3074     redo A;
3075     } else {
3076     !!!cp (221.6);
3077 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3078 wakaba 1.1 $self->{ct}->{data} .= ']';
3079 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3080 wakaba 1.1 ## Reconsume.
3081     redo A;
3082     }
3083     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3084 wakaba 1.10 ## XML5: "CDATA end state".
3085    
3086 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3087     $self->{state} = DATA_STATE;
3088 wakaba 1.5 $self->{s_kwd} = '';
3089 wakaba 1.1 !!!next-input-character;
3090     if (length $self->{ct}->{data}) { # character
3091     !!!cp (221.7);
3092     !!!emit ($self->{ct}); # character
3093     } else {
3094     !!!cp (221.8);
3095     ## No token to emit. $self->{ct} is discarded.
3096     }
3097     redo A;
3098     } elsif ($self->{nc} == 0x005D) { # ]
3099     !!!cp (221.9); # character
3100     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3101     ## Stay in the state.
3102     !!!next-input-character;
3103     redo A;
3104     } else {
3105     !!!cp (221.11);
3106     $self->{ct}->{data} .= ']]'; # character
3107     $self->{state} = CDATA_SECTION_STATE;
3108 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3109 wakaba 1.1 redo A;
3110     }
3111     } elsif ($self->{state} == ENTITY_STATE) {
3112     if ($is_space->{$self->{nc}} or
3113     {
3114     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3115     $self->{entity_add} => 1,
3116     }->{$self->{nc}}) {
3117 wakaba 1.22 if ($self->{is_xml}) {
3118     !!!cp (1001.1);
3119     !!!parse-error (type => 'bare ero',
3120     line => $self->{line_prev},
3121     column => $self->{column_prev}
3122     + ($self->{nc} == -1 ? 1 : 0));
3123     } else {
3124     !!!cp (1001);
3125     ## No error
3126     }
3127 wakaba 1.1 ## Don't consume
3128     ## Return nothing.
3129     #
3130     } elsif ($self->{nc} == 0x0023) { # #
3131     !!!cp (999);
3132     $self->{state} = ENTITY_HASH_STATE;
3133 wakaba 1.12 $self->{kwd} = '#';
3134 wakaba 1.1 !!!next-input-character;
3135     redo A;
3136 wakaba 1.22 } elsif ($self->{is_xml} or
3137     (0x0041 <= $self->{nc} and
3138 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3139     (0x0061 <= $self->{nc} and
3140     $self->{nc} <= 0x007A)) { # a..z
3141     !!!cp (998);
3142     require Whatpm::_NamedEntityList;
3143     $self->{state} = ENTITY_NAME_STATE;
3144 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3145     $self->{entity__value} = $self->{kwd};
3146 wakaba 1.1 $self->{entity__match} = 0;
3147     !!!next-input-character;
3148     redo A;
3149     } else {
3150     !!!cp (1027);
3151     !!!parse-error (type => 'bare ero');
3152     ## Return nothing.
3153     #
3154     }
3155    
3156     ## NOTE: No character is consumed by the "consume a character
3157     ## reference" algorithm. In other word, there is an "&" character
3158     ## that does not introduce a character reference, which would be
3159     ## appended to the parent element or the attribute value in later
3160     ## process of the tokenizer.
3161    
3162     if ($self->{prev_state} == DATA_STATE) {
3163     !!!cp (997);
3164     $self->{state} = $self->{prev_state};
3165 wakaba 1.5 $self->{s_kwd} = '';
3166 wakaba 1.1 ## Reconsume.
3167     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3168     line => $self->{line_prev},
3169     column => $self->{column_prev},
3170     });
3171     redo A;
3172     } else {
3173     !!!cp (996);
3174     $self->{ca}->{value} .= '&';
3175     $self->{state} = $self->{prev_state};
3176 wakaba 1.5 $self->{s_kwd} = '';
3177 wakaba 1.1 ## Reconsume.
3178     redo A;
3179     }
3180     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3181 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3182 wakaba 1.1 !!!cp (995);
3183     $self->{state} = HEXREF_X_STATE;
3184 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3185 wakaba 1.1 !!!next-input-character;
3186     redo A;
3187 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3188     !!!cp (995.1);
3189     if ($self->{is_xml}) {
3190     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3191     }
3192     $self->{state} = HEXREF_X_STATE;
3193     $self->{kwd} .= chr $self->{nc};
3194     !!!next-input-character;
3195     redo A;
3196 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3197     $self->{nc} <= 0x0039) { # 0..9
3198     !!!cp (994);
3199     $self->{state} = NCR_NUM_STATE;
3200 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3201 wakaba 1.1 !!!next-input-character;
3202     redo A;
3203     } else {
3204     !!!parse-error (type => 'bare nero',
3205     line => $self->{line_prev},
3206     column => $self->{column_prev} - 1);
3207    
3208     ## NOTE: According to the spec algorithm, nothing is returned,
3209     ## and then "&#" is appended to the parent element or the attribute
3210     ## value in the later processing.
3211    
3212     if ($self->{prev_state} == DATA_STATE) {
3213     !!!cp (1019);
3214     $self->{state} = $self->{prev_state};
3215 wakaba 1.5 $self->{s_kwd} = '';
3216 wakaba 1.1 ## Reconsume.
3217     !!!emit ({type => CHARACTER_TOKEN,
3218     data => '&#',
3219     line => $self->{line_prev},
3220     column => $self->{column_prev} - 1,
3221     });
3222     redo A;
3223     } else {
3224     !!!cp (993);
3225     $self->{ca}->{value} .= '&#';
3226     $self->{state} = $self->{prev_state};
3227 wakaba 1.5 $self->{s_kwd} = '';
3228 wakaba 1.1 ## Reconsume.
3229     redo A;
3230     }
3231     }
3232     } elsif ($self->{state} == NCR_NUM_STATE) {
3233     if (0x0030 <= $self->{nc} and
3234     $self->{nc} <= 0x0039) { # 0..9
3235     !!!cp (1012);
3236 wakaba 1.12 $self->{kwd} *= 10;
3237     $self->{kwd} += $self->{nc} - 0x0030;
3238 wakaba 1.1
3239     ## Stay in the state.
3240     !!!next-input-character;
3241     redo A;
3242     } elsif ($self->{nc} == 0x003B) { # ;
3243     !!!cp (1013);
3244     !!!next-input-character;
3245     #
3246     } else {
3247     !!!cp (1014);
3248     !!!parse-error (type => 'no refc');
3249     ## Reconsume.
3250     #
3251     }
3252    
3253 wakaba 1.12 my $code = $self->{kwd};
3254 wakaba 1.1 my $l = $self->{line_prev};
3255     my $c = $self->{column_prev};
3256 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3257     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3258     ($self->{is_xml} and $code == 0x0000)) {
3259 wakaba 1.1 !!!cp (1015);
3260     !!!parse-error (type => 'invalid character reference',
3261     text => (sprintf 'U+%04X', $code),
3262     line => $l, column => $c);
3263     $code = $charref_map->{$code};
3264     } elsif ($code > 0x10FFFF) {
3265     !!!cp (1016);
3266     !!!parse-error (type => 'invalid character reference',
3267     text => (sprintf 'U-%08X', $code),
3268     line => $l, column => $c);
3269     $code = 0xFFFD;
3270     }
3271    
3272     if ($self->{prev_state} == DATA_STATE) {
3273     !!!cp (992);
3274     $self->{state} = $self->{prev_state};
3275 wakaba 1.5 $self->{s_kwd} = '';
3276 wakaba 1.1 ## Reconsume.
3277     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3278 wakaba 1.7 has_reference => 1,
3279 wakaba 1.1 line => $l, column => $c,
3280     });
3281     redo A;
3282     } else {
3283     !!!cp (991);
3284     $self->{ca}->{value} .= chr $code;
3285     $self->{ca}->{has_reference} = 1;
3286     $self->{state} = $self->{prev_state};
3287 wakaba 1.5 $self->{s_kwd} = '';
3288 wakaba 1.1 ## Reconsume.
3289     redo A;
3290     }
3291     } elsif ($self->{state} == HEXREF_X_STATE) {
3292     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3293     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3294     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3295     # 0..9, A..F, a..f
3296     !!!cp (990);
3297     $self->{state} = HEXREF_HEX_STATE;
3298 wakaba 1.12 $self->{kwd} = 0;
3299 wakaba 1.1 ## Reconsume.
3300     redo A;
3301     } else {
3302     !!!parse-error (type => 'bare hcro',
3303     line => $self->{line_prev},
3304     column => $self->{column_prev} - 2);
3305    
3306     ## NOTE: According to the spec algorithm, nothing is returned,
3307     ## and then "&#" followed by "X" or "x" is appended to the parent
3308     ## element or the attribute value in the later processing.
3309    
3310     if ($self->{prev_state} == DATA_STATE) {
3311     !!!cp (1005);
3312     $self->{state} = $self->{prev_state};
3313 wakaba 1.5 $self->{s_kwd} = '';
3314 wakaba 1.1 ## Reconsume.
3315     !!!emit ({type => CHARACTER_TOKEN,
3316 wakaba 1.12 data => '&' . $self->{kwd},
3317 wakaba 1.1 line => $self->{line_prev},
3318 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3319 wakaba 1.1 });
3320     redo A;
3321     } else {
3322     !!!cp (989);
3323 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3324 wakaba 1.1 $self->{state} = $self->{prev_state};
3325 wakaba 1.5 $self->{s_kwd} = '';
3326 wakaba 1.1 ## Reconsume.
3327     redo A;
3328     }
3329     }
3330     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3331     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3332     # 0..9
3333     !!!cp (1002);
3334 wakaba 1.12 $self->{kwd} *= 0x10;
3335     $self->{kwd} += $self->{nc} - 0x0030;
3336 wakaba 1.1 ## Stay in the state.
3337     !!!next-input-character;
3338     redo A;
3339     } elsif (0x0061 <= $self->{nc} and
3340     $self->{nc} <= 0x0066) { # a..f
3341     !!!cp (1003);
3342 wakaba 1.12 $self->{kwd} *= 0x10;
3343     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3344 wakaba 1.1 ## Stay in the state.
3345     !!!next-input-character;
3346     redo A;
3347     } elsif (0x0041 <= $self->{nc} and
3348     $self->{nc} <= 0x0046) { # A..F
3349     !!!cp (1004);
3350 wakaba 1.12 $self->{kwd} *= 0x10;
3351     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3352 wakaba 1.1 ## Stay in the state.
3353     !!!next-input-character;
3354     redo A;
3355     } elsif ($self->{nc} == 0x003B) { # ;
3356     !!!cp (1006);
3357     !!!next-input-character;
3358     #
3359     } else {
3360     !!!cp (1007);
3361     !!!parse-error (type => 'no refc',
3362     line => $self->{line},
3363     column => $self->{column});
3364     ## Reconsume.
3365     #
3366     }
3367    
3368 wakaba 1.12 my $code = $self->{kwd};
3369 wakaba 1.1 my $l = $self->{line_prev};
3370     my $c = $self->{column_prev};
3371 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3372     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3373     ($self->{is_xml} and $code == 0x0000)) {
3374 wakaba 1.1 !!!cp (1008);
3375     !!!parse-error (type => 'invalid character reference',
3376     text => (sprintf 'U+%04X', $code),
3377     line => $l, column => $c);
3378     $code = $charref_map->{$code};
3379     } elsif ($code > 0x10FFFF) {
3380     !!!cp (1009);
3381     !!!parse-error (type => 'invalid character reference',
3382     text => (sprintf 'U-%08X', $code),
3383     line => $l, column => $c);
3384     $code = 0xFFFD;
3385     }
3386    
3387     if ($self->{prev_state} == DATA_STATE) {
3388     !!!cp (988);
3389     $self->{state} = $self->{prev_state};
3390 wakaba 1.5 $self->{s_kwd} = '';
3391 wakaba 1.1 ## Reconsume.
3392     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3393 wakaba 1.7 has_reference => 1,
3394 wakaba 1.1 line => $l, column => $c,
3395     });
3396     redo A;
3397     } else {
3398     !!!cp (987);
3399     $self->{ca}->{value} .= chr $code;
3400     $self->{ca}->{has_reference} = 1;
3401     $self->{state} = $self->{prev_state};
3402 wakaba 1.5 $self->{s_kwd} = '';
3403 wakaba 1.1 ## Reconsume.
3404     redo A;
3405     }
3406     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3407 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
3408     $self->{nc} <= 0x005A) or # x
3409     (0x0061 <= $self->{nc} and # a
3410     $self->{nc} <= 0x007A) or # z
3411     (0x0030 <= $self->{nc} and # 0
3412     $self->{nc} <= 0x0039) or # 9
3413 wakaba 1.22 $self->{nc} == 0x003B or # ;
3414     ($self->{is_xml} and
3415     not ($is_space->{$self->{nc}} or
3416     {
3417     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3418     $self->{entity_add} => 1,
3419     }->{$self->{nc}}))) {
3420 wakaba 1.1 our $EntityChar;
3421 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3422 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3423     $self->{ge}->{$self->{kwd}}) {
3424 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3425 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3426     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3427     !!!cp (1020.1);
3428     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3429     } else {
3430     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3431     !!!cp (1020.2);
3432     !!!parse-error (type => 'unparsed entity', ## TODO: type
3433     value => $self->{kwd});
3434     } else {
3435     !!!cp (1020.3);
3436     }
3437     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3438     }
3439     } else {
3440     if ($self->{is_xml}) {
3441     !!!cp (1020.4);
3442     !!!parse-error (type => 'entity not declared', ## TODO: type
3443     value => $self->{kwd},
3444     level => {
3445     'amp;' => $self->{level}->{warn},
3446     'quot;' => $self->{level}->{warn},
3447     'lt;' => $self->{level}->{warn},
3448     'gt;' => $self->{level}->{warn},
3449     'apos;' => $self->{level}->{warn},
3450     }->{$self->{kwd}} ||
3451     $self->{level}->{must});
3452     } else {
3453     !!!cp (1020);
3454     }
3455     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3456     }
3457 wakaba 1.1 $self->{entity__match} = 1;
3458     !!!next-input-character;
3459     #
3460     } else {
3461     !!!cp (1021);
3462 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3463 wakaba 1.1 $self->{entity__match} = -1;
3464     ## Stay in the state.
3465     !!!next-input-character;
3466     redo A;
3467     }
3468     } else {
3469     !!!cp (1022);
3470     $self->{entity__value} .= chr $self->{nc};
3471     $self->{entity__match} *= 2;
3472     ## Stay in the state.
3473     !!!next-input-character;
3474     redo A;
3475     }
3476     }
3477    
3478     my $data;
3479     my $has_ref;
3480     if ($self->{entity__match} > 0) {
3481     !!!cp (1023);
3482     $data = $self->{entity__value};
3483     $has_ref = 1;
3484     #
3485     } elsif ($self->{entity__match} < 0) {
3486     !!!parse-error (type => 'no refc');
3487     if ($self->{prev_state} != DATA_STATE and # in attribute
3488     $self->{entity__match} < -1) {
3489     !!!cp (1024);
3490 wakaba 1.12 $data = '&' . $self->{kwd};
3491 wakaba 1.1 #
3492     } else {
3493     !!!cp (1025);
3494     $data = $self->{entity__value};
3495     $has_ref = 1;
3496     #
3497     }
3498     } else {
3499     !!!cp (1026);
3500     !!!parse-error (type => 'bare ero',
3501     line => $self->{line_prev},
3502 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3503     $data = '&' . $self->{kwd};
3504 wakaba 1.1 #
3505     }
3506    
3507     ## NOTE: In these cases, when a character reference is found,
3508     ## it is consumed and a character token is returned, or, otherwise,
3509     ## nothing is consumed and returned, according to the spec algorithm.
3510     ## In this implementation, anything that has been examined by the
3511     ## tokenizer is appended to the parent element or the attribute value
3512     ## as string, either literal string when no character reference or
3513     ## entity-replaced string otherwise, in this stage, since any characters
3514     ## that would not be consumed are appended in the data state or in an
3515     ## appropriate attribute value state anyway.
3516    
3517     if ($self->{prev_state} == DATA_STATE) {
3518     !!!cp (986);
3519     $self->{state} = $self->{prev_state};
3520 wakaba 1.5 $self->{s_kwd} = '';
3521 wakaba 1.1 ## Reconsume.
3522     !!!emit ({type => CHARACTER_TOKEN,
3523     data => $data,
3524 wakaba 1.7 has_reference => $has_ref,
3525 wakaba 1.1 line => $self->{line_prev},
3526 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3527 wakaba 1.1 });
3528     redo A;
3529     } else {
3530     !!!cp (985);
3531     $self->{ca}->{value} .= $data;
3532     $self->{ca}->{has_reference} = 1 if $has_ref;
3533     $self->{state} = $self->{prev_state};
3534 wakaba 1.5 $self->{s_kwd} = '';
3535 wakaba 1.1 ## Reconsume.
3536     redo A;
3537     }
3538 wakaba 1.8
3539     ## XML-only states
3540    
3541     } elsif ($self->{state} == PI_STATE) {
3542 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3543    
3544 wakaba 1.8 if ($is_space->{$self->{nc}} or
3545 wakaba 1.14 $self->{nc} == 0x003F or # ?
3546 wakaba 1.8 $self->{nc} == -1) {
3547 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3548     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3549     ## "DOCTYPE pi state": Parse error, switch to the "data
3550     ## state".
3551 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3552     line => $self->{line_prev},
3553     column => $self->{column_prev}
3554     - 1 * ($self->{nc} != -1));
3555     $self->{state} = BOGUS_COMMENT_STATE;
3556     ## Reconsume.
3557     $self->{ct} = {type => COMMENT_TOKEN,
3558     data => '?',
3559     line => $self->{line_prev},
3560     column => $self->{column_prev}
3561     - 1 * ($self->{nc} != -1),
3562     };
3563     redo A;
3564     } else {
3565 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3566 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3567     target => chr $self->{nc},
3568     data => '',
3569     line => $self->{line_prev},
3570     column => $self->{column_prev} - 1,
3571     };
3572     $self->{state} = PI_TARGET_STATE;
3573     !!!next-input-character;
3574     redo A;
3575     }
3576     } elsif ($self->{state} == PI_TARGET_STATE) {
3577     if ($is_space->{$self->{nc}}) {
3578     $self->{state} = PI_TARGET_AFTER_STATE;
3579     !!!next-input-character;
3580     redo A;
3581     } elsif ($self->{nc} == -1) {
3582     !!!parse-error (type => 'no pic'); ## TODO: type
3583 wakaba 1.13 if ($self->{in_subset}) {
3584     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585     } else {
3586     $self->{state} = DATA_STATE;
3587     $self->{s_kwd} = '';
3588     }
3589 wakaba 1.8 ## Reconsume.
3590     !!!emit ($self->{ct}); # pi
3591     redo A;
3592     } elsif ($self->{nc} == 0x003F) { # ?
3593     $self->{state} = PI_AFTER_STATE;
3594     !!!next-input-character;
3595     redo A;
3596     } else {
3597     ## XML5: typo ("tag name" -> "target")
3598     $self->{ct}->{target} .= chr $self->{nc}; # pi
3599     !!!next-input-character;
3600     redo A;
3601     }
3602     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3603     if ($is_space->{$self->{nc}}) {
3604     ## Stay in the state.
3605     !!!next-input-character;
3606     redo A;
3607     } else {
3608     $self->{state} = PI_DATA_STATE;
3609     ## Reprocess.
3610     redo A;
3611     }
3612     } elsif ($self->{state} == PI_DATA_STATE) {
3613     if ($self->{nc} == 0x003F) { # ?
3614     $self->{state} = PI_DATA_AFTER_STATE;
3615     !!!next-input-character;
3616     redo A;
3617     } elsif ($self->{nc} == -1) {
3618     !!!parse-error (type => 'no pic'); ## TODO: type
3619 wakaba 1.13 if ($self->{in_subset}) {
3620 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3621 wakaba 1.13 } else {
3622     $self->{state} = DATA_STATE;
3623     $self->{s_kwd} = '';
3624     }
3625 wakaba 1.8 ## Reprocess.
3626     !!!emit ($self->{ct}); # pi
3627     redo A;
3628     } else {
3629     $self->{ct}->{data} .= chr $self->{nc}; # pi
3630     $self->{read_until}->($self->{ct}->{data}, q[?],
3631     length $self->{ct}->{data});
3632     ## Stay in the state.
3633     !!!next-input-character;
3634     ## Reprocess.
3635     redo A;
3636     }
3637     } elsif ($self->{state} == PI_AFTER_STATE) {
3638 wakaba 1.14 ## XML5: Part of "Pi after state".
3639    
3640 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3641 wakaba 1.13 if ($self->{in_subset}) {
3642     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3643     } else {
3644     $self->{state} = DATA_STATE;
3645     $self->{s_kwd} = '';
3646     }
3647 wakaba 1.8 !!!next-input-character;
3648     !!!emit ($self->{ct}); # pi
3649     redo A;
3650     } elsif ($self->{nc} == 0x003F) { # ?
3651     !!!parse-error (type => 'no s after target', ## TODO: type
3652     line => $self->{line_prev},
3653     column => $self->{column_prev}); ## XML5: no error
3654     $self->{ct}->{data} .= '?';
3655     $self->{state} = PI_DATA_AFTER_STATE;
3656     !!!next-input-character;
3657     redo A;
3658     } else {
3659     !!!parse-error (type => 'no s after target', ## TODO: type
3660     line => $self->{line_prev},
3661     column => $self->{column_prev}
3662     + 1 * ($self->{nc} == -1)); ## XML5: no error
3663     $self->{ct}->{data} .= '?'; ## XML5: not appended
3664     $self->{state} = PI_DATA_STATE;
3665     ## Reprocess.
3666     redo A;
3667     }
3668     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3669 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3670    
3671 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3672 wakaba 1.13 if ($self->{in_subset}) {
3673     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3674     } else {
3675     $self->{state} = DATA_STATE;
3676     $self->{s_kwd} = '';
3677     }
3678 wakaba 1.8 !!!next-input-character;
3679     !!!emit ($self->{ct}); # pi
3680     redo A;
3681     } elsif ($self->{nc} == 0x003F) { # ?
3682     $self->{ct}->{data} .= '?';
3683     ## Stay in the state.
3684     !!!next-input-character;
3685     redo A;
3686     } else {
3687     $self->{ct}->{data} .= '?'; ## XML5: not appended
3688     $self->{state} = PI_DATA_STATE;
3689     ## Reprocess.
3690     redo A;
3691     }
3692 wakaba 1.12
3693     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3694     if ($self->{nc} == 0x003C) { # <
3695 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3696 wakaba 1.12 !!!next-input-character;
3697     redo A;
3698     } elsif ($self->{nc} == 0x0025) { # %
3699     ## XML5: Not defined yet.
3700    
3701     ## TODO:
3702 wakaba 1.24
3703     if (not $self->{stop_processing} and
3704     not $self->{document}->xml_standalone) {
3705     !!!parse-error (type => 'stop processing', ## TODO: type
3706     level => $self->{level}->{info});
3707     $self->{stop_processing} = 1;
3708     }
3709    
3710 wakaba 1.12 !!!next-input-character;
3711     redo A;
3712     } elsif ($self->{nc} == 0x005D) { # ]
3713 wakaba 1.13 delete $self->{in_subset};
3714 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3715     !!!next-input-character;
3716     redo A;
3717     } elsif ($is_space->{$self->{nc}}) {
3718     ## Stay in the state.
3719     !!!next-input-character;
3720     redo A;
3721     } elsif ($self->{nc} == -1) {
3722     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3723 wakaba 1.13 delete $self->{in_subset};
3724 wakaba 1.12 $self->{state} = DATA_STATE;
3725     $self->{s_kwd} = '';
3726     ## Reconsume.
3727 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3728 wakaba 1.12 redo A;
3729     } else {
3730     unless ($self->{internal_subset_tainted}) {
3731     ## XML5: No parse error.
3732     !!!parse-error (type => 'string in internal subset');
3733     $self->{internal_subset_tainted} = 1;
3734     }
3735     ## Stay in the state.
3736     !!!next-input-character;
3737     redo A;
3738     }
3739     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3740     if ($self->{nc} == 0x003E) { # >
3741     $self->{state} = DATA_STATE;
3742     $self->{s_kwd} = '';
3743     !!!next-input-character;
3744 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3745 wakaba 1.12 redo A;
3746     } elsif ($self->{nc} == -1) {
3747     !!!parse-error (type => 'unclosed DOCTYPE');
3748     $self->{state} = DATA_STATE;
3749     $self->{s_kwd} = '';
3750     ## Reconsume.
3751 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3752 wakaba 1.12 redo A;
3753     } else {
3754     ## XML5: No parse error and stay in the state.
3755     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3756    
3757 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3758     !!!next-input-character;
3759     redo A;
3760     }
3761     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3762     if ($self->{nc} == 0x003E) { # >
3763     $self->{state} = DATA_STATE;
3764     $self->{s_kwd} = '';
3765     !!!next-input-character;
3766     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3767     redo A;
3768     } elsif ($self->{nc} == -1) {
3769     $self->{state} = DATA_STATE;
3770     $self->{s_kwd} = '';
3771     ## Reconsume.
3772     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3773     redo A;
3774     } else {
3775     ## Stay in the state.
3776     !!!next-input-character;
3777     redo A;
3778     }
3779     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3780     if ($self->{nc} == 0x0021) { # !
3781 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3782 wakaba 1.13 !!!next-input-character;
3783     redo A;
3784     } elsif ($self->{nc} == 0x003F) { # ?
3785     $self->{state} = PI_STATE;
3786     !!!next-input-character;
3787     redo A;
3788     } elsif ($self->{nc} == -1) {
3789     !!!parse-error (type => 'bare stago');
3790     $self->{state} = DATA_STATE;
3791     $self->{s_kwd} = '';
3792     ## Reconsume.
3793     redo A;
3794     } else {
3795     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3796     line => $self->{line_prev},
3797     column => $self->{column_prev});
3798     $self->{state} = BOGUS_COMMENT_STATE;
3799     $self->{ct} = {type => COMMENT_TOKEN,
3800     data => '',
3801     }; ## NOTE: Will be discarded.
3802 wakaba 1.12 !!!next-input-character;
3803     redo A;
3804     }
3805 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3806     ## XML5: "DOCTYPE markup declaration state".
3807    
3808     if ($self->{nc} == 0x002D) { # -
3809     $self->{state} = MD_HYPHEN_STATE;
3810     !!!next-input-character;
3811     redo A;
3812 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3813     $self->{nc} == 0x0065) { # e
3814 wakaba 1.14 $self->{state} = MD_E_STATE;
3815     $self->{kwd} = chr $self->{nc};
3816     !!!next-input-character;
3817     redo A;
3818 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3819     $self->{nc} == 0x0061) { # a
3820 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3821     $self->{kwd} = chr $self->{nc};
3822     !!!next-input-character;
3823     redo A;
3824 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3825     $self->{nc} == 0x006E) { # n
3826 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3827     $self->{kwd} = chr $self->{nc};
3828     !!!next-input-character;
3829     redo A;
3830     } else {
3831     #
3832     }
3833    
3834     ## XML5: No parse error.
3835     !!!parse-error (type => 'bogus comment',
3836     line => $self->{line_prev},
3837     column => $self->{column_prev} - 1);
3838     ## Reconsume.
3839     $self->{state} = BOGUS_COMMENT_STATE;
3840     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3841     redo A;
3842     } elsif ($self->{state} == MD_E_STATE) {
3843 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3844     $self->{nc} == 0x006E) { # n
3845 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3846     $self->{kwd} .= chr $self->{nc};
3847     !!!next-input-character;
3848     redo A;
3849 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3850     $self->{nc} == 0x006C) { # l
3851 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3852     $self->{state} = MD_ELEMENT_STATE;
3853     $self->{kwd} .= chr $self->{nc};
3854     !!!next-input-character;
3855     redo A;
3856     } else {
3857     ## XML5: No parse error.
3858     !!!parse-error (type => 'bogus comment',
3859     line => $self->{line_prev},
3860     column => $self->{column_prev} - 2
3861     + 1 * ($self->{nc} == -1));
3862     ## Reconsume.
3863     $self->{state} = BOGUS_COMMENT_STATE;
3864     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3865     redo A;
3866     }
3867     } elsif ($self->{state} == MD_ENTITY_STATE) {
3868 wakaba 1.17 if ($self->{nc} == [
3869     undef,
3870     undef,
3871     0x0054, # T
3872     0x0049, # I
3873     0x0054, # T
3874     ]->[length $self->{kwd}] or
3875     $self->{nc} == [
3876     undef,
3877     undef,
3878     0x0074, # t
3879     0x0069, # i
3880     0x0074, # t
3881     ]->[length $self->{kwd}]) {
3882 wakaba 1.14 ## Stay in the state.
3883     $self->{kwd} .= chr $self->{nc};
3884     !!!next-input-character;
3885     redo A;
3886 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3887     ($self->{nc} == 0x0059 or # Y
3888     $self->{nc} == 0x0079)) { # y
3889     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3890     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3891     text => 'ENTITY',
3892     line => $self->{line_prev},
3893     column => $self->{column_prev} - 4);
3894     }
3895     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3896 wakaba 1.14 line => $self->{line_prev},
3897     column => $self->{column_prev} - 6};
3898     $self->{state} = DOCTYPE_MD_STATE;
3899     !!!next-input-character;
3900     redo A;
3901     } else {
3902     !!!parse-error (type => 'bogus comment',
3903     line => $self->{line_prev},
3904     column => $self->{column_prev} - 1
3905     - (length $self->{kwd})
3906     + 1 * ($self->{nc} == -1));
3907     $self->{state} = BOGUS_COMMENT_STATE;
3908     ## Reconsume.
3909     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3910     redo A;
3911     }
3912     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3913 wakaba 1.17 if ($self->{nc} == [
3914     undef,
3915     undef,
3916     0x0045, # E
3917     0x004D, # M
3918     0x0045, # E
3919     0x004E, # N
3920     ]->[length $self->{kwd}] or
3921     $self->{nc} == [
3922     undef,
3923     undef,
3924     0x0065, # e
3925     0x006D, # m
3926     0x0065, # e
3927     0x006E, # n
3928     ]->[length $self->{kwd}]) {
3929 wakaba 1.14 ## Stay in the state.
3930     $self->{kwd} .= chr $self->{nc};
3931     !!!next-input-character;
3932     redo A;
3933 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3934     ($self->{nc} == 0x0054 or # T
3935     $self->{nc} == 0x0074)) { # t
3936     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3937     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3938     text => 'ELEMENT',
3939     line => $self->{line_prev},
3940     column => $self->{column_prev} - 5);
3941     }
3942 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3943     line => $self->{line_prev},
3944 wakaba 1.23 column => $self->{column_prev} - 7};
3945 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3946     !!!next-input-character;
3947     redo A;
3948     } else {
3949     !!!parse-error (type => 'bogus comment',
3950     line => $self->{line_prev},
3951     column => $self->{column_prev} - 1
3952     - (length $self->{kwd})
3953     + 1 * ($self->{nc} == -1));
3954     $self->{state} = BOGUS_COMMENT_STATE;
3955     ## Reconsume.
3956     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3957     redo A;
3958     }
3959     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3960 wakaba 1.17 if ($self->{nc} == [
3961     undef,
3962     0x0054, # T
3963     0x0054, # T
3964     0x004C, # L
3965     0x0049, # I
3966     0x0053, # S
3967     ]->[length $self->{kwd}] or
3968     $self->{nc} == [
3969     undef,
3970     0x0074, # t
3971     0x0074, # t
3972     0x006C, # l
3973     0x0069, # i
3974     0x0073, # s
3975     ]->[length $self->{kwd}]) {
3976 wakaba 1.14 ## Stay in the state.
3977     $self->{kwd} .= chr $self->{nc};
3978     !!!next-input-character;
3979     redo A;
3980 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3981     ($self->{nc} == 0x0054 or # T
3982     $self->{nc} == 0x0074)) { # t
3983     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3984     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3985     text => 'ATTLIST',
3986     line => $self->{line_prev},
3987     column => $self->{column_prev} - 5);
3988     }
3989 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3990 wakaba 1.15 attrdefs => [],
3991 wakaba 1.14 line => $self->{line_prev},
3992 wakaba 1.23 column => $self->{column_prev} - 7};
3993 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3994     !!!next-input-character;
3995     redo A;
3996     } else {
3997     !!!parse-error (type => 'bogus comment',
3998     line => $self->{line_prev},
3999     column => $self->{column_prev} - 1
4000     - (length $self->{kwd})
4001     + 1 * ($self->{nc} == -1));
4002     $self->{state} = BOGUS_COMMENT_STATE;
4003     ## Reconsume.
4004     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4005     redo A;
4006     }
4007     } elsif ($self->{state} == MD_NOTATION_STATE) {
4008 wakaba 1.17 if ($self->{nc} == [
4009     undef,
4010     0x004F, # O
4011     0x0054, # T
4012     0x0041, # A
4013     0x0054, # T
4014     0x0049, # I
4015     0x004F, # O
4016     ]->[length $self->{kwd}] or
4017     $self->{nc} == [
4018     undef,
4019     0x006F, # o
4020     0x0074, # t
4021     0x0061, # a
4022     0x0074, # t
4023     0x0069, # i
4024     0x006F, # o
4025     ]->[length $self->{kwd}]) {
4026 wakaba 1.14 ## Stay in the state.
4027     $self->{kwd} .= chr $self->{nc};
4028     !!!next-input-character;
4029     redo A;
4030 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4031     ($self->{nc} == 0x004E or # N
4032     $self->{nc} == 0x006E)) { # n
4033     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4034     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4035     text => 'NOTATION',
4036     line => $self->{line_prev},
4037     column => $self->{column_prev} - 6);
4038     }
4039 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4040     line => $self->{line_prev},
4041 wakaba 1.23 column => $self->{column_prev} - 8};
4042 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4043     !!!next-input-character;
4044     redo A;
4045     } else {
4046     !!!parse-error (type => 'bogus comment',
4047     line => $self->{line_prev},
4048     column => $self->{column_prev} - 1
4049     - (length $self->{kwd})
4050     + 1 * ($self->{nc} == -1));
4051     $self->{state} = BOGUS_COMMENT_STATE;
4052     ## Reconsume.
4053     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4054     redo A;
4055     }
4056     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4057     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4058     ## "DOCTYPE NOTATION state".
4059    
4060     if ($is_space->{$self->{nc}}) {
4061     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4062     $self->{state} = BEFORE_MD_NAME_STATE;
4063     !!!next-input-character;
4064     redo A;
4065     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4066     $self->{nc} == 0x0025) { # %
4067     ## XML5: Switch to the "DOCTYPE bogus comment state".
4068     !!!parse-error (type => 'no space before md name'); ## TODO: type
4069     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4070     !!!next-input-character;
4071     redo A;
4072     } elsif ($self->{nc} == -1) {
4073     !!!parse-error (type => 'unclosed md'); ## TODO: type
4074     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4075     ## Reconsume.
4076     redo A;
4077     } elsif ($self->{nc} == 0x003E) { # >
4078     ## XML5: Switch to the "DOCTYPE bogus comment state".
4079     !!!parse-error (type => 'no md name'); ## TODO: type
4080     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4081     !!!next-input-character;
4082     redo A;
4083     } else {
4084     ## XML5: Switch to the "DOCTYPE bogus comment state".
4085     !!!parse-error (type => 'no space before md name'); ## TODO: type
4086     $self->{state} = BEFORE_MD_NAME_STATE;
4087     redo A;
4088     }
4089     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4090     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4091     ## before state", "DOCTYPE ATTLIST name before state".
4092    
4093     if ($is_space->{$self->{nc}}) {
4094     ## Stay in the state.
4095     !!!next-input-character;
4096     redo A;
4097     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4098     $self->{nc} == 0x0025) { # %
4099     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4100     !!!next-input-character;
4101     redo A;
4102     } elsif ($self->{nc} == 0x003E) { # >
4103     ## XML5: Same as "Anything else".
4104     !!!parse-error (type => 'no md name'); ## TODO: type
4105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4106     !!!next-input-character;
4107     redo A;
4108     } elsif ($self->{nc} == -1) {
4109     !!!parse-error (type => 'unclosed md'); ## TODO: type
4110     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4111     ## Reconsume.
4112     redo A;
4113     } else {
4114     ## XML5: [ATTLIST] Not defined yet.
4115     $self->{ct}->{name} .= chr $self->{nc};
4116     $self->{state} = MD_NAME_STATE;
4117     !!!next-input-character;
4118     redo A;
4119     }
4120     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4121     if ($is_space->{$self->{nc}}) {
4122     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4123     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4124     $self->{state} = BEFORE_MD_NAME_STATE;
4125     !!!next-input-character;
4126     redo A;
4127     } elsif ($self->{nc} == 0x003E) { # >
4128     ## XML5: Same as "Anything else".
4129     !!!parse-error (type => 'no md name'); ## TODO: type
4130     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4131     !!!next-input-character;
4132     redo A;
4133     } elsif ($self->{nc} == -1) {
4134     !!!parse-error (type => 'unclosed md');
4135     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4136     ## Reconsume.
4137     redo A;
4138     } else {
4139     ## XML5: No parse error.
4140     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4141     $self->{state} = BOGUS_COMMENT_STATE;
4142     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4143     ## Reconsume.
4144     redo A;
4145     }
4146     } elsif ($self->{state} == MD_NAME_STATE) {
4147     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4148    
4149     if ($is_space->{$self->{nc}}) {
4150 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4151     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4152     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4153 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4154 wakaba 1.16 } else { # ENTITY/NOTATION
4155     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4156     }
4157 wakaba 1.14 !!!next-input-character;
4158     redo A;
4159     } elsif ($self->{nc} == 0x003E) { # >
4160     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4161     #
4162     } else {
4163 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4164 wakaba 1.14 }
4165     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4166     !!!next-input-character;
4167     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4168     redo A;
4169     } elsif ($self->{nc} == -1) {
4170     ## XML5: [ATTLIST] No parse error.
4171     !!!parse-error (type => 'unclosed md');
4172     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4173     ## Reconsume.
4174     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4175     redo A;
4176     } else {
4177     ## XML5: [ATTLIST] Not defined yet.
4178     $self->{ct}->{name} .= chr $self->{nc};
4179     ## Stay in the state.
4180     !!!next-input-character;
4181     redo A;
4182     }
4183     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4184     if ($is_space->{$self->{nc}}) {
4185     ## Stay in the state.
4186     !!!next-input-character;
4187     redo A;
4188     } elsif ($self->{nc} == 0x003E) { # >
4189     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4190     !!!next-input-character;
4191     !!!emit ($self->{ct}); # ATTLIST
4192     redo A;
4193     } elsif ($self->{nc} == -1) {
4194     ## XML5: No parse error.
4195     !!!parse-error (type => 'unclosed md'); ## TODO: type
4196     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4197 wakaba 1.15 !!!emit ($self->{ct});
4198     redo A;
4199     } else {
4200     ## XML5: Not defined yet.
4201     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4202     tokens => [],
4203     line => $self->{line}, column => $self->{column}};
4204     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4205     !!!next-input-character;
4206     redo A;
4207     }
4208     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4209     if ($is_space->{$self->{nc}}) {
4210     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4211     !!!next-input-character;
4212     redo A;
4213     } elsif ($self->{nc} == 0x003E) { # >
4214     ## XML5: Same as "anything else".
4215     !!!parse-error (type => 'no attr type'); ## TODO: type
4216     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4217     !!!next-input-character;
4218     !!!emit ($self->{ct}); # ATTLIST
4219     redo A;
4220     } elsif ($self->{nc} == 0x0028) { # (
4221     ## XML5: Same as "anything else".
4222     !!!parse-error (type => 'no space before paren'); ## TODO: type
4223     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4224     !!!next-input-character;
4225     redo A;
4226     } elsif ($self->{nc} == -1) {
4227     ## XML5: No parse error.
4228     !!!parse-error (type => 'unclosed md'); ## TODO: type
4229     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4230     !!!next-input-character;
4231     !!!emit ($self->{ct}); # ATTLIST
4232     redo A;
4233     } else {
4234     ## XML5: Not defined yet.
4235     $self->{ca}->{name} .= chr $self->{nc};
4236     ## Stay in the state.
4237     !!!next-input-character;
4238     redo A;
4239     }
4240     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4241     if ($is_space->{$self->{nc}}) {
4242     ## Stay in the state.
4243     !!!next-input-character;
4244     redo A;
4245     } elsif ($self->{nc} == 0x003E) { # >
4246     ## XML5: Same as "anything else".
4247     !!!parse-error (type => 'no attr type'); ## TODO: type
4248     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4249     !!!next-input-character;
4250     !!!emit ($self->{ct}); # ATTLIST
4251     redo A;
4252     } elsif ($self->{nc} == 0x0028) { # (
4253     ## XML5: Same as "anything else".
4254     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4255     !!!next-input-character;
4256     redo A;
4257     } elsif ($self->{nc} == -1) {
4258     ## XML5: No parse error.
4259     !!!parse-error (type => 'unclosed md'); ## TODO: type
4260     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4261     !!!next-input-character;
4262     !!!emit ($self->{ct});
4263 wakaba 1.14 redo A;
4264     } else {
4265     ## XML5: Not defined yet.
4266 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4267     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4268     !!!next-input-character;
4269     redo A;
4270     }
4271     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4272     if ($is_space->{$self->{nc}}) {
4273     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4274     !!!next-input-character;
4275     redo A;
4276     } elsif ($self->{nc} == 0x0023) { # #
4277     ## XML5: Same as "anything else".
4278     !!!parse-error (type => 'no space before default value'); ## TODO: type
4279     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4280     !!!next-input-character;
4281     redo A;
4282     } elsif ($self->{nc} == 0x0022) { # "
4283     ## XML5: Same as "anything else".
4284     !!!parse-error (type => 'no space before default value'); ## TODO: type
4285     $self->{ca}->{value} = '';
4286     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4287     !!!next-input-character;
4288     redo A;
4289     } elsif ($self->{nc} == 0x0027) { # '
4290     ## XML5: Same as "anything else".
4291     !!!parse-error (type => 'no space before default value'); ## TODO: type
4292     $self->{ca}->{value} = '';
4293     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4294     !!!next-input-character;
4295     redo A;
4296     } elsif ($self->{nc} == 0x003E) { # >
4297     ## XML5: Same as "anything else".
4298     !!!parse-error (type => 'no attr default'); ## TODO: type
4299     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4300     !!!next-input-character;
4301     !!!emit ($self->{ct}); # ATTLIST
4302     redo A;
4303     } elsif ($self->{nc} == 0x0028) { # (
4304     ## XML5: Same as "anything else".
4305     !!!parse-error (type => 'no space before paren'); ## TODO: type
4306     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4307     !!!next-input-character;
4308     redo A;
4309     } elsif ($self->{nc} == -1) {
4310     ## XML5: No parse error.
4311     !!!parse-error (type => 'unclosed md'); ## TODO: type
4312     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4313     !!!next-input-character;
4314     !!!emit ($self->{ct});
4315     redo A;
4316     } else {
4317     ## XML5: Not defined yet.
4318     $self->{ca}->{type} .= chr $self->{nc};
4319     ## Stay in the state.
4320     !!!next-input-character;
4321     redo A;
4322     }
4323     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4324     if ($is_space->{$self->{nc}}) {
4325     ## Stay in the state.
4326     !!!next-input-character;
4327     redo A;
4328     } elsif ($self->{nc} == 0x0028) { # (
4329     ## XML5: Same as "anything else".
4330     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4331     !!!next-input-character;
4332     redo A;
4333     } elsif ($self->{nc} == 0x0023) { # #
4334     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4335     !!!next-input-character;
4336     redo A;
4337     } elsif ($self->{nc} == 0x0022) { # "
4338     ## XML5: Same as "anything else".
4339     $self->{ca}->{value} = '';
4340     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4341     !!!next-input-character;
4342     redo A;
4343     } elsif ($self->{nc} == 0x0027) { # '
4344     ## XML5: Same as "anything else".
4345     $self->{ca}->{value} = '';
4346     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4347     !!!next-input-character;
4348     redo A;
4349     } elsif ($self->{nc} == 0x003E) { # >
4350     ## XML5: Same as "anything else".
4351     !!!parse-error (type => 'no attr default'); ## TODO: type
4352     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4353     !!!next-input-character;
4354     !!!emit ($self->{ct}); # ATTLIST
4355     redo A;
4356     } elsif ($self->{nc} == -1) {
4357     ## XML5: No parse error.
4358     !!!parse-error (type => 'unclosed md'); ## TODO: type
4359     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4360     !!!next-input-character;
4361     !!!emit ($self->{ct});
4362     redo A;
4363     } else {
4364     ## XML5: Switch to the "DOCTYPE bogus comment state".
4365     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4366     $self->{ca}->{value} = '';
4367     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4368     ## Reconsume.
4369     redo A;
4370     }
4371     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4372     if ($is_space->{$self->{nc}}) {
4373     ## Stay in the state.
4374     !!!next-input-character;
4375     redo A;
4376     } elsif ($self->{nc} == 0x007C) { # |
4377     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4378     ## Stay in the state.
4379     !!!next-input-character;
4380     redo A;
4381     } elsif ($self->{nc} == 0x0029) { # )
4382     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4383     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4384     !!!next-input-character;
4385     redo A;
4386     } elsif ($self->{nc} == 0x003E) { # >
4387     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4388     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4389     !!!next-input-character;
4390     !!!emit ($self->{ct}); # ATTLIST
4391     redo A;
4392     } elsif ($self->{nc} == -1) {
4393     ## XML5: No parse error.
4394     !!!parse-error (type => 'unclosed md'); ## TODO: type
4395     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4396     !!!next-input-character;
4397     !!!emit ($self->{ct});
4398     redo A;
4399     } else {
4400     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4401     $self->{state} = ALLOWED_TOKEN_STATE;
4402     !!!next-input-character;
4403     redo A;
4404     }
4405     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4406     if ($is_space->{$self->{nc}}) {
4407     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4408     !!!next-input-character;
4409     redo A;
4410     } elsif ($self->{nc} == 0x007C) { # |
4411     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4412     !!!next-input-character;
4413     redo A;
4414     } elsif ($self->{nc} == 0x0029) { # )
4415     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4416     !!!next-input-character;
4417     redo A;
4418     } elsif ($self->{nc} == 0x003E) { # >
4419     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4420     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4421     !!!next-input-character;
4422     !!!emit ($self->{ct}); # ATTLIST
4423     redo A;
4424     } elsif ($self->{nc} == -1) {
4425     ## XML5: No parse error.
4426     !!!parse-error (type => 'unclosed md'); ## TODO: type
4427     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4428     !!!next-input-character;
4429     !!!emit ($self->{ct});
4430     redo A;
4431     } else {
4432     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4433     ## Stay in the state.
4434     !!!next-input-character;
4435     redo A;
4436     }
4437     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4438     if ($is_space->{$self->{nc}}) {
4439     ## Stay in the state.
4440     !!!next-input-character;
4441     redo A;
4442     } elsif ($self->{nc} == 0x007C) { # |
4443     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4444     !!!next-input-character;
4445     redo A;
4446     } elsif ($self->{nc} == 0x0029) { # )
4447     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4448     !!!next-input-character;
4449     redo A;
4450     } elsif ($self->{nc} == 0x003E) { # >
4451     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4452     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4453     !!!next-input-character;
4454     !!!emit ($self->{ct}); # ATTLIST
4455     redo A;
4456     } elsif ($self->{nc} == -1) {
4457     ## XML5: No parse error.
4458     !!!parse-error (type => 'unclosed md'); ## TODO: type
4459     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4460     !!!next-input-character;
4461     !!!emit ($self->{ct});
4462     redo A;
4463     } else {
4464     !!!parse-error (type => 'space in allowed token', ## TODO: type
4465     line => $self->{line_prev},
4466     column => $self->{column_prev});
4467     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4468     $self->{state} = ALLOWED_TOKEN_STATE;
4469     !!!next-input-character;
4470     redo A;
4471     }
4472     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4473     if ($is_space->{$self->{nc}}) {
4474     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4475     !!!next-input-character;
4476     redo A;
4477     } elsif ($self->{nc} == 0x0023) { # #
4478     !!!parse-error (type => 'no space before default value'); ## TODO: type
4479     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4480     !!!next-input-character;
4481     redo A;
4482     } elsif ($self->{nc} == 0x0022) { # "
4483     !!!parse-error (type => 'no space before default value'); ## TODO: type
4484     $self->{ca}->{value} = '';
4485     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4486     !!!next-input-character;
4487     redo A;
4488     } elsif ($self->{nc} == 0x0027) { # '
4489     !!!parse-error (type => 'no space before default value'); ## TODO: type
4490     $self->{ca}->{value} = '';
4491     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4492     !!!next-input-character;
4493     redo A;
4494     } elsif ($self->{nc} == 0x003E) { # >
4495     !!!parse-error (type => 'no attr default'); ## TODO: type
4496     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4497     !!!next-input-character;
4498     !!!emit ($self->{ct}); # ATTLIST
4499     redo A;
4500     } elsif ($self->{nc} == -1) {
4501     !!!parse-error (type => 'unclosed md'); ## TODO: type
4502     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4503     !!!next-input-character;
4504     !!!emit ($self->{ct});
4505     redo A;
4506     } else {
4507     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4508     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4509     ## Reconsume.
4510     redo A;
4511     }
4512     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4513     if ($is_space->{$self->{nc}}) {
4514     ## Stay in the state.
4515     !!!next-input-character;
4516     redo A;
4517     } elsif ($self->{nc} == 0x0023) { # #
4518     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4519     !!!next-input-character;
4520     redo A;
4521     } elsif ($self->{nc} == 0x0022) { # "
4522     $self->{ca}->{value} = '';
4523     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4524     !!!next-input-character;
4525     redo A;
4526     } elsif ($self->{nc} == 0x0027) { # '
4527     $self->{ca}->{value} = '';
4528     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4529     !!!next-input-character;
4530     redo A;
4531     } elsif ($self->{nc} == 0x003E) { # >
4532     !!!parse-error (type => 'no attr default'); ## TODO: type
4533     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4534     !!!next-input-character;
4535     !!!emit ($self->{ct}); # ATTLIST
4536     redo A;
4537     } elsif ($self->{nc} == -1) {
4538     !!!parse-error (type => 'unclosed md'); ## TODO: type
4539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540     !!!next-input-character;
4541     !!!emit ($self->{ct});
4542     redo A;
4543     } else {
4544     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4545     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4546     ## Reconsume.
4547     redo A;
4548     }
4549     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4550     if ($is_space->{$self->{nc}}) {
4551     ## XML5: No parse error.
4552     !!!parse-error (type => 'no default type'); ## TODO: type
4553 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4554 wakaba 1.14 ## Reconsume.
4555     redo A;
4556 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4557     ## XML5: Same as "anything else".
4558     $self->{ca}->{value} = '';
4559     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4560     !!!next-input-character;
4561     redo A;
4562     } elsif ($self->{nc} == 0x0027) { # '
4563     ## XML5: Same as "anything else".
4564     $self->{ca}->{value} = '';
4565     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4566     !!!next-input-character;
4567     redo A;
4568     } elsif ($self->{nc} == 0x003E) { # >
4569     ## XML5: Same as "anything else".
4570     !!!parse-error (type => 'no attr default'); ## TODO: type
4571     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4572     !!!next-input-character;
4573     !!!emit ($self->{ct}); # ATTLIST
4574     redo A;
4575     } elsif ($self->{nc} == -1) {
4576     ## XML5: No parse error.
4577     !!!parse-error (type => 'unclosed md'); ## TODO: type
4578     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4579     !!!next-input-character;
4580     !!!emit ($self->{ct});
4581     redo A;
4582     } else {
4583     $self->{ca}->{default} = chr $self->{nc};
4584     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4585     !!!next-input-character;
4586     redo A;
4587 wakaba 1.14 }
4588 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4589     if ($is_space->{$self->{nc}}) {
4590     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4591     !!!next-input-character;
4592     redo A;
4593     } elsif ($self->{nc} == 0x0022) { # "
4594     ## XML5: Same as "anything else".
4595     !!!parse-error (type => 'no space before default value'); ## TODO: type
4596     $self->{ca}->{value} = '';
4597     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4598     !!!next-input-character;
4599     redo A;
4600     } elsif ($self->{nc} == 0x0027) { # '
4601     ## XML5: Same as "anything else".
4602     !!!parse-error (type => 'no space before default value'); ## TODO: type
4603     $self->{ca}->{value} = '';
4604     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4605     !!!next-input-character;
4606     redo A;
4607     } elsif ($self->{nc} == 0x003E) { # >
4608     ## XML5: Same as "anything else".
4609     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4610     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4611     !!!next-input-character;
4612     !!!emit ($self->{ct}); # ATTLIST
4613     redo A;
4614     } elsif ($self->{nc} == -1) {
4615     ## XML5: No parse error.
4616     !!!parse-error (type => 'unclosed md'); ## TODO: type
4617     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4618     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4619     !!!next-input-character;
4620     !!!emit ($self->{ct});
4621     redo A;
4622     } else {
4623     $self->{ca}->{default} .= chr $self->{nc};
4624     ## Stay in the state.
4625     !!!next-input-character;
4626     redo A;
4627     }
4628     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4629     if ($is_space->{$self->{nc}}) {
4630     ## Stay in the state.
4631     !!!next-input-character;
4632     redo A;
4633     } elsif ($self->{nc} == 0x0022) { # "
4634     $self->{ca}->{value} = '';
4635     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4636     !!!next-input-character;
4637     redo A;
4638     } elsif ($self->{nc} == 0x0027) { # '
4639     $self->{ca}->{value} = '';
4640     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4641     !!!next-input-character;
4642     redo A;
4643     } elsif ($self->{nc} == 0x003E) { # >
4644     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4645     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4646     !!!next-input-character;
4647     !!!emit ($self->{ct}); # ATTLIST
4648     redo A;
4649     } elsif ($self->{nc} == -1) {
4650     ## XML5: No parse error.
4651     !!!parse-error (type => 'unclosed md'); ## TODO: type
4652     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4653     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4654     !!!next-input-character;
4655     !!!emit ($self->{ct});
4656     redo A;
4657     } else {
4658     ## XML5: Not defined yet.
4659     if ($self->{ca}->{default} eq 'FIXED') {
4660     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4661     } else {
4662     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4663     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4664     }
4665     ## Reconsume.
4666     redo A;
4667     }
4668     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4669     if ($is_space->{$self->{nc}} or
4670     $self->{nc} == -1 or
4671     $self->{nc} == 0x003E) { # >
4672     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4673     ## Reconsume.
4674     redo A;
4675     } else {
4676     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4677     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4678     ## Reconsume.
4679     redo A;
4680 wakaba 1.16 }
4681 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4682     ## ASCII case-insensitive
4683     if ($self->{nc} == [
4684     undef,
4685     0x0044, # D
4686     0x0041, # A
4687     0x0054, # T
4688     ]->[length $self->{kwd}] or
4689     $self->{nc} == [
4690     undef,
4691     0x0064, # d
4692     0x0061, # a
4693     0x0074, # t
4694     ]->[length $self->{kwd}]) {
4695     !!!cp (172.2);
4696     ## Stay in the state.
4697     $self->{kwd} .= chr $self->{nc};
4698     !!!next-input-character;
4699     redo A;
4700     } elsif ((length $self->{kwd}) == 4 and
4701     ($self->{nc} == 0x0041 or # A
4702     $self->{nc} == 0x0061)) { # a
4703     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4704     !!!cp (172.3);
4705     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4706     text => 'NDATA',
4707     line => $self->{line_prev},
4708     column => $self->{column_prev} - 4);
4709     } else {
4710     !!!cp (172.4);
4711     }
4712     $self->{state} = AFTER_NDATA_STATE;
4713     !!!next-input-character;
4714     redo A;
4715     } else {
4716     !!!parse-error (type => 'string after literal', ## TODO: type
4717     line => $self->{line_prev},
4718     column => $self->{column_prev} + 1
4719     - length $self->{kwd});
4720     !!!cp (172.5);
4721     $self->{state} = BOGUS_MD_STATE;
4722     ## Reconsume.
4723     redo A;
4724     }
4725     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4726     if ($is_space->{$self->{nc}}) {
4727     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4728     !!!next-input-character;
4729     redo A;
4730     } elsif ($self->{nc} == 0x003E) { # >
4731     !!!parse-error (type => 'no notation name'); ## TODO: type
4732     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4733     !!!next-input-character;
4734     !!!emit ($self->{ct}); # ENTITY
4735     redo A;
4736     } elsif ($self->{nc} == -1) {
4737     !!!parse-error (type => 'unclosed md'); ## TODO: type
4738     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4739     !!!next-input-character;
4740     !!!emit ($self->{ct}); # ENTITY
4741     redo A;
4742     } else {
4743     !!!parse-error (type => 'string after literal', ## TODO: type
4744     line => $self->{line_prev},
4745     column => $self->{column_prev} + 1
4746     - length $self->{kwd});
4747     $self->{state} = BOGUS_MD_STATE;
4748     ## Reconsume.
4749     redo A;
4750     }
4751     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4752     if ($is_space->{$self->{nc}}) {
4753     ## Stay in the state.
4754     !!!next-input-character;
4755     redo A;
4756     } elsif ($self->{nc} == 0x003E) { # >
4757     !!!parse-error (type => 'no notation name'); ## TODO: type
4758     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4759     !!!next-input-character;
4760     !!!emit ($self->{ct}); # ENTITY
4761     redo A;
4762     } elsif ($self->{nc} == -1) {
4763     !!!parse-error (type => 'unclosed md'); ## TODO: type
4764     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4765     !!!next-input-character;
4766     !!!emit ($self->{ct}); # ENTITY
4767     redo A;
4768     } else {
4769     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4770     $self->{state} = NOTATION_NAME_STATE;
4771     !!!next-input-character;
4772     redo A;
4773     }
4774     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4775     if ($is_space->{$self->{nc}}) {
4776 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4777 wakaba 1.18 !!!next-input-character;
4778     redo A;
4779     } elsif ($self->{nc} == 0x003E) { # >
4780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4781     !!!next-input-character;
4782     !!!emit ($self->{ct}); # ENTITY
4783     redo A;
4784     } elsif ($self->{nc} == -1) {
4785     !!!parse-error (type => 'unclosed md'); ## TODO: type
4786     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4787     !!!next-input-character;
4788     !!!emit ($self->{ct}); # ENTITY
4789     redo A;
4790     } else {
4791     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4792     ## Stay in the state.
4793     !!!next-input-character;
4794     redo A;
4795     }
4796 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4797     if ($self->{nc} == 0x0022) { # "
4798 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4799 wakaba 1.19 !!!next-input-character;
4800     redo A;
4801     } elsif ($self->{nc} == 0x0026) { # &
4802     $self->{prev_state} = $self->{state};
4803     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4804     $self->{entity_add} = 0x0022; # "
4805     !!!next-input-character;
4806     redo A;
4807     ## TODO: %
4808     } elsif ($self->{nc} == -1) {
4809     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4810     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4811     ## Reconsume.
4812     !!!emit ($self->{ct}); # ENTITY
4813     redo A;
4814     } else {
4815     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4816     !!!next-input-character;
4817     redo A;
4818     }
4819     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4820     if ($self->{nc} == 0x0027) { # '
4821 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4822 wakaba 1.19 !!!next-input-character;
4823     redo A;
4824     } elsif ($self->{nc} == 0x0026) { # &
4825     $self->{prev_state} = $self->{state};
4826     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4827     $self->{entity_add} = 0x0027; # '
4828     !!!next-input-character;
4829     redo A;
4830     ## TODO: %
4831     } elsif ($self->{nc} == -1) {
4832     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4833     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4834     ## Reconsume.
4835     !!!emit ($self->{ct}); # ENTITY
4836     redo A;
4837     } else {
4838     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4839     !!!next-input-character;
4840     redo A;
4841     }
4842     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4843     if ($is_space->{$self->{nc}} or
4844     {
4845     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4846     $self->{entity_add} => 1,
4847     }->{$self->{nc}}) {
4848 wakaba 1.22 !!!parse-error (type => 'bare ero',
4849     line => $self->{line_prev},
4850     column => $self->{column_prev}
4851     + ($self->{nc} == -1 ? 1 : 0));
4852 wakaba 1.19 ## Don't consume
4853     ## Return nothing.
4854     #
4855     } elsif ($self->{nc} == 0x0023) { # #
4856     $self->{ca} = $self->{ct};
4857     $self->{state} = ENTITY_HASH_STATE;
4858     $self->{kwd} = '#';
4859     !!!next-input-character;
4860     redo A;
4861     } else {
4862     #
4863     }
4864    
4865     $self->{ct}->{value} .= '&';
4866     $self->{state} = $self->{prev_state};
4867     ## Reconsume.
4868     redo A;
4869 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4870     if ($is_space->{$self->{nc}}) {
4871     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4872     !!!next-input-character;
4873     redo A;
4874     } elsif ($self->{nc} == 0x0028) { # (
4875     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4876     $self->{ct}->{content} = ['('];
4877     $self->{group_depth} = 1;
4878     !!!next-input-character;
4879     redo A;
4880     } elsif ($self->{nc} == 0x003E) { # >
4881     !!!parse-error (type => 'no md def'); ## TODO: type
4882     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4883     !!!next-input-character;
4884     !!!emit ($self->{ct}); # ELEMENT
4885     redo A;
4886     } elsif ($self->{nc} == -1) {
4887     !!!parse-error (type => 'unclosed md'); ## TODO: type
4888     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4889     !!!next-input-character;
4890     !!!emit ($self->{ct}); # ELEMENT
4891     redo A;
4892     } else {
4893     $self->{ct}->{content} = [chr $self->{nc}];
4894     $self->{state} = CONTENT_KEYWORD_STATE;
4895     !!!next-input-character;
4896     redo A;
4897     }
4898     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4899     if ($is_space->{$self->{nc}}) {
4900     $self->{state} = AFTER_MD_DEF_STATE;
4901     !!!next-input-character;
4902     redo A;
4903     } elsif ($self->{nc} == 0x003E) { # >
4904     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4905     !!!next-input-character;
4906     !!!emit ($self->{ct}); # ELEMENT
4907     redo A;
4908     } elsif ($self->{nc} == -1) {
4909     !!!parse-error (type => 'unclosed md'); ## TODO: type
4910     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4911     !!!next-input-character;
4912     !!!emit ($self->{ct}); # ELEMENT
4913     redo A;
4914     } else {
4915     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4916     ## Stay in the state.
4917     !!!next-input-character;
4918     redo A;
4919     }
4920     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4921     if ($is_space->{$self->{nc}}) {
4922     ## Stay in the state.
4923     !!!next-input-character;
4924     redo A;
4925     } elsif ($self->{nc} == 0x0028) { # (
4926     $self->{group_depth}++;
4927     push @{$self->{ct}->{content}}, chr $self->{nc};
4928     ## Stay in the state.
4929     !!!next-input-character;
4930     redo A;
4931     } elsif ($self->{nc} == 0x007C or # |
4932     $self->{nc} == 0x002C) { # ,
4933     !!!parse-error (type => 'empty element name'); ## TODO: type
4934     ## Stay in the state.
4935     !!!next-input-character;
4936     redo A;
4937     } elsif ($self->{nc} == 0x0029) { # )
4938     !!!parse-error (type => 'empty element name'); ## TODO: type
4939     push @{$self->{ct}->{content}}, chr $self->{nc};
4940     $self->{group_depth}--;
4941     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4942     !!!next-input-character;
4943     redo A;
4944     } elsif ($self->{nc} == 0x003E) { # >
4945     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4946     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4947     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4948     !!!next-input-character;
4949     !!!emit ($self->{ct}); # ELEMENT
4950     redo A;
4951     } elsif ($self->{nc} == -1) {
4952     !!!parse-error (type => 'unclosed md'); ## TODO: type
4953     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4955     !!!next-input-character;
4956     !!!emit ($self->{ct}); # ELEMENT
4957     redo A;
4958     } else {
4959     push @{$self->{ct}->{content}}, chr $self->{nc};
4960     $self->{state} = CM_ELEMENT_NAME_STATE;
4961     !!!next-input-character;
4962     redo A;
4963     }
4964     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4965     if ($is_space->{$self->{nc}}) {
4966     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4967     !!!next-input-character;
4968     redo A;
4969     } elsif ($self->{nc} == 0x002A or # *
4970     $self->{nc} == 0x002B or # +
4971     $self->{nc} == 0x003F) { # ?
4972     push @{$self->{ct}->{content}}, chr $self->{nc};
4973     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4974     !!!next-input-character;
4975     redo A;
4976     } elsif ($self->{nc} == 0x007C or # |
4977     $self->{nc} == 0x002C) { # ,
4978     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4979     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4980     !!!next-input-character;
4981     redo A;
4982     } elsif ($self->{nc} == 0x0029) { # )
4983     $self->{group_depth}--;
4984     push @{$self->{ct}->{content}}, chr $self->{nc};
4985     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4986     !!!next-input-character;
4987     redo A;
4988     } elsif ($self->{nc} == 0x003E) { # >
4989     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4990     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4991     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4992     !!!next-input-character;
4993     !!!emit ($self->{ct}); # ELEMENT
4994     redo A;
4995     } elsif ($self->{nc} == -1) {
4996     !!!parse-error (type => 'unclosed md'); ## TODO: type
4997     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4999     !!!next-input-character;
5000     !!!emit ($self->{ct}); # ELEMENT
5001     redo A;
5002     } else {
5003     $self->{ct}->{content}->[-1] .= chr $self->{nc};
5004     ## Stay in the state.
5005     !!!next-input-character;
5006     redo A;
5007     }
5008     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5009     if ($is_space->{$self->{nc}}) {
5010     ## Stay in the state.
5011     !!!next-input-character;
5012     redo A;
5013     } elsif ($self->{nc} == 0x007C or # |
5014     $self->{nc} == 0x002C) { # ,
5015     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5016     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5017     !!!next-input-character;
5018     redo A;
5019     } elsif ($self->{nc} == 0x0029) { # )
5020     $self->{group_depth}--;
5021     push @{$self->{ct}->{content}}, chr $self->{nc};
5022     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5023     !!!next-input-character;
5024     redo A;
5025     } elsif ($self->{nc} == 0x003E) { # >
5026     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5027     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5028     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5029     !!!next-input-character;
5030     !!!emit ($self->{ct}); # ELEMENT
5031     redo A;
5032     } elsif ($self->{nc} == -1) {
5033     !!!parse-error (type => 'unclosed md'); ## TODO: type
5034     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5035     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5036     !!!next-input-character;
5037     !!!emit ($self->{ct}); # ELEMENT
5038     redo A;
5039     } else {
5040     !!!parse-error (type => 'after element name'); ## TODO: type
5041     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5042     $self->{state} = BOGUS_MD_STATE;
5043     !!!next-input-character;
5044     redo A;
5045     }
5046     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5047     if ($is_space->{$self->{nc}}) {
5048     if ($self->{group_depth}) {
5049     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5050     } else {
5051     $self->{state} = AFTER_MD_DEF_STATE;
5052     }
5053     !!!next-input-character;
5054     redo A;
5055     } elsif ($self->{nc} == 0x002A or # *
5056     $self->{nc} == 0x002B or # +
5057     $self->{nc} == 0x003F) { # ?
5058     push @{$self->{ct}->{content}}, chr $self->{nc};
5059     if ($self->{group_depth}) {
5060     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5061     } else {
5062     $self->{state} = AFTER_MD_DEF_STATE;
5063     }
5064     !!!next-input-character;
5065     redo A;
5066     } elsif ($self->{nc} == 0x0029) { # )
5067     if ($self->{group_depth}) {
5068     $self->{group_depth}--;
5069     push @{$self->{ct}->{content}}, chr $self->{nc};
5070     ## Stay in the state.
5071     !!!next-input-character;
5072     redo A;
5073     } else {
5074     !!!parse-error (type => 'string after md def'); ## TODO: type
5075     $self->{state} = BOGUS_MD_STATE;
5076     ## Reconsume.
5077     redo A;
5078     }
5079     } elsif ($self->{nc} == 0x003E) { # >
5080     if ($self->{group_depth}) {
5081     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5082     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5083     }
5084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5085     !!!next-input-character;
5086     !!!emit ($self->{ct}); # ELEMENT
5087     redo A;
5088     } elsif ($self->{nc} == -1) {
5089     !!!parse-error (type => 'unclosed md'); ## TODO: type
5090     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5091     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5092     !!!next-input-character;
5093     !!!emit ($self->{ct}); # ELEMENT
5094     redo A;
5095     } else {
5096     if ($self->{group_depth}) {
5097     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5098     } else {
5099     !!!parse-error (type => 'string after md def'); ## TODO: type
5100     $self->{state} = BOGUS_MD_STATE;
5101     }
5102     ## Reconsume.
5103     redo A;
5104     }
5105     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5106 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5107     ## Stay in the state.
5108     !!!next-input-character;
5109     redo A;
5110     } elsif ($self->{nc} == 0x003E) { # >
5111     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5112     !!!next-input-character;
5113 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5114 wakaba 1.18 redo A;
5115     } elsif ($self->{nc} == -1) {
5116     !!!parse-error (type => 'unclosed md'); ## TODO: type
5117     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5118     !!!next-input-character;
5119 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5120 wakaba 1.18 redo A;
5121     } else {
5122 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5123 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5124     ## Reconsume.
5125     redo A;
5126     }
5127 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5128     if ($self->{nc} == 0x003E) { # >
5129     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5130     !!!next-input-character;
5131     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5132     redo A;
5133     } elsif ($self->{nc} == -1) {
5134     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5135     ## Reconsume.
5136     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5137     redo A;
5138     } else {
5139     ## Stay in the state.
5140     !!!next-input-character;
5141     redo A;
5142     }
5143 wakaba 1.1 } else {
5144     die "$0: $self->{state}: Unknown state";
5145     }
5146     } # A
5147    
5148     die "$0: _get_next_token: unexpected case";
5149     } # _get_next_token
5150    
5151     1;
5152 wakaba 1.29 ## $Date: 2009/07/05 04:38:45 $
5153 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24