/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.27 - (hide annotations) (download) (as text)
Thu Jul 2 22:24:28 2009 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.26: +2 -6 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/HTML/ChangeLog	2 Jul 2009 22:24:21 -0000
	* Tokenizer.pm.src: Reduced a parse error (HTML5 revision 3194).

2009-07-03  Wakaba  <wakaba@suika.fam.cx>

## Tokenizer package.  This part of the file only declares the token
## type constants and makes them importable, either by name or all at
## once through the |:token| export tag.
1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
## |$VERSION| is derived from the CVS |Revision| keyword: every run of
## digits is extracted, then formatted as the first number, a dot, and
## each remaining number zero-padded to two digits.
3 wakaba 1.27 our $VERSION=do{my @r=(q$Revision: 1.26 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## The export tables are filled in inside a |BEGIN| block, i.e. at
## compile time, so they are already populated when the |BEGIN|-time
## |import| call later in this file runs.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag lists the same names as |@EXPORT_OK| above; the two
## lists are maintained by hand and must be kept in sync.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Values stored in a token's |->{type}| field.  Each constant is a sub
## with an empty prototype and a literal body, a form perl can inline at
## compile time.  The values are the distinct integers 1..14.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
## From here on, code is compiled into package |Whatpm::HTML|, not
## |Whatpm::HTML::Tokenizer|; the subs defined below live in that package.
73     package Whatpm::HTML;
74    
## Pull the token type constants into this package at compile time so
## that the bareword constant names used below resolve.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## Three independent bits; the content models below are combinations of
## them and are tested with bitwise |&| in the tokenizer.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
## PLAINTEXT: no bit set.  CDATA: limited markup only.  RCDATA: entities
## plus limited markup.  PCDATA: entities plus full markup.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Numeric identifiers for the tokenizer states; |_get_next_token|
## compares them against |$self->{state}|.  Values 1 and 12 are not
## assigned: the constants that would hold them are commented out.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
## |PCDATA_STATE| is not one of the six character reference states
## above; |_get_next_token| handles it as a variant of |DATA_STATE| for
## the PCDATA content model only.
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
## State identifiers 51..101, continuing the numbering of the states
## defined above.  Per the section heading these are the XML-only
## states (processing instructions and DOCTYPE internal subset markup).
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## Bit flags duplicated here from the tree constructor.
## NOTE(review): the two literals below appear to denote the same bit
## (a 1 followed by eleven 0s; the |_| is only a digit separator).
## Presumably they belong to different flag namespaces (insertion mode
## vs. element category) -- confirm against the definitions in
## Whatpm::HTML before changing either value.
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205    
## Replacement table keyed by code point: each key maps to the code
## point that is to be used in its place.  The table is not consulted
## within this part of the file.
##
## Explicit entries: CR (0x0D) becomes LF, and each code point in
## 0x80..0x9F becomes the character listed (these match the
## Windows-1252 assignments), with the unassigned slots 0x81, 0x8D,
## 0x8F, 0x90 and 0x9D becoming U+FFFD.
206     my $charref_map = {
207     0x0D => 0x000A,
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
## Bulk entries, all mapped to U+FFFD: the control characters listed
## (HT, LF, FF and CR are not in the list), DEL, the surrogate range,
## 0xFDD0..0xFDDF, and the last two code points of every plane.
## NOTE(review): the Unicode noncharacter block is U+FDD0..U+FDEF, but
## the range here stops at 0xFDDF; the existing "ISSUE: 0xFDEF" marker
## appears to flag exactly this -- confirm against the spec revision
## this file targets before widening the range.
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Resets the tokenizer-related fields of |$self| to their
## start-of-input values: data state, PCDATA content model, no current
## token or attribute, empty character buffer and empty token queue.
## Fields marked "initialized when used" are left untouched here.
##
## Takes the object as its only argument.  The |($)| prototype has no
## effect if this sub is invoked as a method.
252     sub _initialize_tokenizer ($) {
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model flag
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing};
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character
275     #$self->{next_nc}
## NOTE(review): |!!!next-input-character| is not Perl; presumably it is
## a macro expanded when this .src file is preprocessed, and presumably
## it loads the first input character into |$self->{nc}| -- confirm
## against the preprocessor definition.
276     !!!next-input-character;
## Queue of already-built tokens; |_get_next_token| returns entries
## from the front of this array before tokenizing further input.
277     $self->{token} = [];
278     # $self->{escape}
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
302     ## while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
## Lookup table of the code points the tokenizer treats as space
## characters; it is indexed with |$self->{nc}|.  The entries for VT
## and CR are commented out, so neither counts as a space here.
315     my $is_space = {
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec does not explicitly
736     ## state that the next input character is to be reconsumed,
737     ## it will be included in the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
        ## Before-attribute-name state.  White space is skipped; |>| or EOF
        ## finishes the tag and emits the current token; |/| switches to
        ## the self-closing state; any other character becomes the first
        ## character of a new current attribute ($self->{ca}).
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
        ## End of the tag.  A start tag records its name in
        ## {last_stag_name}; an end tag resets the content model and
        ## reports an error if it carries attributes.
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
        ## An upper-case ASCII letter starts a new attribute; it is
        ## lower-cased (+0x0020) unless {is_xml} is set.  The current
        ## line/column are stored with the attribute.
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
        ## EOF inside the tag: report 'unclosed tag', emit what has been
        ## collected so far, and switch to DATA_STATE without consuming
        ## another character.
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
        ## Any other character starts a new attribute name as is.  |"|,
        ## |'| and |=| are additionally reported as 'bad attribute name'
        ## but are still used as the first character of the name.
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951     0x003D => 1, # =
952     }->{$self->{nc}}) {
953     !!!cp (55);
954 wakaba 1.11 ## XML5: Not a parse error.
955 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
956     } else {
957     !!!cp (56);
958 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
959 wakaba 1.1 }
960     $self->{ca}
961     = {name => chr ($self->{nc}),
962     value => '',
963     line => $self->{line}, column => $self->{column}};
964     $self->{state} = ATTRIBUTE_NAME_STATE;
965     !!!next-input-character;
966     redo A;
967     }
968     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
        ## Attribute-name state.  Characters are appended to the name of the
        ## current attribute ($self->{ca}).  Every branch that leaves the
        ## state calls $before_leave first, which registers the finished
        ## attribute on the current token.
969 wakaba 1.11 ## XML5: "Tag attribute name state".
970    
        ## $before_leave: store $self->{ca} in $self->{ct}->{attributes}
        ## under its name and give it the next {index}.  If an attribute of
        ## the same name is already present, report 'duplicate attribute'
        ## (at the position recorded in $self->{ca}) and keep the earlier
        ## one.
971 wakaba 1.1 my $before_leave = sub {
972     if (exists $self->{ct}->{attributes} # start tag or end tag
973     ->{$self->{ca}->{name}}) { # MUST
974     !!!cp (57);
975     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976     ## Discard $self->{ca} # MUST
977     } else {
978     !!!cp (58);
979     $self->{ct}->{attributes}->{$self->{ca}->{name}}
980     = $self->{ca};
981 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 wakaba 1.1 }
983     }; # $before_leave
984    
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (59);
987     $before_leave->();
988     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003D) { # =
992     !!!cp (60);
993     $before_leave->();
994     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003E) { # >
        ## The tag ends right after an attribute name.  In XML mode the
        ## missing value is reported as an error.
998 wakaba 1.11 if ($self->{is_xml}) {
999     !!!cp (60.1);
1000     ## XML5: Not a parse error.
1001     !!!parse-error (type => 'no attr value'); ## TODO: type
1002     } else {
1003     !!!cp (60.2);
1004     }
1005    
1006 wakaba 1.1 $before_leave->();
1007     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008     !!!cp (61);
1009     $self->{last_stag_name} = $self->{ct}->{tag_name};
1010     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011     !!!cp (62);
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
        ## NOTE(review): $before_leave has just run and touches
        ## {attributes}, so this test looks always true here - confirm.
1013     if ($self->{ct}->{attributes}) {
1014     !!!parse-error (type => 'end tag attribute');
1015     }
1016     } else {
1017     die "$0: $self->{ct}->{type}: Unknown token type";
1018     }
1019     $self->{state} = DATA_STATE;
1020 wakaba 1.5 $self->{s_kwd} = '';
1021 wakaba 1.1 !!!next-input-character;
1022    
1023     !!!emit ($self->{ct}); # start tag or end tag
1024    
1025     redo A;
1026     } elsif (0x0041 <= $self->{nc} and
1027     $self->{nc} <= 0x005A) { # A..Z
        ## Upper-case ASCII letters are lower-cased (+0x0020) unless
        ## {is_xml} is set.
1028     !!!cp (63);
1029 wakaba 1.4 $self->{ca}->{name}
1030     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 wakaba 1.1 ## Stay in the state
1032     !!!next-input-character;
1033     redo A;
1034     } elsif ($self->{nc} == 0x002F) { # /
1035 wakaba 1.11 if ($self->{is_xml}) {
1036     !!!cp (64);
1037     ## XML5: Not a parse error.
1038     !!!parse-error (type => 'no attr value'); ## TODO: type
1039     } else {
1040     !!!cp (64.1);
1041     }
1042    
1043 wakaba 1.1 $before_leave->();
1044     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045     !!!next-input-character;
1046     redo A;
1047     } elsif ($self->{nc} == -1) {
        ## EOF inside the attribute name: report 'unclosed tag', register
        ## the attribute, emit the token and switch to DATA_STATE without
        ## consuming another character.
1048     !!!parse-error (type => 'unclosed tag');
1049     $before_leave->();
1050     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051     !!!cp (66);
1052     $self->{last_stag_name} = $self->{ct}->{tag_name};
1053     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055     if ($self->{ct}->{attributes}) {
1056     !!!cp (67);
1057     !!!parse-error (type => 'end tag attribute');
1058     } else {
1059     ## NOTE: This state should never be reached.
1060     !!!cp (68);
1061     }
1062     } else {
1063     die "$0: $self->{ct}->{type}: Unknown token type";
1064     }
1065     $self->{state} = DATA_STATE;
1066 wakaba 1.5 $self->{s_kwd} = '';
1067 wakaba 1.1 # reconsume
1068    
1069     !!!emit ($self->{ct}); # start tag or end tag
1070    
1071     redo A;
1072     } else {
        ## Any other character is appended to the name unchanged; |"| and
        ## |'| are additionally reported as 'bad attribute name'.
1073     if ($self->{nc} == 0x0022 or # "
1074     $self->{nc} == 0x0027) { # '
1075     !!!cp (69);
1076 wakaba 1.11 ## XML5: Not a parse error.
1077 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1078     } else {
1079     !!!cp (70);
1080     }
1081     $self->{ca}->{name} .= chr ($self->{nc});
1082     ## Stay in the state
1083     !!!next-input-character;
1084     redo A;
1085     }
1086     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
        ## After-attribute-name state: reached when white space followed an
        ## attribute name.  |=| leads to the value; |>|, |/| or EOF end
        ## the tag; anything else starts another attribute.  In XML mode
        ## every exit other than |=| and EOF reports that the previous
        ## attribute has no value.
1087 wakaba 1.11 ## XML5: "Tag attribute name after state".
1088    
1089 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1090     !!!cp (71);
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003D) { # =
1095     !!!cp (72);
1096     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003E) { # >
1100 wakaba 1.11 if ($self->{is_xml}) {
1101     !!!cp (72.1);
1102     ## XML5: Not a parse error.
1103     !!!parse-error (type => 'no attr value'); ## TODO: type
1104     } else {
1105     !!!cp (72.2);
1106     }
1107    
        ## Finish the tag: remember the start tag name, or reset the
        ## content model for an end tag, then emit the token.
1108 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109     !!!cp (73);
1110     $self->{last_stag_name} = $self->{ct}->{tag_name};
1111     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     !!!cp (74);
1115     !!!parse-error (type => 'end tag attribute');
1116     } else {
1117     ## NOTE: This state should never be reached.
1118     !!!cp (75);
1119     }
1120     } else {
1121     die "$0: $self->{ct}->{type}: Unknown token type";
1122     }
1123     $self->{state} = DATA_STATE;
1124 wakaba 1.5 $self->{s_kwd} = '';
1125 wakaba 1.1 !!!next-input-character;
1126    
1127     !!!emit ($self->{ct}); # start tag or end tag
1128    
1129     redo A;
1130     } elsif (0x0041 <= $self->{nc} and
1131     $self->{nc} <= 0x005A) { # A..Z
        ## An upper-case ASCII letter starts the next attribute; it is
        ## lower-cased (+0x0020) unless {is_xml} is set.
        ## NOTE(review): unlike the final |else| branch below, this branch
        ## does not report 'no attr value' in XML mode - confirm intended.
1132     !!!cp (76);
1133     $self->{ca}
1134 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 wakaba 1.1 value => '',
1136     line => $self->{line}, column => $self->{column}};
1137     $self->{state} = ATTRIBUTE_NAME_STATE;
1138     !!!next-input-character;
1139     redo A;
1140     } elsif ($self->{nc} == 0x002F) { # /
1141 wakaba 1.11 if ($self->{is_xml}) {
1142     !!!cp (77);
1143     ## XML5: Not a parse error.
1144     !!!parse-error (type => 'no attr value'); ## TODO: type
1145     } else {
1146     !!!cp (77.1);
1147     }
1148    
1149 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150     !!!next-input-character;
1151     redo A;
1152     } elsif ($self->{nc} == -1) {
        ## EOF inside the tag: report 'unclosed tag', emit the token and
        ## switch to DATA_STATE without consuming another character.
1153     !!!parse-error (type => 'unclosed tag');
1154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155     !!!cp (79);
1156     $self->{last_stag_name} = $self->{ct}->{tag_name};
1157     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159     if ($self->{ct}->{attributes}) {
1160     !!!cp (80);
1161     !!!parse-error (type => 'end tag attribute');
1162     } else {
1163     ## NOTE: This state should never be reached.
1164     !!!cp (81);
1165     }
1166     } else {
1167     die "$0: $self->{ct}->{type}: Unknown token type";
1168     }
1169 wakaba 1.5 $self->{s_kwd} = '';
1170 wakaba 1.1 $self->{state} = DATA_STATE;
1171     # reconsume
1172    
1173     !!!emit ($self->{ct}); # start tag or end tag
1174    
1175     redo A;
1176     } else {
        ## Any other character starts the next attribute.  In XML mode the
        ## previous attribute's missing value is reported first; |"| and
        ## |'| are additionally reported as 'bad attribute name'.
1177 wakaba 1.11 if ($self->{is_xml}) {
1178     !!!cp (78.1);
1179     ## XML5: Not a parse error.
1180     !!!parse-error (type => 'no attr value'); ## TODO: type
1181     } else {
1182     !!!cp (78.2);
1183     }
1184    
1185 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1186     $self->{nc} == 0x0027) { # '
1187     !!!cp (78);
1188 wakaba 1.11 ## XML5: Not a parse error.
1189 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1190     } else {
1191     !!!cp (82);
1192     }
1193     $self->{ca}
1194     = {name => chr ($self->{nc}),
1195     value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198     !!!next-input-character;
1199     redo A;
1200     }
1201     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
        ## Before-attribute-value state: reached after |=|.  Skips white
        ## space and then selects the double-quoted, single-quoted or
        ## unquoted value state according to the next character.
1202 wakaba 1.11 ## XML5: "Tag attribute value before state".
1203    
1204 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1205     !!!cp (83);
1206     ## Stay in the state
1207     !!!next-input-character;
1208     redo A;
1209     } elsif ($self->{nc} == 0x0022) { # "
1210     !!!cp (84);
1211     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x0026) { # &
        ## The |&| is not consumed here, so the unquoted value state sees
        ## it and handles the character reference.
1215     !!!cp (85);
1216     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217     ## reconsume
1218     redo A;
1219     } elsif ($self->{nc} == 0x0027) { # '
1220     !!!cp (86);
1221     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222     !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{nc} == 0x003E) { # >
        ## |=| immediately followed by |>|: report the empty value, then
        ## finish and emit the tag.
1225     !!!parse-error (type => 'empty unquoted attribute value');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227     !!!cp (87);
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232     !!!cp (88);
1233     !!!parse-error (type => 'end tag attribute');
1234     } else {
1235     ## NOTE: This state should never be reached.
1236     !!!cp (89);
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 !!!next-input-character;
1244    
1245     !!!emit ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } elsif ($self->{nc} == -1) {
        ## EOF inside the tag: report 'unclosed tag', emit the token and
        ## switch to DATA_STATE without consuming another character.
1249     !!!parse-error (type => 'unclosed tag');
1250     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251     !!!cp (90);
1252     $self->{last_stag_name} = $self->{ct}->{tag_name};
1253     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255     if ($self->{ct}->{attributes}) {
1256     !!!cp (91);
1257     !!!parse-error (type => 'end tag attribute');
1258     } else {
1259     ## NOTE: This state should never be reached.
1260     !!!cp (92);
1261     }
1262     } else {
1263     die "$0: $self->{ct}->{type}: Unknown token type";
1264     }
1265     $self->{state} = DATA_STATE;
1266 wakaba 1.5 $self->{s_kwd} = '';
1267 wakaba 1.1 ## reconsume
1268    
1269     !!!emit ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } else {
        ## Any other character is the first character of an unquoted value.
        ## |=| and |<| are reported as 'bad attribute value'; otherwise,
        ## in XML mode, the lack of quotes itself is reported.  Only one of
        ## the two errors is raised for a given character.
1273 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274 wakaba 1.1 !!!cp (93);
1275 wakaba 1.11 ## XML5: Not a parse error.
1276 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1277 wakaba 1.11 } elsif ($self->{is_xml}) {
1278     !!!cp (93.1);
1279     ## XML5: No parse error.
1280     !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 wakaba 1.1 } else {
1282     !!!cp (94);
1283     }
1284     $self->{ca}->{value} .= chr ($self->{nc});
1285     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286     !!!next-input-character;
1287     redo A;
1288     }
1289     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
        ## Double-quoted attribute value state.  Used both for attributes of
        ## start/end tags and for default values in ATTLIST declarations
        ## (current token of type ATTLIST_TOKEN).  Characters are appended
        ## to $self->{ca}->{value} until the closing |"| or EOF.
1290 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291     ## ATTLIST attribute value double quoted state".
1292 wakaba 1.11
1293 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
        ## Closing quote.  For an ATTLIST token the finished definition is
        ## appended to {attrdefs}.
1294 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295     !!!cp (95.1);
1296     ## XML5: "DOCTYPE ATTLIST name after state".
1297     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299     } else {
1300     !!!cp (95);
1301     ## XML5: "Tag attribute name before state".
1302     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     }
1304 wakaba 1.1 !!!next-input-character;
1305     redo A;
1306     } elsif ($self->{nc} == 0x0026) { # &
1307     !!!cp (96);
1308 wakaba 1.11 ## XML5: Not defined yet.
1309    
1310 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1311     ## "entity in attribute value state". In this implementation, the
1312     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313     ## implementation of the "consume a character reference" algorithm.
        ## {prev_state} records the state to come back to.  {entity_add}
        ## is set to the quote character of this state - presumably the
        ## "additional allowed character" of that algorithm; confirm
        ## against the ENTITY_STATE code.
1314     $self->{prev_state} = $self->{state};
1315     $self->{entity_add} = 0x0022; # "
1316     $self->{state} = ENTITY_STATE;
1317     !!!next-input-character;
1318     redo A;
1319 wakaba 1.25 } elsif ($self->{is_xml} and
1320     $is_space->{$self->{nc}}) {
        ## In XML mode every white space character in the value is
        ## replaced by a single U+0020 SPACE.
1321     !!!cp (97.1);
1322     $self->{ca}->{value} .= ' ';
1323     ## Stay in the state.
1324     !!!next-input-character;
1325     redo A;
1326 wakaba 1.1 } elsif ($self->{nc} == -1) {
        ## EOF inside the quoted value: report the error, emit the token
        ## collected so far and leave without consuming a character.
        ## Tags return to DATA_STATE; an ATTLIST token returns to
        ## DOCTYPE_INTERNAL_SUBSET_STATE.
1327     !!!parse-error (type => 'unclosed attribute value');
1328     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329     !!!cp (97);
1330     $self->{last_stag_name} = $self->{ct}->{tag_name};
1331 wakaba 1.15
1332     $self->{state} = DATA_STATE;
1333     $self->{s_kwd} = '';
1334     ## reconsume
1335     !!!emit ($self->{ct}); # start tag
1336     redo A;
1337 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339     if ($self->{ct}->{attributes}) {
1340     !!!cp (98);
1341     !!!parse-error (type => 'end tag attribute');
1342     } else {
1343     ## NOTE: This state should never be reached.
1344     !!!cp (99);
1345     }
1346 wakaba 1.15
1347     $self->{state} = DATA_STATE;
1348     $self->{s_kwd} = '';
1349     ## reconsume
1350     !!!emit ($self->{ct}); # end tag
1351     redo A;
1352     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353     ## XML5: No parse error above; not defined yet.
1354     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356     ## Reconsume.
1357     !!!emit ($self->{ct}); # ATTLIST
1358     redo A;
1359 wakaba 1.1 } else {
1360     die "$0: $self->{ct}->{type}: Unknown token type";
1361     }
1362     } else {
1363 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1364 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365     !!!cp (100);
1366     ## XML5: Not a parse error.
1367     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368     } else {
1369     !!!cp (100.1);
1370     }
        ## Append the character, then let {read_until} extend the value.
        ## The character set passed to it lists |"|, |&|, |<|, TAB, FF
        ## and SPACE, i.e. characters that have their own branch above.
        ## NOTE(review): presumably read_until appends the following
        ## characters up to, but not including, any character of that
        ## set - confirm against its implementation.
1371 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1372     $self->{read_until}->($self->{ca}->{value},
1373 wakaba 1.25 qq["&<\x09\x0C\x20],
1374 wakaba 1.1 length $self->{ca}->{value});
1375    
1376     ## Stay in the state
1377     !!!next-input-character;
1378     redo A;
1379     }
1380     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
        ## Single-quoted attribute value state.  Used both for attributes of
        ## start/end tags and for default values in ATTLIST declarations
        ## (current token of type ATTLIST_TOKEN).  Characters are appended
        ## to $self->{ca}->{value} until the closing |'| or EOF.
1381 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382     ## ATTLIST attribute value single quoted state".
1383 wakaba 1.11
1384 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
        ## Closing quote.  For an ATTLIST token the finished definition is
        ## appended to {attrdefs}.
1385 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386     !!!cp (101.1);
1387     ## XML5: "DOCTYPE ATTLIST name after state".
1388     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390     } else {
1391     !!!cp (101);
1392     ## XML5: "Before attribute name state" (sic).
1393     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394     }
1395 wakaba 1.1 !!!next-input-character;
1396     redo A;
1397     } elsif ($self->{nc} == 0x0026) { # &
1398     !!!cp (102);
1399 wakaba 1.11 ## XML5: Not defined yet.
1400    
1401 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1402     ## "entity in attribute value state". In this implementation, the
1403     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404     ## implementation of the "consume a character reference" algorithm.
        ## {prev_state} records the state to come back to.  {entity_add}
        ## is set to the quote character of this state - presumably the
        ## "additional allowed character" of that algorithm; confirm
        ## against the ENTITY_STATE code.
1405     $self->{entity_add} = 0x0027; # '
1406     $self->{prev_state} = $self->{state};
1407     $self->{state} = ENTITY_STATE;
1408     !!!next-input-character;
1409     redo A;
1410 wakaba 1.25 } elsif ($self->{is_xml} and
1411     $is_space->{$self->{nc}}) {
        ## In XML mode every white space character in the value is
        ## replaced by a single U+0020 SPACE.
1412     !!!cp (103.1);
1413     $self->{ca}->{value} .= ' ';
1414     ## Stay in the state.
1415     !!!next-input-character;
1416     redo A;
1417 wakaba 1.1 } elsif ($self->{nc} == -1) {
        ## EOF inside the quoted value: report the error, emit the token
        ## collected so far and leave without consuming a character.
        ## Tags return to DATA_STATE; an ATTLIST token returns to
        ## DOCTYPE_INTERNAL_SUBSET_STATE.
1418     !!!parse-error (type => 'unclosed attribute value');
1419     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420     !!!cp (103);
1421     $self->{last_stag_name} = $self->{ct}->{tag_name};
1422 wakaba 1.15
1423     $self->{state} = DATA_STATE;
1424     $self->{s_kwd} = '';
1425     ## reconsume
1426     !!!emit ($self->{ct}); # start tag
1427     redo A;
1428 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430     if ($self->{ct}->{attributes}) {
1431     !!!cp (104);
1432     !!!parse-error (type => 'end tag attribute');
1433     } else {
1434     ## NOTE: This state should never be reached.
1435     !!!cp (105);
1436     }
1437 wakaba 1.15
1438     $self->{state} = DATA_STATE;
1439     $self->{s_kwd} = '';
1440     ## reconsume
1441     !!!emit ($self->{ct}); # end tag
1442     redo A;
1443     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444     ## XML5: No parse error above; not defined yet.
1445     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447     ## Reconsume.
1448     !!!emit ($self->{ct}); # ATTLIST
1449     redo A;
1450 wakaba 1.1 } else {
1451     die "$0: $self->{ct}->{type}: Unknown token type";
1452     }
1453     } else {
1454 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1455 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456     !!!cp (106);
1457     ## XML5: Not a parse error.
1458     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459     } else {
1460     !!!cp (106.1);
1461     }
        ## Append the character, then let {read_until} extend the value.
        ## The character set passed to it lists |'|, |&|, |<|, TAB, FF
        ## and SPACE, i.e. characters that have their own branch above.
        ## NOTE(review): presumably read_until appends the following
        ## characters up to, but not including, any character of that
        ## set - confirm against its implementation.
1462 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1463     $self->{read_until}->($self->{ca}->{value},
1464 wakaba 1.25 qq['&<\x09\x0C\x20],
1465 wakaba 1.1 length $self->{ca}->{value});
1466    
1467     ## Stay in the state
1468     !!!next-input-character;
1469     redo A;
1470     }
1471     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
        ## Unquoted attribute value state.  Used both for attributes of
        ## start/end tags and for default values in ATTLIST declarations
        ## (current token of type ATTLIST_TOKEN).  Characters are appended
        ## to $self->{ca}->{value} until white space, |>| or EOF.
1472 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1473    
1474 wakaba 1.1 if ($is_space->{$self->{nc}}) {
        ## White space ends the value.  For an ATTLIST token the finished
        ## definition is appended to {attrdefs}.
1475 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476     !!!cp (107.1);
1477     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479     } else {
1480     !!!cp (107);
1481     ## XML5: "Tag attribute name before state".
1482     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483     }
1484 wakaba 1.1 !!!next-input-character;
1485     redo A;
1486     } elsif ($self->{nc} == 0x0026) { # &
1487     !!!cp (108);
1488 wakaba 1.11
1489     ## XML5: Not defined yet.
1490    
1491 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1492     ## "entity in attribute value state". In this implementation, the
1493     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494     ## implementation of the "consume a character reference" algorithm.
        ## {prev_state} records the state to come back to.  {entity_add}
        ## is -1 here, where the quoted states store their quote
        ## character - presumably meaning "no additional allowed
        ## character"; confirm against the ENTITY_STATE code.
1495     $self->{entity_add} = -1;
1496     $self->{prev_state} = $self->{state};
1497     $self->{state} = ENTITY_STATE;
1498     !!!next-input-character;
1499     redo A;
1500     } elsif ($self->{nc} == 0x003E) { # >
        ## |>| ends both the value and the tag (or the ATTLIST
        ## declaration); the current token is emitted.
1501     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502     !!!cp (109);
1503     $self->{last_stag_name} = $self->{ct}->{tag_name};
1504 wakaba 1.15
1505     $self->{state} = DATA_STATE;
1506     $self->{s_kwd} = '';
1507     !!!next-input-character;
1508     !!!emit ($self->{ct}); # start tag
1509     redo A;
1510 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512     if ($self->{ct}->{attributes}) {
1513     !!!cp (110);
1514     !!!parse-error (type => 'end tag attribute');
1515     } else {
1516     ## NOTE: This state should never be reached.
1517     !!!cp (111);
1518     }
1519 wakaba 1.15
1520     $self->{state} = DATA_STATE;
1521     $self->{s_kwd} = '';
1522     !!!next-input-character;
1523     !!!emit ($self->{ct}); # end tag
1524     redo A;
1525     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528     !!!next-input-character;
1529     !!!emit ($self->{ct}); # ATTLIST
1530     redo A;
1531 wakaba 1.1 } else {
1532     die "$0: $self->{ct}->{type}: Unknown token type";
1533     }
1534     } elsif ($self->{nc} == -1) {
        ## EOF inside the value: report the error, emit the token
        ## collected so far and leave without consuming a character.
1535     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536     !!!cp (112);
1537 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1538 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539 wakaba 1.15
1540     $self->{state} = DATA_STATE;
1541     $self->{s_kwd} = '';
1542     ## reconsume
1543     !!!emit ($self->{ct}); # start tag
1544     redo A;
1545 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1547 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548     if ($self->{ct}->{attributes}) {
1549     !!!cp (113);
1550     !!!parse-error (type => 'end tag attribute');
1551     } else {
1552     ## NOTE: This state should never be reached.
1553     !!!cp (114);
1554     }
1555 wakaba 1.15
1556     $self->{state} = DATA_STATE;
1557     $self->{s_kwd} = '';
1558     ## reconsume
1559     !!!emit ($self->{ct}); # end tag
1560     redo A;
1561     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562     !!!parse-error (type => 'unclosed md'); ## TODO: type
1563     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565     ## Reconsume.
1566     !!!emit ($self->{ct}); # ATTLIST
1567     redo A;
1568 wakaba 1.1 } else {
1569     die "$0: $self->{ct}->{type}: Unknown token type";
1570     }
1571     } else {
        ## Any other character is appended to the value.  |"|, |'|, |=|
        ## and |<| are additionally reported as 'bad attribute value'.
1572     if ({
1573     0x0022 => 1, # "
1574     0x0027 => 1, # '
1575     0x003D => 1, # =
1576 wakaba 1.26 0x003C => 1, # <
1577 wakaba 1.1 }->{$self->{nc}}) {
1578     !!!cp (115);
1579 wakaba 1.11 ## XML5: Not a parse error.
1580 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1581     } else {
1582     !!!cp (116);
1583     }
        ## Append the character, then let {read_until} extend the value.
        ## Every character with its own handling in this state must be in
        ## the character set passed to it: the four error characters
        ## above, |&|, white space and |>|.  |<| is included so that a
        ## |<| in the middle of a value (e.g. |a=bc<d|) comes back through
        ## the per-character check above and is reported, instead of being
        ## appended silently by the bulk read.
        ## NOTE(review): presumably read_until appends the following
        ## characters up to, but not including, any character of that
        ## set - confirm against its implementation.
1584     $self->{ca}->{value} .= chr ($self->{nc});
1585     $self->{read_until}->($self->{ca}->{value},
1586 wakaba 1.25 qq["'=& \x09\x0C<>],
1587 wakaba 1.1 length $self->{ca}->{value});
1588    
1589     ## Stay in the state
1590     !!!next-input-character;
1591     redo A;
1592     }
1593     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
        ## After-attribute-value (quoted) state: reached after the closing
        ## quote of an attribute value in a tag.  White space, |>|, |/|
        ## and EOF are expected; anything else is reported as a missing
        ## space and handed to the before-attribute-name state.
1594     if ($is_space->{$self->{nc}}) {
1595     !!!cp (118);
1596     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1597     !!!next-input-character;
1598     redo A;
1599     } elsif ($self->{nc} == 0x003E) { # >
        ## End of the tag: remember the start tag name, or reset the
        ## content model for an end tag, then emit the token.
1600     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601     !!!cp (119);
1602     $self->{last_stag_name} = $self->{ct}->{tag_name};
1603     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605     if ($self->{ct}->{attributes}) {
1606     !!!cp (120);
1607     !!!parse-error (type => 'end tag attribute');
1608     } else {
1609     ## NOTE: This state should never be reached.
1610     !!!cp (121);
1611     }
1612     } else {
1613     die "$0: $self->{ct}->{type}: Unknown token type";
1614     }
1615     $self->{state} = DATA_STATE;
1616 wakaba 1.5 $self->{s_kwd} = '';
1617 wakaba 1.1 !!!next-input-character;
1618    
1619     !!!emit ($self->{ct}); # start tag or end tag
1620    
1621     redo A;
1622     } elsif ($self->{nc} == 0x002F) { # /
1623     !!!cp (122);
1624     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625     !!!next-input-character;
1626     redo A;
1627     } elsif ($self->{nc} == -1) {
        ## EOF inside the tag: report 'unclosed tag', emit the token and
        ## switch to DATA_STATE without consuming another character.
        ## NOTE(review): unlike the other EOF branches for end tags in this
        ## file section, {content_model} is not reset to
        ## PCDATA_CONTENT_MODEL here - confirm whether that is intended.
1628     !!!parse-error (type => 'unclosed tag');
1629     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1630     !!!cp (122.3);
1631     $self->{last_stag_name} = $self->{ct}->{tag_name};
1632     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1633     if ($self->{ct}->{attributes}) {
1634     !!!cp (122.1);
1635     !!!parse-error (type => 'end tag attribute');
1636     } else {
1637     ## NOTE: This state should never be reached.
1638     !!!cp (122.2);
1639     }
1640     } else {
1641     die "$0: $self->{ct}->{type}: Unknown token type";
1642     }
1643     $self->{state} = DATA_STATE;
1644 wakaba 1.5 $self->{s_kwd} = '';
1645 wakaba 1.1 ## Reconsume.
1646     !!!emit ($self->{ct}); # start tag or end tag
1647     redo A;
1648     } else {
        ## Another attribute follows the closing quote directly: report
        ## the error and let BEFORE_ATTRIBUTE_NAME_STATE process the same
        ## character.
1649     !!!cp ('124.1');
1650     !!!parse-error (type => 'no space between attributes');
1651     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1652     ## reconsume
1653     redo A;
1654     }
1655     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
        ## Self-closing start tag state: reached after |/| inside a tag.
        ## |>| completes the tag, EOF ends it with an error, and any other
        ## character is reported and handed to the before-attribute-name
        ## state.
1656 wakaba 1.11 ## XML5: "Empty tag state".
1657    
1658 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
        ## For an end tag the trailing |/| is an error ('nestc'); for any
        ## other token type {self_closing} is set on the tokenizer.
1659     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     !!!cp ('124.2');
1661     !!!parse-error (type => 'nestc', token => $self->{ct});
1662     ## TODO: Different type than slash in start tag
1663     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664     if ($self->{ct}->{attributes}) {
1665     !!!cp ('124.4');
1666     !!!parse-error (type => 'end tag attribute');
1667     } else {
1668     !!!cp ('124.5');
1669     }
1670     ## TODO: Test |<title></title/>|
1671     } else {
1672     !!!cp ('124.3');
1673     $self->{self_closing} = 1;
1674     }
1675    
1676     $self->{state} = DATA_STATE;
1677 wakaba 1.5 $self->{s_kwd} = '';
1678 wakaba 1.1 !!!next-input-character;
1679    
1680     !!!emit ($self->{ct}); # start tag or end tag
1681    
1682     redo A;
1683     } elsif ($self->{nc} == -1) {
        ## EOF after |/|: report 'unclosed tag', emit the token and switch
        ## to DATA_STATE without consuming another character.
        ## NOTE(review): {content_model} is not reset for end tags in this
        ## branch, and the checkpoint number 124.5 below is also used in
        ## the |>| branch above - confirm whether either is intended.
1684     !!!parse-error (type => 'unclosed tag');
1685     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1686     !!!cp (124.7);
1687     $self->{last_stag_name} = $self->{ct}->{tag_name};
1688     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1689     if ($self->{ct}->{attributes}) {
1690     !!!cp (124.5);
1691     !!!parse-error (type => 'end tag attribute');
1692     } else {
1693     ## NOTE: This state should never be reached.
1694     !!!cp (124.6);
1695     }
1696     } else {
1697     die "$0: $self->{ct}->{type}: Unknown token type";
1698     }
1699 wakaba 1.11 ## XML5: "Tag attribute name before state".
1700 wakaba 1.1 $self->{state} = DATA_STATE;
1701 wakaba 1.5 $self->{s_kwd} = '';
1702 wakaba 1.1 ## Reconsume.
1703     !!!emit ($self->{ct}); # start tag or end tag
1704     redo A;
1705     } else {
        ## |/| not followed by |>|: report the error and let
        ## BEFORE_ATTRIBUTE_NAME_STATE process the same character.
        ## NOTE(review): checkpoint '124.4' is also used in the |>| branch
        ## above.
1706     !!!cp ('124.4');
1707     !!!parse-error (type => 'nestc');
1708     ## TODO: This error type is wrong.
1709     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1710     ## Reconsume.
1711     redo A;
1712     }
1713     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
        ## Bogus comment state: collects everything up to the next |>| or
        ## EOF into the data of the current comment token, then emits it.
        ## When {in_subset} is set the tokenizer returns to
        ## DOCTYPE_INTERNAL_SUBSET_STATE, otherwise to DATA_STATE.
1714 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1715    
1716 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1717     ## consumes characters one-by-one basis.
1718    
1719     if ($self->{nc} == 0x003E) { # >
1720 wakaba 1.13 if ($self->{in_subset}) {
1721     !!!cp (123);
1722     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1723     } else {
1724     !!!cp (124);
1725     $self->{state} = DATA_STATE;
1726     $self->{s_kwd} = '';
1727     }
1728 wakaba 1.1 !!!next-input-character;
1729    
1730     !!!emit ($self->{ct}); # comment
1731     redo A;
1732     } elsif ($self->{nc} == -1) {
        ## EOF ends the comment as well; no parse error is reported here
        ## and no character is consumed.
1733 wakaba 1.13 if ($self->{in_subset}) {
1734     !!!cp (125.1);
1735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1736     } else {
1737     !!!cp (125);
1738     $self->{state} = DATA_STATE;
1739     $self->{s_kwd} = '';
1740     }
1741 wakaba 1.1 ## reconsume
1742    
1743     !!!emit ($self->{ct}); # comment
1744     redo A;
1745     } else {
        ## Append the character, then let {read_until} extend the comment
        ## data; |>| is the only character in the set passed to it.
        ## NOTE(review): presumably read_until appends the following
        ## characters up to, but not including, |>| - confirm against its
        ## implementation.
1746     !!!cp (126);
1747     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748     $self->{read_until}->($self->{ct}->{data},
1749     q[>],
1750     length $self->{ct}->{data});
1751    
1752     ## Stay in the state.
1753     !!!next-input-character;
1754     redo A;
1755     }
1756     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
        ## Markup declaration open state.  Dispatches on the next character:
        ## |-| may start a comment, |D|/|d| may start a DOCTYPE, and |[|
        ## may start a CDATA section (only in foreign content or in XML
        ## mode).  Anything else begins a bogus comment.
1757 wakaba 1.14 ## XML5: "Markup declaration state".
1758 wakaba 1.1
1759     if ($self->{nc} == 0x002D) { # -
1760     !!!cp (133);
1761     $self->{state} = MD_HYPHEN_STATE;
1762     !!!next-input-character;
1763     redo A;
1764     } elsif ($self->{nc} == 0x0044 or # D
1765     $self->{nc} == 0x0064) { # d
        ## The character actually read is kept in {kwd}.
1766     ## ASCII case-insensitive.
1767     !!!cp (130);
1768     $self->{state} = MD_DOCTYPE_STATE;
1769 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1770 wakaba 1.1 !!!next-input-character;
1771     redo A;
1772 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1773     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1774     $self->{is_xml}) and
1775 wakaba 1.1 $self->{nc} == 0x005B) { # [
        ## |[| is accepted when {is_xml} is set, or when the insertion
        ## mode has the IN_FOREIGN_CONTENT_IM bit and the last entry of
        ## {open_elements} has the FOREIGN_EL bit.
1776     !!!cp (135.4);
1777     $self->{state} = MD_CDATA_STATE;
1778 wakaba 1.12 $self->{kwd} = '[';
1779 wakaba 1.1 !!!next-input-character;
1780     redo A;
1781     } else {
1782     !!!cp (136);
1783     }
1784    
        ## Only the final |else| above falls through to here: report the
        ## error, create an empty comment token and let
        ## BOGUS_COMMENT_STATE process the same character.  The error and
        ## the token share the same position, one column before
        ## {column_prev}.
1785     !!!parse-error (type => 'bogus comment',
1786     line => $self->{line_prev},
1787     column => $self->{column_prev} - 1);
1788     ## Reconsume.
1789     $self->{state} = BOGUS_COMMENT_STATE;
1790     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1791     line => $self->{line_prev},
1792     column => $self->{column_prev} - 1,
1793     };
1794     redo A;
1795     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1796     if ($self->{nc} == 0x002D) { # -
1797     !!!cp (127);
1798     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799     line => $self->{line_prev},
1800     column => $self->{column_prev} - 2,
1801     };
1802 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1803 wakaba 1.1 !!!next-input-character;
1804     redo A;
1805     } else {
1806     !!!cp (128);
1807     !!!parse-error (type => 'bogus comment',
1808     line => $self->{line_prev},
1809     column => $self->{column_prev} - 2);
1810     $self->{state} = BOGUS_COMMENT_STATE;
1811     ## Reconsume.
1812     $self->{ct} = {type => COMMENT_TOKEN,
1813     data => '-',
1814     line => $self->{line_prev},
1815     column => $self->{column_prev} - 2,
1816     };
1817     redo A;
1818     }
1819     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1820     ## ASCII case-insensitive.
1821     if ($self->{nc} == [
1822     undef,
1823     0x004F, # O
1824     0x0043, # C
1825     0x0054, # T
1826     0x0059, # Y
1827     0x0050, # P
1828 wakaba 1.12 ]->[length $self->{kwd}] or
1829 wakaba 1.1 $self->{nc} == [
1830     undef,
1831     0x006F, # o
1832     0x0063, # c
1833     0x0074, # t
1834     0x0079, # y
1835     0x0070, # p
1836 wakaba 1.12 ]->[length $self->{kwd}]) {
1837 wakaba 1.1 !!!cp (131);
1838     ## Stay in the state.
1839 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1840 wakaba 1.1 !!!next-input-character;
1841     redo A;
1842 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1843 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1844     $self->{nc} == 0x0065)) { # e
1845 wakaba 1.12 if ($self->{is_xml} and
1846     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1847 wakaba 1.10 !!!cp (129);
1848     ## XML5: case-sensitive.
1849     !!!parse-error (type => 'lowercase keyword', ## TODO
1850     text => 'DOCTYPE',
1851     line => $self->{line_prev},
1852     column => $self->{column_prev} - 5);
1853     } else {
1854     !!!cp (129.1);
1855     }
1856 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1857     $self->{ct} = {type => DOCTYPE_TOKEN,
1858     quirks => 1,
1859     line => $self->{line_prev},
1860     column => $self->{column_prev} - 7,
1861     };
1862     !!!next-input-character;
1863     redo A;
1864     } else {
1865     !!!cp (132);
1866     !!!parse-error (type => 'bogus comment',
1867     line => $self->{line_prev},
1868 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1869 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1870     ## Reconsume.
1871     $self->{ct} = {type => COMMENT_TOKEN,
1872 wakaba 1.12 data => $self->{kwd},
1873 wakaba 1.1 line => $self->{line_prev},
1874 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1875 wakaba 1.1 };
1876     redo A;
1877     }
1878     } elsif ($self->{state} == MD_CDATA_STATE) {
1879     if ($self->{nc} == {
1880     '[' => 0x0043, # C
1881     '[C' => 0x0044, # D
1882     '[CD' => 0x0041, # A
1883     '[CDA' => 0x0054, # T
1884     '[CDAT' => 0x0041, # A
1885 wakaba 1.12 }->{$self->{kwd}}) {
1886 wakaba 1.1 !!!cp (135.1);
1887     ## Stay in the state.
1888 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1889 wakaba 1.1 !!!next-input-character;
1890     redo A;
1891 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1892 wakaba 1.1 $self->{nc} == 0x005B) { # [
1893 wakaba 1.6 if ($self->{is_xml} and
1894     not $self->{tainted} and
1895     @{$self->{open_elements} or []} == 0) {
1896 wakaba 1.8 !!!cp (135.2);
1897 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1898     line => $self->{line_prev},
1899     column => $self->{column_prev} - 7);
1900     $self->{tainted} = 1;
1901 wakaba 1.8 } else {
1902     !!!cp (135.21);
1903 wakaba 1.6 }
1904    
1905 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1906     data => '',
1907     line => $self->{line_prev},
1908     column => $self->{column_prev} - 7};
1909     $self->{state} = CDATA_SECTION_STATE;
1910     !!!next-input-character;
1911     redo A;
1912     } else {
1913     !!!cp (135.3);
1914     !!!parse-error (type => 'bogus comment',
1915     line => $self->{line_prev},
1916 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1917 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1918     ## Reconsume.
1919     $self->{ct} = {type => COMMENT_TOKEN,
1920 wakaba 1.12 data => $self->{kwd},
1921 wakaba 1.1 line => $self->{line_prev},
1922 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1923 wakaba 1.1 };
1924     redo A;
1925     }
1926     } elsif ($self->{state} == COMMENT_START_STATE) {
1927     if ($self->{nc} == 0x002D) { # -
1928     !!!cp (137);
1929     $self->{state} = COMMENT_START_DASH_STATE;
1930     !!!next-input-character;
1931     redo A;
1932     } elsif ($self->{nc} == 0x003E) { # >
1933     !!!parse-error (type => 'bogus comment');
1934 wakaba 1.13 if ($self->{in_subset}) {
1935     !!!cp (138.1);
1936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937     } else {
1938     !!!cp (138);
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     }
1942 wakaba 1.1 !!!next-input-character;
1943    
1944     !!!emit ($self->{ct}); # comment
1945    
1946     redo A;
1947     } elsif ($self->{nc} == -1) {
1948     !!!parse-error (type => 'unclosed comment');
1949 wakaba 1.13 if ($self->{in_subset}) {
1950     !!!cp (139.1);
1951     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1952     } else {
1953     !!!cp (139);
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     }
1957 wakaba 1.1 ## reconsume
1958    
1959     !!!emit ($self->{ct}); # comment
1960    
1961     redo A;
1962     } else {
1963     !!!cp (140);
1964     $self->{ct}->{data} # comment
1965     .= chr ($self->{nc});
1966     $self->{state} = COMMENT_STATE;
1967     !!!next-input-character;
1968     redo A;
1969     }
1970     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1971     if ($self->{nc} == 0x002D) { # -
1972     !!!cp (141);
1973     $self->{state} = COMMENT_END_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     } elsif ($self->{nc} == 0x003E) { # >
1977     !!!parse-error (type => 'bogus comment');
1978 wakaba 1.13 if ($self->{in_subset}) {
1979     !!!cp (142.1);
1980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981     } else {
1982     !!!cp (142);
1983     $self->{state} = DATA_STATE;
1984     $self->{s_kwd} = '';
1985     }
1986 wakaba 1.1 !!!next-input-character;
1987    
1988     !!!emit ($self->{ct}); # comment
1989    
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (143.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (143);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (144);
2008     $self->{ct}->{data} # comment
2009     .= '-' . chr ($self->{nc});
2010     $self->{state} = COMMENT_STATE;
2011     !!!next-input-character;
2012     redo A;
2013     }
2014     } elsif ($self->{state} == COMMENT_STATE) {
2015 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2016    
2017 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2018     !!!cp (145);
2019     $self->{state} = COMMENT_END_DASH_STATE;
2020     !!!next-input-character;
2021     redo A;
2022     } elsif ($self->{nc} == -1) {
2023     !!!parse-error (type => 'unclosed comment');
2024 wakaba 1.13 if ($self->{in_subset}) {
2025     !!!cp (146.1);
2026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2027     } else {
2028     !!!cp (146);
2029     $self->{state} = DATA_STATE;
2030     $self->{s_kwd} = '';
2031     }
2032 wakaba 1.1 ## reconsume
2033    
2034     !!!emit ($self->{ct}); # comment
2035    
2036     redo A;
2037     } else {
2038     !!!cp (147);
2039     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2040     $self->{read_until}->($self->{ct}->{data},
2041     q[-],
2042     length $self->{ct}->{data});
2043    
2044     ## Stay in the state
2045     !!!next-input-character;
2046     redo A;
2047     }
2048     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2049 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2050 wakaba 1.10
2051 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2052     !!!cp (148);
2053     $self->{state} = COMMENT_END_STATE;
2054     !!!next-input-character;
2055     redo A;
2056     } elsif ($self->{nc} == -1) {
2057     !!!parse-error (type => 'unclosed comment');
2058 wakaba 1.13 if ($self->{in_subset}) {
2059     !!!cp (149.1);
2060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061     } else {
2062     !!!cp (149);
2063     $self->{state} = DATA_STATE;
2064     $self->{s_kwd} = '';
2065     }
2066 wakaba 1.1 ## reconsume
2067    
2068     !!!emit ($self->{ct}); # comment
2069    
2070     redo A;
2071     } else {
2072     !!!cp (150);
2073     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2074     $self->{state} = COMMENT_STATE;
2075     !!!next-input-character;
2076     redo A;
2077     }
2078     } elsif ($self->{state} == COMMENT_END_STATE) {
2079 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2080    
2081 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2082 wakaba 1.13 if ($self->{in_subset}) {
2083     !!!cp (151.1);
2084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085     } else {
2086     !!!cp (151);
2087     $self->{state} = DATA_STATE;
2088     $self->{s_kwd} = '';
2089     }
2090 wakaba 1.1 !!!next-input-character;
2091    
2092     !!!emit ($self->{ct}); # comment
2093    
2094     redo A;
2095     } elsif ($self->{nc} == 0x002D) { # -
2096     !!!cp (152);
2097 wakaba 1.10 ## XML5: Not a parse error.
2098 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2099     line => $self->{line_prev},
2100     column => $self->{column_prev});
2101     $self->{ct}->{data} .= '-'; # comment
2102     ## Stay in the state
2103     !!!next-input-character;
2104     redo A;
2105     } elsif ($self->{nc} == -1) {
2106     !!!parse-error (type => 'unclosed comment');
2107 wakaba 1.13 if ($self->{in_subset}) {
2108     !!!cp (153.1);
2109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2110     } else {
2111     !!!cp (153);
2112     $self->{state} = DATA_STATE;
2113     $self->{s_kwd} = '';
2114     }
2115 wakaba 1.1 ## reconsume
2116    
2117     !!!emit ($self->{ct}); # comment
2118    
2119     redo A;
2120     } else {
2121     !!!cp (154);
2122     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2123     $self->{state} = COMMENT_STATE;
2124     !!!next-input-character;
2125     redo A;
2126     }
2127     } elsif ($self->{state} == DOCTYPE_STATE) {
2128     if ($is_space->{$self->{nc}}) {
2129     !!!cp (155);
2130     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2131     !!!next-input-character;
2132     redo A;
2133     } else {
2134     !!!cp (156);
2135 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2136 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2137     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2138     ## reconsume
2139     redo A;
2140     }
2141     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2142 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2143    
2144 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2145     !!!cp (157);
2146     ## Stay in the state
2147     !!!next-input-character;
2148     redo A;
2149     } elsif ($self->{nc} == 0x003E) { # >
2150     !!!cp (158);
2151 wakaba 1.12 ## XML5: No parse error.
2152 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2153     $self->{state} = DATA_STATE;
2154 wakaba 1.5 $self->{s_kwd} = '';
2155 wakaba 1.1 !!!next-input-character;
2156    
2157     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2158    
2159     redo A;
2160     } elsif ($self->{nc} == -1) {
2161     !!!cp (159);
2162     !!!parse-error (type => 'no DOCTYPE name');
2163     $self->{state} = DATA_STATE;
2164 wakaba 1.5 $self->{s_kwd} = '';
2165 wakaba 1.1 ## reconsume
2166    
2167     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2168    
2169     redo A;
2170 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2171     !!!cp (159.1);
2172     !!!parse-error (type => 'no DOCTYPE name');
2173     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2174 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2175     $self->{in_subset} = 1;
2176 wakaba 1.12 !!!next-input-character;
2177 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2178 wakaba 1.12 redo A;
2179 wakaba 1.1 } else {
2180     !!!cp (160);
2181     $self->{ct}->{name} = chr $self->{nc};
2182     delete $self->{ct}->{quirks};
2183     $self->{state} = DOCTYPE_NAME_STATE;
2184     !!!next-input-character;
2185     redo A;
2186     }
2187     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2188 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2189    
2190     ## ISSUE: Redundant "First," in the spec.
2191    
2192 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2193     !!!cp (161);
2194     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2195     !!!next-input-character;
2196     redo A;
2197     } elsif ($self->{nc} == 0x003E) { # >
2198     !!!cp (162);
2199     $self->{state} = DATA_STATE;
2200 wakaba 1.5 $self->{s_kwd} = '';
2201 wakaba 1.1 !!!next-input-character;
2202    
2203     !!!emit ($self->{ct}); # DOCTYPE
2204    
2205     redo A;
2206     } elsif ($self->{nc} == -1) {
2207     !!!cp (163);
2208     !!!parse-error (type => 'unclosed DOCTYPE');
2209     $self->{state} = DATA_STATE;
2210 wakaba 1.5 $self->{s_kwd} = '';
2211 wakaba 1.1 ## reconsume
2212    
2213     $self->{ct}->{quirks} = 1;
2214     !!!emit ($self->{ct}); # DOCTYPE
2215    
2216     redo A;
2217 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2218     !!!cp (163.1);
2219     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2220 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2221     $self->{in_subset} = 1;
2222 wakaba 1.12 !!!next-input-character;
2223 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2224 wakaba 1.12 redo A;
2225 wakaba 1.1 } else {
2226     !!!cp (164);
2227     $self->{ct}->{name}
2228     .= chr ($self->{nc}); # DOCTYPE
2229     ## Stay in the state
2230     !!!next-input-character;
2231     redo A;
2232     }
2233     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2234 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2235     ## state", but implemented differently.
2236    
2237 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2238     !!!cp (165);
2239     ## Stay in the state
2240     !!!next-input-character;
2241     redo A;
2242     } elsif ($self->{nc} == 0x003E) { # >
2243 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2244     !!!cp (166);
2245     $self->{state} = DATA_STATE;
2246     $self->{s_kwd} = '';
2247     } else {
2248     !!!cp (166.1);
2249     !!!parse-error (type => 'no md def'); ## TODO: type
2250     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2251     }
2252    
2253 wakaba 1.1 !!!next-input-character;
2254 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2255 wakaba 1.1 redo A;
2256     } elsif ($self->{nc} == -1) {
2257 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2258     !!!cp (167);
2259     !!!parse-error (type => 'unclosed DOCTYPE');
2260     $self->{state} = DATA_STATE;
2261     $self->{s_kwd} = '';
2262     $self->{ct}->{quirks} = 1;
2263     } else {
2264     !!!cp (167.12);
2265     !!!parse-error (type => 'unclosed md'); ## TODO: type
2266     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2267     }
2268    
2269     ## Reconsume.
2270     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2271 wakaba 1.1 redo A;
2272     } elsif ($self->{nc} == 0x0050 or # P
2273     $self->{nc} == 0x0070) { # p
2274 wakaba 1.12 !!!cp (167.1);
2275 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2276 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2277 wakaba 1.1 !!!next-input-character;
2278     redo A;
2279     } elsif ($self->{nc} == 0x0053 or # S
2280     $self->{nc} == 0x0073) { # s
2281 wakaba 1.12 !!!cp (167.2);
2282 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2283 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2284     !!!next-input-character;
2285     redo A;
2286 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2287     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2288     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2289     !!!cp (167.21);
2290     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2291     $self->{ct}->{value} = ''; # ENTITY
2292     !!!next-input-character;
2293     redo A;
2294     } elsif ($self->{nc} == 0x0027 and # '
2295     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2296     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2297     !!!cp (167.22);
2298     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2299     $self->{ct}->{value} = ''; # ENTITY
2300     !!!next-input-character;
2301     redo A;
2302 wakaba 1.16 } elsif ($self->{is_xml} and
2303     $self->{ct}->{type} == DOCTYPE_TOKEN and
2304     $self->{nc} == 0x005B) { # [
2305 wakaba 1.12 !!!cp (167.3);
2306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2307     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2308 wakaba 1.13 $self->{in_subset} = 1;
2309 wakaba 1.1 !!!next-input-character;
2310 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2311 wakaba 1.1 redo A;
2312     } else {
2313 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2314    
2315     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2316     !!!cp (180);
2317     $self->{ct}->{quirks} = 1;
2318     $self->{state} = BOGUS_DOCTYPE_STATE;
2319     } else {
2320     !!!cp (180.1);
2321     $self->{state} = BOGUS_MD_STATE;
2322     }
2323 wakaba 1.1
2324     !!!next-input-character;
2325     redo A;
2326     }
2327     } elsif ($self->{state} == PUBLIC_STATE) {
2328     ## ASCII case-insensitive
2329     if ($self->{nc} == [
2330     undef,
2331     0x0055, # U
2332     0x0042, # B
2333     0x004C, # L
2334     0x0049, # I
2335 wakaba 1.12 ]->[length $self->{kwd}] or
2336 wakaba 1.1 $self->{nc} == [
2337     undef,
2338     0x0075, # u
2339     0x0062, # b
2340     0x006C, # l
2341     0x0069, # i
2342 wakaba 1.12 ]->[length $self->{kwd}]) {
2343 wakaba 1.1 !!!cp (175);
2344     ## Stay in the state.
2345 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2346 wakaba 1.1 !!!next-input-character;
2347     redo A;
2348 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2349 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2350     $self->{nc} == 0x0063)) { # c
2351 wakaba 1.12 if ($self->{is_xml} and
2352     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2353     !!!cp (168.1);
2354     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2355     text => 'PUBLIC',
2356     line => $self->{line_prev},
2357     column => $self->{column_prev} - 4);
2358     } else {
2359     !!!cp (168);
2360     }
2361 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2362     !!!next-input-character;
2363     redo A;
2364     } else {
2365 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2366 wakaba 1.1 line => $self->{line_prev},
2367 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2368 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2369     !!!cp (169);
2370     $self->{ct}->{quirks} = 1;
2371     $self->{state} = BOGUS_DOCTYPE_STATE;
2372     } else {
2373     !!!cp (169.1);
2374     $self->{state} = BOGUS_MD_STATE;
2375     }
2376 wakaba 1.1 ## Reconsume.
2377     redo A;
2378     }
2379     } elsif ($self->{state} == SYSTEM_STATE) {
2380     ## ASCII case-insensitive
2381     if ($self->{nc} == [
2382     undef,
2383     0x0059, # Y
2384     0x0053, # S
2385     0x0054, # T
2386     0x0045, # E
2387 wakaba 1.12 ]->[length $self->{kwd}] or
2388 wakaba 1.1 $self->{nc} == [
2389     undef,
2390     0x0079, # y
2391     0x0073, # s
2392     0x0074, # t
2393     0x0065, # e
2394 wakaba 1.12 ]->[length $self->{kwd}]) {
2395 wakaba 1.1 !!!cp (170);
2396     ## Stay in the state.
2397 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2398 wakaba 1.1 !!!next-input-character;
2399     redo A;
2400 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2401 wakaba 1.1 ($self->{nc} == 0x004D or # M
2402     $self->{nc} == 0x006D)) { # m
2403 wakaba 1.12 if ($self->{is_xml} and
2404     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2405     !!!cp (171.1);
2406     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2407     text => 'SYSTEM',
2408     line => $self->{line_prev},
2409     column => $self->{column_prev} - 4);
2410     } else {
2411     !!!cp (171);
2412     }
2413 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2414     !!!next-input-character;
2415     redo A;
2416     } else {
2417 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2418 wakaba 1.1 line => $self->{line_prev},
2419 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2420 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2421     !!!cp (172);
2422     $self->{ct}->{quirks} = 1;
2423     $self->{state} = BOGUS_DOCTYPE_STATE;
2424     } else {
2425     !!!cp (172.1);
2426     $self->{state} = BOGUS_MD_STATE;
2427     }
2428 wakaba 1.1 ## Reconsume.
2429     redo A;
2430     }
2431     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2432     if ($is_space->{$self->{nc}}) {
2433     !!!cp (181);
2434     ## Stay in the state
2435     !!!next-input-character;
2436     redo A;
2437     } elsif ($self->{nc} eq 0x0022) { # "
2438     !!!cp (182);
2439     $self->{ct}->{pubid} = ''; # DOCTYPE
2440     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2441     !!!next-input-character;
2442     redo A;
2443     } elsif ($self->{nc} eq 0x0027) { # '
2444     !!!cp (183);
2445     $self->{ct}->{pubid} = ''; # DOCTYPE
2446     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2447     !!!next-input-character;
2448     redo A;
2449     } elsif ($self->{nc} eq 0x003E) { # >
2450     !!!parse-error (type => 'no PUBLIC literal');
2451 wakaba 1.16
2452     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2453     !!!cp (184);
2454     $self->{state} = DATA_STATE;
2455     $self->{s_kwd} = '';
2456     $self->{ct}->{quirks} = 1;
2457     } else {
2458     !!!cp (184.1);
2459     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2460     }
2461    
2462 wakaba 1.1 !!!next-input-character;
2463 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2464 wakaba 1.1 redo A;
2465     } elsif ($self->{nc} == -1) {
2466 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2467     !!!cp (185);
2468     !!!parse-error (type => 'unclosed DOCTYPE');
2469     $self->{state} = DATA_STATE;
2470     $self->{s_kwd} = '';
2471     $self->{ct}->{quirks} = 1;
2472     } else {
2473     !!!cp (185.1);
2474     !!!parse-error (type => 'unclosed md'); ## TODO: type
2475     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2476     }
2477    
2478 wakaba 1.1 ## reconsume
2479     !!!emit ($self->{ct}); # DOCTYPE
2480     redo A;
2481 wakaba 1.16 } elsif ($self->{is_xml} and
2482     $self->{ct}->{type} == DOCTYPE_TOKEN and
2483     $self->{nc} == 0x005B) { # [
2484 wakaba 1.12 !!!cp (186.1);
2485     !!!parse-error (type => 'no PUBLIC literal');
2486     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2487     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2488 wakaba 1.13 $self->{in_subset} = 1;
2489 wakaba 1.12 !!!next-input-character;
2490 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2491 wakaba 1.12 redo A;
2492 wakaba 1.1 } else {
2493     !!!parse-error (type => 'string after PUBLIC');
2494    
2495 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2496     !!!cp (186);
2497     $self->{ct}->{quirks} = 1;
2498     $self->{state} = BOGUS_DOCTYPE_STATE;
2499     } else {
2500     !!!cp (186.2);
2501     $self->{state} = BOGUS_MD_STATE;
2502     }
2503    
2504 wakaba 1.1 !!!next-input-character;
2505     redo A;
2506     }
2507     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2508     if ($self->{nc} == 0x0022) { # "
2509     !!!cp (187);
2510     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2511     !!!next-input-character;
2512     redo A;
2513     } elsif ($self->{nc} == 0x003E) { # >
2514     !!!parse-error (type => 'unclosed PUBLIC literal');
2515    
2516 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2517     !!!cp (188);
2518     $self->{state} = DATA_STATE;
2519     $self->{s_kwd} = '';
2520     $self->{ct}->{quirks} = 1;
2521     } else {
2522     !!!cp (188.1);
2523     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2524     }
2525    
2526 wakaba 1.1 !!!next-input-character;
2527 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2528 wakaba 1.1 redo A;
2529     } elsif ($self->{nc} == -1) {
2530     !!!parse-error (type => 'unclosed PUBLIC literal');
2531    
2532 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2533     !!!cp (189);
2534     $self->{state} = DATA_STATE;
2535     $self->{s_kwd} = '';
2536     $self->{ct}->{quirks} = 1;
2537     } else {
2538     !!!cp (189.1);
2539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2540     }
2541    
2542     ## Reconsume.
2543 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2544     redo A;
2545     } else {
2546     !!!cp (190);
2547 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2548 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2549     length $self->{ct}->{pubid});
2550    
2551     ## Stay in the state
2552     !!!next-input-character;
2553     redo A;
2554     }
2555     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2556     if ($self->{nc} == 0x0027) { # '
2557     !!!cp (191);
2558     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2559     !!!next-input-character;
2560     redo A;
2561     } elsif ($self->{nc} == 0x003E) { # >
2562     !!!parse-error (type => 'unclosed PUBLIC literal');
2563    
2564 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2565     !!!cp (192);
2566     $self->{state} = DATA_STATE;
2567     $self->{s_kwd} = '';
2568     $self->{ct}->{quirks} = 1;
2569     } else {
2570     !!!cp (192.1);
2571     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2572     }
2573    
2574 wakaba 1.1 !!!next-input-character;
2575 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2576 wakaba 1.1 redo A;
2577     } elsif ($self->{nc} == -1) {
2578     !!!parse-error (type => 'unclosed PUBLIC literal');
2579    
2580 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2581     !!!cp (193);
2582     $self->{state} = DATA_STATE;
2583     $self->{s_kwd} = '';
2584     $self->{ct}->{quirks} = 1;
2585     } else {
2586     !!!cp (193.1);
2587     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2588     }
2589    
2590 wakaba 1.1 ## reconsume
2591 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2592 wakaba 1.1 redo A;
2593     } else {
2594     !!!cp (194);
2595 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2596 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2597     length $self->{ct}->{pubid});
2598    
2599     ## Stay in the state
2600     !!!next-input-character;
2601     redo A;
2602     }
2603     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2604     if ($is_space->{$self->{nc}}) {
2605     !!!cp (195);
2606     ## Stay in the state
2607     !!!next-input-character;
2608     redo A;
2609     } elsif ($self->{nc} == 0x0022) { # "
2610     !!!cp (196);
2611 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2612 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2613     !!!next-input-character;
2614     redo A;
2615     } elsif ($self->{nc} == 0x0027) { # '
2616     !!!cp (197);
2617 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2618 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2619     !!!next-input-character;
2620     redo A;
2621     } elsif ($self->{nc} == 0x003E) { # >
2622 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2623     if ($self->{is_xml}) {
2624     !!!cp (198.1);
2625     !!!parse-error (type => 'no SYSTEM literal');
2626     } else {
2627     !!!cp (198);
2628     }
2629     $self->{state} = DATA_STATE;
2630     $self->{s_kwd} = '';
2631 wakaba 1.12 } else {
2632 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2633     !!!cp (198.2);
2634     } else {
2635     !!!cp (198.3);
2636     !!!parse-error (type => 'no SYSTEM literal');
2637     }
2638     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2639 wakaba 1.12 }
2640 wakaba 1.16
2641 wakaba 1.1 !!!next-input-character;
2642 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2643 wakaba 1.1 redo A;
2644     } elsif ($self->{nc} == -1) {
2645 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2646     !!!cp (199);
2647     !!!parse-error (type => 'unclosed DOCTYPE');
2648    
2649     $self->{state} = DATA_STATE;
2650     $self->{s_kwd} = '';
2651     $self->{ct}->{quirks} = 1;
2652     } else {
2653     !!!parse-error (type => 'unclosed md'); ## TODO: type
2654     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2655     }
2656    
2657 wakaba 1.1 ## reconsume
2658 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2659 wakaba 1.1 redo A;
2660 wakaba 1.16 } elsif ($self->{is_xml} and
2661     $self->{ct}->{type} == DOCTYPE_TOKEN and
2662     $self->{nc} == 0x005B) { # [
2663 wakaba 1.12 !!!cp (200.1);
2664     !!!parse-error (type => 'no SYSTEM literal');
2665     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2666     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2667 wakaba 1.13 $self->{in_subset} = 1;
2668 wakaba 1.12 !!!next-input-character;
2669 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2670 wakaba 1.12 redo A;
2671 wakaba 1.1 } else {
2672     !!!parse-error (type => 'string after PUBLIC literal');
2673    
2674 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2675     !!!cp (200);
2676     $self->{ct}->{quirks} = 1;
2677     $self->{state} = BOGUS_DOCTYPE_STATE;
2678     } else {
2679     !!!cp (200.2);
2680     $self->{state} = BOGUS_MD_STATE;
2681     }
2682    
2683 wakaba 1.1 !!!next-input-character;
2684     redo A;
2685     }
2686     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2687     if ($is_space->{$self->{nc}}) {
2688     !!!cp (201);
2689     ## Stay in the state
2690     !!!next-input-character;
2691     redo A;
2692     } elsif ($self->{nc} == 0x0022) { # "
2693     !!!cp (202);
2694     $self->{ct}->{sysid} = ''; # DOCTYPE
2695     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2696     !!!next-input-character;
2697     redo A;
2698     } elsif ($self->{nc} == 0x0027) { # '
2699     !!!cp (203);
2700     $self->{ct}->{sysid} = ''; # DOCTYPE
2701     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2702     !!!next-input-character;
2703     redo A;
2704     } elsif ($self->{nc} == 0x003E) { # >
2705     !!!parse-error (type => 'no SYSTEM literal');
2706     !!!next-input-character;
2707    
2708 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2709     !!!cp (204);
2710     $self->{state} = DATA_STATE;
2711     $self->{s_kwd} = '';
2712     $self->{ct}->{quirks} = 1;
2713     } else {
2714     !!!cp (204.1);
2715     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2716     }
2717 wakaba 1.1
2718 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2719 wakaba 1.1 redo A;
2720     } elsif ($self->{nc} == -1) {
2721 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2722     !!!cp (205);
2723     !!!parse-error (type => 'unclosed DOCTYPE');
2724     $self->{state} = DATA_STATE;
2725     $self->{s_kwd} = '';
2726     $self->{ct}->{quirks} = 1;
2727     } else {
2728     !!!cp (205.1);
2729     !!!parse-error (type => 'unclosed md'); ## TODO: type
2730     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2731     }
2732    
2733 wakaba 1.1 ## reconsume
2734 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2735 wakaba 1.1 redo A;
2736 wakaba 1.16 } elsif ($self->{is_xml} and
2737     $self->{ct}->{type} == DOCTYPE_TOKEN and
2738     $self->{nc} == 0x005B) { # [
2739 wakaba 1.12 !!!cp (206.1);
2740     !!!parse-error (type => 'no SYSTEM literal');
2741    
2742     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2743     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2744 wakaba 1.13 $self->{in_subset} = 1;
2745 wakaba 1.12 !!!next-input-character;
2746 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2747 wakaba 1.12 redo A;
2748 wakaba 1.1 } else {
2749     !!!parse-error (type => 'string after SYSTEM');
2750    
2751 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2752     !!!cp (206);
2753     $self->{ct}->{quirks} = 1;
2754     $self->{state} = BOGUS_DOCTYPE_STATE;
2755     } else {
2756     !!!cp (206.2);
2757     $self->{state} = BOGUS_MD_STATE;
2758     }
2759    
2760 wakaba 1.1 !!!next-input-character;
2761     redo A;
2762     }
2763     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2764     if ($self->{nc} == 0x0022) { # "
2765     !!!cp (207);
2766     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2767     !!!next-input-character;
2768     redo A;
2769 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2770 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2771    
2772 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2773     !!!cp (208);
2774     $self->{state} = DATA_STATE;
2775     $self->{s_kwd} = '';
2776     $self->{ct}->{quirks} = 1;
2777     } else {
2778     !!!cp (208.1);
2779     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2780     }
2781    
2782 wakaba 1.1 !!!next-input-character;
2783 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2784 wakaba 1.1 redo A;
2785     } elsif ($self->{nc} == -1) {
2786     !!!parse-error (type => 'unclosed SYSTEM literal');
2787    
2788 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2789     !!!cp (209);
2790     $self->{state} = DATA_STATE;
2791     $self->{s_kwd} = '';
2792     $self->{ct}->{quirks} = 1;
2793     } else {
2794     !!!cp (209.1);
2795     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2796     }
2797    
2798 wakaba 1.1 ## reconsume
2799 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2800 wakaba 1.1 redo A;
2801     } else {
2802     !!!cp (210);
2803 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2804 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2805     length $self->{ct}->{sysid});
2806    
2807     ## Stay in the state
2808     !!!next-input-character;
2809     redo A;
2810     }
2811     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2812     if ($self->{nc} == 0x0027) { # '
2813     !!!cp (211);
2814     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2815     !!!next-input-character;
2816     redo A;
2817 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2818 wakaba 1.1 !!!cp (212);
2819     !!!parse-error (type => 'unclosed SYSTEM literal');
2820    
2821     $self->{state} = DATA_STATE;
2822 wakaba 1.5 $self->{s_kwd} = '';
2823 wakaba 1.1 !!!next-input-character;
2824    
2825     $self->{ct}->{quirks} = 1;
2826     !!!emit ($self->{ct}); # DOCTYPE
2827    
2828     redo A;
2829     } elsif ($self->{nc} == -1) {
2830     !!!parse-error (type => 'unclosed SYSTEM literal');
2831    
2832 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2833     !!!cp (213);
2834     $self->{state} = DATA_STATE;
2835     $self->{s_kwd} = '';
2836     $self->{ct}->{quirks} = 1;
2837     } else {
2838     !!!cp (213.1);
2839     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2840     }
2841    
2842 wakaba 1.1 ## reconsume
2843 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2844 wakaba 1.1 redo A;
2845     } else {
2846     !!!cp (214);
2847 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2848 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2849     length $self->{ct}->{sysid});
2850    
2851     ## Stay in the state
2852     !!!next-input-character;
2853     redo A;
2854     }
2855     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2856     if ($is_space->{$self->{nc}}) {
2857 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2858     !!!cp (215.1);
2859     $self->{state} = BEFORE_NDATA_STATE;
2860     } else {
2861     !!!cp (215);
2862     ## Stay in the state
2863     }
2864 wakaba 1.1 !!!next-input-character;
2865     redo A;
2866     } elsif ($self->{nc} == 0x003E) { # >
2867 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2868     !!!cp (216);
2869     $self->{state} = DATA_STATE;
2870     $self->{s_kwd} = '';
2871     } else {
2872     !!!cp (216.1);
2873     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2874     }
2875    
2876 wakaba 1.1 !!!next-input-character;
2877 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2878 wakaba 1.1 redo A;
2879 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2880     ($self->{nc} == 0x004E or # N
2881     $self->{nc} == 0x006E)) { # n
2882     !!!cp (216.2);
2883     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2884     $self->{state} = NDATA_STATE;
2885     $self->{kwd} = chr $self->{nc};
2886     !!!next-input-character;
2887     redo A;
2888 wakaba 1.1 } elsif ($self->{nc} == -1) {
2889 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2890     !!!cp (217);
2891     !!!parse-error (type => 'unclosed DOCTYPE');
2892     $self->{state} = DATA_STATE;
2893     $self->{s_kwd} = '';
2894     $self->{ct}->{quirks} = 1;
2895     } else {
2896     !!!cp (217.1);
2897     !!!parse-error (type => 'unclosed md'); ## TODO: type
2898     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2899     }
2900    
2901 wakaba 1.1 ## reconsume
2902 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2903 wakaba 1.1 redo A;
2904 wakaba 1.16 } elsif ($self->{is_xml} and
2905     $self->{ct}->{type} == DOCTYPE_TOKEN and
2906     $self->{nc} == 0x005B) { # [
2907 wakaba 1.12 !!!cp (218.1);
2908     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2909     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2910 wakaba 1.13 $self->{in_subset} = 1;
2911 wakaba 1.12 !!!next-input-character;
2912 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2913 wakaba 1.12 redo A;
2914 wakaba 1.1 } else {
2915     !!!parse-error (type => 'string after SYSTEM literal');
2916    
2917 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2918     !!!cp (218);
2919     #$self->{ct}->{quirks} = 1;
2920     $self->{state} = BOGUS_DOCTYPE_STATE;
2921     } else {
2922     !!!cp (218.2);
2923     $self->{state} = BOGUS_MD_STATE;
2924     }
2925    
2926 wakaba 1.1 !!!next-input-character;
2927     redo A;
2928     }
2929 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2930     if ($is_space->{$self->{nc}}) {
2931     !!!cp (218.3);
2932     ## Stay in the state.
2933     !!!next-input-character;
2934     redo A;
2935     } elsif ($self->{nc} == 0x003E) { # >
2936     !!!cp (218.4);
2937     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2938     !!!next-input-character;
2939     !!!emit ($self->{ct}); # ENTITY
2940     redo A;
2941     } elsif ($self->{nc} == 0x004E or # N
2942     $self->{nc} == 0x006E) { # n
2943     !!!cp (218.5);
2944     $self->{state} = NDATA_STATE;
2945     $self->{kwd} = chr $self->{nc};
2946     !!!next-input-character;
2947     redo A;
2948     } elsif ($self->{nc} == -1) {
2949     !!!cp (218.6);
2950     !!!parse-error (type => 'unclosed md'); ## TODO: type
2951     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2952     ## reconsume
2953     !!!emit ($self->{ct}); # ENTITY
2954     redo A;
2955     } else {
2956     !!!cp (218.7);
2957     !!!parse-error (type => 'string after SYSTEM literal');
2958     $self->{state} = BOGUS_MD_STATE;
2959     !!!next-input-character;
2960     redo A;
2961     }
2962 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2963     if ($self->{nc} == 0x003E) { # >
2964     !!!cp (219);
2965     $self->{state} = DATA_STATE;
2966 wakaba 1.5 $self->{s_kwd} = '';
2967 wakaba 1.1 !!!next-input-character;
2968    
2969     !!!emit ($self->{ct}); # DOCTYPE
2970    
2971     redo A;
2972 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2973 wakaba 1.13 !!!cp (220.1);
2974     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2975     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2976     $self->{in_subset} = 1;
2977     !!!next-input-character;
2978     !!!emit ($self->{ct}); # DOCTYPE
2979     redo A;
2980 wakaba 1.1 } elsif ($self->{nc} == -1) {
2981     !!!cp (220);
2982     $self->{state} = DATA_STATE;
2983 wakaba 1.5 $self->{s_kwd} = '';
2984 wakaba 1.1 ## reconsume
2985    
2986     !!!emit ($self->{ct}); # DOCTYPE
2987    
2988     redo A;
2989     } else {
2990     !!!cp (221);
2991     my $s = '';
2992 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2993 wakaba 1.1
2994     ## Stay in the state
2995     !!!next-input-character;
2996     redo A;
2997     }
2998     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2999     ## NOTE: "CDATA section state" in the spec is jointly implemented
3000     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3001     ## and |CDATA_SECTION_MSE2_STATE|.
3002 wakaba 1.10
3003     ## XML5: "CDATA state".
3004 wakaba 1.1
3005     if ($self->{nc} == 0x005D) { # ]
3006     !!!cp (221.1);
3007     $self->{state} = CDATA_SECTION_MSE1_STATE;
3008     !!!next-input-character;
3009     redo A;
3010     } elsif ($self->{nc} == -1) {
3011 wakaba 1.6 if ($self->{is_xml}) {
3012 wakaba 1.8 !!!cp (221.11);
3013 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3014 wakaba 1.8 } else {
3015     !!!cp (221.12);
3016 wakaba 1.6 }
3017    
3018 wakaba 1.1 $self->{state} = DATA_STATE;
3019 wakaba 1.5 $self->{s_kwd} = '';
3020 wakaba 1.10 ## Reconsume.
3021 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3022     !!!cp (221.2);
3023     !!!emit ($self->{ct}); # character
3024     } else {
3025     !!!cp (221.3);
3026     ## No token to emit. $self->{ct} is discarded.
3027     }
3028     redo A;
3029     } else {
3030     !!!cp (221.4);
3031     $self->{ct}->{data} .= chr $self->{nc};
3032     $self->{read_until}->($self->{ct}->{data},
3033     q<]>,
3034     length $self->{ct}->{data});
3035    
3036     ## Stay in the state.
3037     !!!next-input-character;
3038     redo A;
3039     }
3040    
3041     ## ISSUE: "text tokens" in spec.
3042     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3043 wakaba 1.10 ## XML5: "CDATA bracket state".
3044    
3045 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3046     !!!cp (221.5);
3047     $self->{state} = CDATA_SECTION_MSE2_STATE;
3048     !!!next-input-character;
3049     redo A;
3050     } else {
3051     !!!cp (221.6);
3052 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3053 wakaba 1.1 $self->{ct}->{data} .= ']';
3054 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3055 wakaba 1.1 ## Reconsume.
3056     redo A;
3057     }
3058     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3059 wakaba 1.10 ## XML5: "CDATA end state".
3060    
3061 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3062     $self->{state} = DATA_STATE;
3063 wakaba 1.5 $self->{s_kwd} = '';
3064 wakaba 1.1 !!!next-input-character;
3065     if (length $self->{ct}->{data}) { # character
3066     !!!cp (221.7);
3067     !!!emit ($self->{ct}); # character
3068     } else {
3069     !!!cp (221.8);
3070     ## No token to emit. $self->{ct} is discarded.
3071     }
3072     redo A;
3073     } elsif ($self->{nc} == 0x005D) { # ]
3074     !!!cp (221.9); # character
3075     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3076     ## Stay in the state.
3077     !!!next-input-character;
3078     redo A;
3079     } else {
3080     !!!cp (221.11);
3081     $self->{ct}->{data} .= ']]'; # character
3082     $self->{state} = CDATA_SECTION_STATE;
3083 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3084 wakaba 1.1 redo A;
3085     }
3086     } elsif ($self->{state} == ENTITY_STATE) {
3087     if ($is_space->{$self->{nc}} or
3088     {
3089     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3090     $self->{entity_add} => 1,
3091     }->{$self->{nc}}) {
3092 wakaba 1.22 if ($self->{is_xml}) {
3093     !!!cp (1001.1);
3094     !!!parse-error (type => 'bare ero',
3095     line => $self->{line_prev},
3096     column => $self->{column_prev}
3097     + ($self->{nc} == -1 ? 1 : 0));
3098     } else {
3099     !!!cp (1001);
3100     ## No error
3101     }
3102 wakaba 1.1 ## Don't consume
3103     ## Return nothing.
3104     #
3105     } elsif ($self->{nc} == 0x0023) { # #
3106     !!!cp (999);
3107     $self->{state} = ENTITY_HASH_STATE;
3108 wakaba 1.12 $self->{kwd} = '#';
3109 wakaba 1.1 !!!next-input-character;
3110     redo A;
3111 wakaba 1.22 } elsif ($self->{is_xml} or
3112     (0x0041 <= $self->{nc} and
3113 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3114     (0x0061 <= $self->{nc} and
3115     $self->{nc} <= 0x007A)) { # a..z
3116     !!!cp (998);
3117     require Whatpm::_NamedEntityList;
3118     $self->{state} = ENTITY_NAME_STATE;
3119 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3120     $self->{entity__value} = $self->{kwd};
3121 wakaba 1.1 $self->{entity__match} = 0;
3122     !!!next-input-character;
3123     redo A;
3124     } else {
3125     !!!cp (1027);
3126     !!!parse-error (type => 'bare ero');
3127     ## Return nothing.
3128     #
3129     }
3130    
3131     ## NOTE: No character is consumed by the "consume a character
3132     ## reference" algorithm. In other words, there is an "&" character
3133     ## that does not introduce a character reference, which would be
3134     ## appended to the parent element or the attribute value in later
3135     ## processing by the tokenizer.
3136    
3137     if ($self->{prev_state} == DATA_STATE) {
3138     !!!cp (997);
3139     $self->{state} = $self->{prev_state};
3140 wakaba 1.5 $self->{s_kwd} = '';
3141 wakaba 1.1 ## Reconsume.
3142     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3143     line => $self->{line_prev},
3144     column => $self->{column_prev},
3145     });
3146     redo A;
3147     } else {
3148     !!!cp (996);
3149     $self->{ca}->{value} .= '&';
3150     $self->{state} = $self->{prev_state};
3151 wakaba 1.5 $self->{s_kwd} = '';
3152 wakaba 1.1 ## Reconsume.
3153     redo A;
3154     }
3155     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3156 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3157 wakaba 1.1 !!!cp (995);
3158     $self->{state} = HEXREF_X_STATE;
3159 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3160 wakaba 1.1 !!!next-input-character;
3161     redo A;
3162 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3163     !!!cp (995.1);
3164     if ($self->{is_xml}) {
3165     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3166     }
3167     $self->{state} = HEXREF_X_STATE;
3168     $self->{kwd} .= chr $self->{nc};
3169     !!!next-input-character;
3170     redo A;
3171 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3172     $self->{nc} <= 0x0039) { # 0..9
3173     !!!cp (994);
3174     $self->{state} = NCR_NUM_STATE;
3175 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3176 wakaba 1.1 !!!next-input-character;
3177     redo A;
3178     } else {
3179     !!!parse-error (type => 'bare nero',
3180     line => $self->{line_prev},
3181     column => $self->{column_prev} - 1);
3182    
3183     ## NOTE: According to the spec algorithm, nothing is returned,
3184     ## and then "&#" is appended to the parent element or the attribute
3185     ## value in the later processing.
3186    
3187     if ($self->{prev_state} == DATA_STATE) {
3188     !!!cp (1019);
3189     $self->{state} = $self->{prev_state};
3190 wakaba 1.5 $self->{s_kwd} = '';
3191 wakaba 1.1 ## Reconsume.
3192     !!!emit ({type => CHARACTER_TOKEN,
3193     data => '&#',
3194     line => $self->{line_prev},
3195     column => $self->{column_prev} - 1,
3196     });
3197     redo A;
3198     } else {
3199     !!!cp (993);
3200     $self->{ca}->{value} .= '&#';
3201     $self->{state} = $self->{prev_state};
3202 wakaba 1.5 $self->{s_kwd} = '';
3203 wakaba 1.1 ## Reconsume.
3204     redo A;
3205     }
3206     }
3207     } elsif ($self->{state} == NCR_NUM_STATE) {
3208     if (0x0030 <= $self->{nc} and
3209     $self->{nc} <= 0x0039) { # 0..9
3210     !!!cp (1012);
3211 wakaba 1.12 $self->{kwd} *= 10;
3212     $self->{kwd} += $self->{nc} - 0x0030;
3213 wakaba 1.1
3214     ## Stay in the state.
3215     !!!next-input-character;
3216     redo A;
3217     } elsif ($self->{nc} == 0x003B) { # ;
3218     !!!cp (1013);
3219     !!!next-input-character;
3220     #
3221     } else {
3222     !!!cp (1014);
3223     !!!parse-error (type => 'no refc');
3224     ## Reconsume.
3225     #
3226     }
3227    
3228 wakaba 1.12 my $code = $self->{kwd};
3229 wakaba 1.1 my $l = $self->{line_prev};
3230     my $c = $self->{column_prev};
3231 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3232     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3233     ($self->{is_xml} and $code == 0x0000)) {
3234 wakaba 1.1 !!!cp (1015);
3235     !!!parse-error (type => 'invalid character reference',
3236     text => (sprintf 'U+%04X', $code),
3237     line => $l, column => $c);
3238     $code = $charref_map->{$code};
3239     } elsif ($code > 0x10FFFF) {
3240     !!!cp (1016);
3241     !!!parse-error (type => 'invalid character reference',
3242     text => (sprintf 'U-%08X', $code),
3243     line => $l, column => $c);
3244     $code = 0xFFFD;
3245     }
3246    
3247     if ($self->{prev_state} == DATA_STATE) {
3248     !!!cp (992);
3249     $self->{state} = $self->{prev_state};
3250 wakaba 1.5 $self->{s_kwd} = '';
3251 wakaba 1.1 ## Reconsume.
3252     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3253 wakaba 1.7 has_reference => 1,
3254 wakaba 1.1 line => $l, column => $c,
3255     });
3256     redo A;
3257     } else {
3258     !!!cp (991);
3259     $self->{ca}->{value} .= chr $code;
3260     $self->{ca}->{has_reference} = 1;
3261     $self->{state} = $self->{prev_state};
3262 wakaba 1.5 $self->{s_kwd} = '';
3263 wakaba 1.1 ## Reconsume.
3264     redo A;
3265     }
3266     } elsif ($self->{state} == HEXREF_X_STATE) {
3267     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3268     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3269     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3270     # 0..9, A..F, a..f
3271     !!!cp (990);
3272     $self->{state} = HEXREF_HEX_STATE;
3273 wakaba 1.12 $self->{kwd} = 0;
3274 wakaba 1.1 ## Reconsume.
3275     redo A;
3276     } else {
3277     !!!parse-error (type => 'bare hcro',
3278     line => $self->{line_prev},
3279     column => $self->{column_prev} - 2);
3280    
3281     ## NOTE: According to the spec algorithm, nothing is returned,
3282     ## and then "&#" followed by "X" or "x" is appended to the parent
3283     ## element or the attribute value in the later processing.
3284    
3285     if ($self->{prev_state} == DATA_STATE) {
3286     !!!cp (1005);
3287     $self->{state} = $self->{prev_state};
3288 wakaba 1.5 $self->{s_kwd} = '';
3289 wakaba 1.1 ## Reconsume.
3290     !!!emit ({type => CHARACTER_TOKEN,
3291 wakaba 1.12 data => '&' . $self->{kwd},
3292 wakaba 1.1 line => $self->{line_prev},
3293 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3294 wakaba 1.1 });
3295     redo A;
3296     } else {
3297     !!!cp (989);
3298 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3299 wakaba 1.1 $self->{state} = $self->{prev_state};
3300 wakaba 1.5 $self->{s_kwd} = '';
3301 wakaba 1.1 ## Reconsume.
3302     redo A;
3303     }
3304     }
3305     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3306     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3307     # 0..9
3308     !!!cp (1002);
3309 wakaba 1.12 $self->{kwd} *= 0x10;
3310     $self->{kwd} += $self->{nc} - 0x0030;
3311 wakaba 1.1 ## Stay in the state.
3312     !!!next-input-character;
3313     redo A;
3314     } elsif (0x0061 <= $self->{nc} and
3315     $self->{nc} <= 0x0066) { # a..f
3316     !!!cp (1003);
3317 wakaba 1.12 $self->{kwd} *= 0x10;
3318     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3319 wakaba 1.1 ## Stay in the state.
3320     !!!next-input-character;
3321     redo A;
3322     } elsif (0x0041 <= $self->{nc} and
3323     $self->{nc} <= 0x0046) { # A..F
3324     !!!cp (1004);
3325 wakaba 1.12 $self->{kwd} *= 0x10;
3326     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3327 wakaba 1.1 ## Stay in the state.
3328     !!!next-input-character;
3329     redo A;
3330     } elsif ($self->{nc} == 0x003B) { # ;
3331     !!!cp (1006);
3332     !!!next-input-character;
3333     #
3334     } else {
3335     !!!cp (1007);
3336     !!!parse-error (type => 'no refc',
3337     line => $self->{line},
3338     column => $self->{column});
3339     ## Reconsume.
3340     #
3341     }
3342    
3343 wakaba 1.12 my $code = $self->{kwd};
3344 wakaba 1.1 my $l = $self->{line_prev};
3345     my $c = $self->{column_prev};
3346 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3347     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3348     ($self->{is_xml} and $code == 0x0000)) {
3349 wakaba 1.1 !!!cp (1008);
3350     !!!parse-error (type => 'invalid character reference',
3351     text => (sprintf 'U+%04X', $code),
3352     line => $l, column => $c);
3353     $code = $charref_map->{$code};
3354     } elsif ($code > 0x10FFFF) {
3355     !!!cp (1009);
3356     !!!parse-error (type => 'invalid character reference',
3357     text => (sprintf 'U-%08X', $code),
3358     line => $l, column => $c);
3359     $code = 0xFFFD;
3360     }
3361    
3362     if ($self->{prev_state} == DATA_STATE) {
3363     !!!cp (988);
3364     $self->{state} = $self->{prev_state};
3365 wakaba 1.5 $self->{s_kwd} = '';
3366 wakaba 1.1 ## Reconsume.
3367     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3368 wakaba 1.7 has_reference => 1,
3369 wakaba 1.1 line => $l, column => $c,
3370     });
3371     redo A;
3372     } else {
3373     !!!cp (987);
3374     $self->{ca}->{value} .= chr $code;
3375     $self->{ca}->{has_reference} = 1;
3376     $self->{state} = $self->{prev_state};
3377 wakaba 1.5 $self->{s_kwd} = '';
3378 wakaba 1.1 ## Reconsume.
3379     redo A;
3380     }
3381     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3382 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3383     $self->{nc} <= 0x005A) or # Z
3384     (0x0061 <= $self->{nc} and # a
3385     $self->{nc} <= 0x007A) or # z
3386     (0x0030 <= $self->{nc} and # 0
3387     $self->{nc} <= 0x0039) or # 9
3388 wakaba 1.22 $self->{nc} == 0x003B or # ;
3389     ($self->{is_xml} and
3390     not ($is_space->{$self->{nc}} or
3391     {
3392     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3393     $self->{entity_add} => 1,
3394     }->{$self->{nc}}))) {
3395 wakaba 1.1 our $EntityChar;
3396 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3397 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3398     $self->{ge}->{$self->{kwd}}) {
3399 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3400 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3401     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3402     !!!cp (1020.1);
3403     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3404     } else {
3405     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3406     !!!cp (1020.2);
3407     !!!parse-error (type => 'unparsed entity', ## TODO: type
3408     value => $self->{kwd});
3409     } else {
3410     !!!cp (1020.3);
3411     }
3412     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3413     }
3414     } else {
3415     if ($self->{is_xml}) {
3416     !!!cp (1020.4);
3417     !!!parse-error (type => 'entity not declared', ## TODO: type
3418     value => $self->{kwd},
3419     level => {
3420     'amp;' => $self->{level}->{warn},
3421     'quot;' => $self->{level}->{warn},
3422     'lt;' => $self->{level}->{warn},
3423     'gt;' => $self->{level}->{warn},
3424     'apos;' => $self->{level}->{warn},
3425     }->{$self->{kwd}} ||
3426     $self->{level}->{must});
3427     } else {
3428     !!!cp (1020);
3429     }
3430     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3431     }
3432 wakaba 1.1 $self->{entity__match} = 1;
3433     !!!next-input-character;
3434     #
3435     } else {
3436     !!!cp (1021);
3437 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3438 wakaba 1.1 $self->{entity__match} = -1;
3439     ## Stay in the state.
3440     !!!next-input-character;
3441     redo A;
3442     }
3443     } else {
3444     !!!cp (1022);
3445     $self->{entity__value} .= chr $self->{nc};
3446     $self->{entity__match} *= 2;
3447     ## Stay in the state.
3448     !!!next-input-character;
3449     redo A;
3450     }
3451     }
3452    
3453     my $data;
3454     my $has_ref;
3455     if ($self->{entity__match} > 0) {
3456     !!!cp (1023);
3457     $data = $self->{entity__value};
3458     $has_ref = 1;
3459     #
3460     } elsif ($self->{entity__match} < 0) {
3461     !!!parse-error (type => 'no refc');
3462     if ($self->{prev_state} != DATA_STATE and # in attribute
3463     $self->{entity__match} < -1) {
3464     !!!cp (1024);
3465 wakaba 1.12 $data = '&' . $self->{kwd};
3466 wakaba 1.1 #
3467     } else {
3468     !!!cp (1025);
3469     $data = $self->{entity__value};
3470     $has_ref = 1;
3471     #
3472     }
3473     } else {
3474     !!!cp (1026);
3475     !!!parse-error (type => 'bare ero',
3476     line => $self->{line_prev},
3477 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3478     $data = '&' . $self->{kwd};
3479 wakaba 1.1 #
3480     }
3481    
3482     ## NOTE: In these cases, when a character reference is found,
3483     ## it is consumed and a character token is returned; otherwise,
3484     ## nothing is consumed or returned, according to the spec algorithm.
3485     ## In this implementation, anything that has been examined by the
3486     ## tokenizer is appended to the parent element or the attribute value
3487     ## as a string at this stage (the literal string when there is no
3488     ## character reference, the entity-replaced string otherwise), since
3489     ## any characters that would not be consumed are appended in the data
3490     ## state or in an appropriate attribute value state anyway.
3491    
3492     if ($self->{prev_state} == DATA_STATE) {
3493     !!!cp (986);
3494     $self->{state} = $self->{prev_state};
3495 wakaba 1.5 $self->{s_kwd} = '';
3496 wakaba 1.1 ## Reconsume.
3497     !!!emit ({type => CHARACTER_TOKEN,
3498     data => $data,
3499 wakaba 1.7 has_reference => $has_ref,
3500 wakaba 1.1 line => $self->{line_prev},
3501 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3502 wakaba 1.1 });
3503     redo A;
3504     } else {
3505     !!!cp (985);
3506     $self->{ca}->{value} .= $data;
3507     $self->{ca}->{has_reference} = 1 if $has_ref;
3508     $self->{state} = $self->{prev_state};
3509 wakaba 1.5 $self->{s_kwd} = '';
3510 wakaba 1.1 ## Reconsume.
3511     redo A;
3512     }
3513 wakaba 1.8
3514     ## XML-only states
3515    
3516     } elsif ($self->{state} == PI_STATE) {
3517 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3518    
3519 wakaba 1.8 if ($is_space->{$self->{nc}} or
3520 wakaba 1.14 $self->{nc} == 0x003F or # ?
3521 wakaba 1.8 $self->{nc} == -1) {
3522 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3523     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3524     ## "DOCTYPE pi state": Parse error, switch to the "data
3525     ## state".
3526 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3527     line => $self->{line_prev},
3528     column => $self->{column_prev}
3529     - 1 * ($self->{nc} != -1));
3530     $self->{state} = BOGUS_COMMENT_STATE;
3531     ## Reconsume.
3532     $self->{ct} = {type => COMMENT_TOKEN,
3533     data => '?',
3534     line => $self->{line_prev},
3535     column => $self->{column_prev}
3536     - 1 * ($self->{nc} != -1),
3537     };
3538     redo A;
3539     } else {
3540 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3541 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3542     target => chr $self->{nc},
3543     data => '',
3544     line => $self->{line_prev},
3545     column => $self->{column_prev} - 1,
3546     };
3547     $self->{state} = PI_TARGET_STATE;
3548     !!!next-input-character;
3549     redo A;
3550     }
3551     } elsif ($self->{state} == PI_TARGET_STATE) {
3552     if ($is_space->{$self->{nc}}) {
3553     $self->{state} = PI_TARGET_AFTER_STATE;
3554     !!!next-input-character;
3555     redo A;
3556     } elsif ($self->{nc} == -1) {
3557     !!!parse-error (type => 'no pic'); ## TODO: type
3558 wakaba 1.13 if ($self->{in_subset}) {
3559     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3560     } else {
3561     $self->{state} = DATA_STATE;
3562     $self->{s_kwd} = '';
3563     }
3564 wakaba 1.8 ## Reconsume.
3565     !!!emit ($self->{ct}); # pi
3566     redo A;
3567     } elsif ($self->{nc} == 0x003F) { # ?
3568     $self->{state} = PI_AFTER_STATE;
3569     !!!next-input-character;
3570     redo A;
3571     } else {
3572     ## XML5: typo ("tag name" -> "target")
3573     $self->{ct}->{target} .= chr $self->{nc}; # pi
3574     !!!next-input-character;
3575     redo A;
3576     }
3577     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3578     if ($is_space->{$self->{nc}}) {
3579     ## Stay in the state.
3580     !!!next-input-character;
3581     redo A;
3582     } else {
3583     $self->{state} = PI_DATA_STATE;
3584     ## Reprocess.
3585     redo A;
3586     }
3587     } elsif ($self->{state} == PI_DATA_STATE) {
3588     if ($self->{nc} == 0x003F) { # ?
3589     $self->{state} = PI_DATA_AFTER_STATE;
3590     !!!next-input-character;
3591     redo A;
3592     } elsif ($self->{nc} == -1) {
3593     !!!parse-error (type => 'no pic'); ## TODO: type
3594 wakaba 1.13 if ($self->{in_subset}) {
3595 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3596 wakaba 1.13 } else {
3597     $self->{state} = DATA_STATE;
3598     $self->{s_kwd} = '';
3599     }
3600 wakaba 1.8 ## Reprocess.
3601     !!!emit ($self->{ct}); # pi
3602     redo A;
3603     } else {
3604     $self->{ct}->{data} .= chr $self->{nc}; # pi
3605     $self->{read_until}->($self->{ct}->{data}, q[?],
3606     length $self->{ct}->{data});
3607     ## Stay in the state.
3608     !!!next-input-character;
3609     ## The character fetched above is processed in this same state.
3610     redo A;
3611     }
3612     } elsif ($self->{state} == PI_AFTER_STATE) {
3613 wakaba 1.14 ## XML5: Part of "Pi after state".
3614    
3615 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3616 wakaba 1.13 if ($self->{in_subset}) {
3617     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3618     } else {
3619     $self->{state} = DATA_STATE;
3620     $self->{s_kwd} = '';
3621     }
3622 wakaba 1.8 !!!next-input-character;
3623     !!!emit ($self->{ct}); # pi
3624     redo A;
3625     } elsif ($self->{nc} == 0x003F) { # ?
3626     !!!parse-error (type => 'no s after target', ## TODO: type
3627     line => $self->{line_prev},
3628     column => $self->{column_prev}); ## XML5: no error
3629     $self->{ct}->{data} .= '?';
3630     $self->{state} = PI_DATA_AFTER_STATE;
3631     !!!next-input-character;
3632     redo A;
3633     } else {
3634     !!!parse-error (type => 'no s after target', ## TODO: type
3635     line => $self->{line_prev},
3636     column => $self->{column_prev}
3637     + 1 * ($self->{nc} == -1)); ## XML5: no error
3638     $self->{ct}->{data} .= '?'; ## XML5: not appended
3639     $self->{state} = PI_DATA_STATE;
3640     ## Reprocess.
3641     redo A;
3642     }
3643     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3644 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3645    
3646 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3647 wakaba 1.13 if ($self->{in_subset}) {
3648     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3649     } else {
3650     $self->{state} = DATA_STATE;
3651     $self->{s_kwd} = '';
3652     }
3653 wakaba 1.8 !!!next-input-character;
3654     !!!emit ($self->{ct}); # pi
3655     redo A;
3656     } elsif ($self->{nc} == 0x003F) { # ?
3657     $self->{ct}->{data} .= '?';
3658     ## Stay in the state.
3659     !!!next-input-character;
3660     redo A;
3661     } else {
3662     $self->{ct}->{data} .= '?'; ## XML5: not appended
3663     $self->{state} = PI_DATA_STATE;
3664     ## Reprocess.
3665     redo A;
3666     }
3667 wakaba 1.12
3668     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3669     if ($self->{nc} == 0x003C) { # <
3670 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3671 wakaba 1.12 !!!next-input-character;
3672     redo A;
3673     } elsif ($self->{nc} == 0x0025) { # %
3674     ## XML5: Not defined yet.
3675    
3676     ## TODO:
3677 wakaba 1.24
3678     if (not $self->{stop_processing} and
3679     not $self->{document}->xml_standalone) {
3680     !!!parse-error (type => 'stop processing', ## TODO: type
3681     level => $self->{level}->{info});
3682     $self->{stop_processing} = 1;
3683     }
3684    
3685 wakaba 1.12 !!!next-input-character;
3686     redo A;
3687     } elsif ($self->{nc} == 0x005D) { # ]
3688 wakaba 1.13 delete $self->{in_subset};
3689 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3690     !!!next-input-character;
3691     redo A;
3692     } elsif ($is_space->{$self->{nc}}) {
3693     ## Stay in the state.
3694     !!!next-input-character;
3695     redo A;
3696     } elsif ($self->{nc} == -1) {
3697     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3698 wakaba 1.13 delete $self->{in_subset};
3699 wakaba 1.12 $self->{state} = DATA_STATE;
3700     $self->{s_kwd} = '';
3701     ## Reconsume.
3702 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3703 wakaba 1.12 redo A;
3704     } else {
3705     unless ($self->{internal_subset_tainted}) {
3706     ## XML5: No parse error.
3707     !!!parse-error (type => 'string in internal subset');
3708     $self->{internal_subset_tainted} = 1;
3709     }
3710     ## Stay in the state.
3711     !!!next-input-character;
3712     redo A;
3713     }
3714     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3715     if ($self->{nc} == 0x003E) { # >
3716     $self->{state} = DATA_STATE;
3717     $self->{s_kwd} = '';
3718     !!!next-input-character;
3719 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3720 wakaba 1.12 redo A;
3721     } elsif ($self->{nc} == -1) {
3722     !!!parse-error (type => 'unclosed DOCTYPE');
3723     $self->{state} = DATA_STATE;
3724     $self->{s_kwd} = '';
3725     ## Reconsume.
3726 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3727 wakaba 1.12 redo A;
3728     } else {
3729     ## XML5: No parse error and stay in the state.
3730     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3731    
3732 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3733     !!!next-input-character;
3734     redo A;
3735     }
3736     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3737     if ($self->{nc} == 0x003E) { # >
3738     $self->{state} = DATA_STATE;
3739     $self->{s_kwd} = '';
3740     !!!next-input-character;
3741     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3742     redo A;
3743     } elsif ($self->{nc} == -1) {
3744     $self->{state} = DATA_STATE;
3745     $self->{s_kwd} = '';
3746     ## Reconsume.
3747     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3748     redo A;
3749     } else {
3750     ## Stay in the state.
3751     !!!next-input-character;
3752     redo A;
3753     }
3754     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3755     if ($self->{nc} == 0x0021) { # !
3756 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3757 wakaba 1.13 !!!next-input-character;
3758     redo A;
3759     } elsif ($self->{nc} == 0x003F) { # ?
3760     $self->{state} = PI_STATE;
3761     !!!next-input-character;
3762     redo A;
3763     } elsif ($self->{nc} == -1) {
3764     !!!parse-error (type => 'bare stago');
3765     $self->{state} = DATA_STATE;
3766     $self->{s_kwd} = '';
3767     ## Reconsume.
3768     redo A;
3769     } else {
3770     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3771     line => $self->{line_prev},
3772     column => $self->{column_prev});
3773     $self->{state} = BOGUS_COMMENT_STATE;
3774     $self->{ct} = {type => COMMENT_TOKEN,
3775     data => '',
3776     }; ## NOTE: Will be discarded.
3777 wakaba 1.12 !!!next-input-character;
3778     redo A;
3779     }
3780 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3781     ## XML5: "DOCTYPE markup declaration state".
3782    
3783     if ($self->{nc} == 0x002D) { # -
3784     $self->{state} = MD_HYPHEN_STATE;
3785     !!!next-input-character;
3786     redo A;
3787 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3788     $self->{nc} == 0x0065) { # e
3789 wakaba 1.14 $self->{state} = MD_E_STATE;
3790     $self->{kwd} = chr $self->{nc};
3791     !!!next-input-character;
3792     redo A;
3793 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3794     $self->{nc} == 0x0061) { # a
3795 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3796     $self->{kwd} = chr $self->{nc};
3797     !!!next-input-character;
3798     redo A;
3799 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3800     $self->{nc} == 0x006E) { # n
3801 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3802     $self->{kwd} = chr $self->{nc};
3803     !!!next-input-character;
3804     redo A;
3805     } else {
3806     #
3807     }
3808    
3809     ## XML5: No parse error.
3810     !!!parse-error (type => 'bogus comment',
3811     line => $self->{line_prev},
3812     column => $self->{column_prev} - 1);
3813     ## Reconsume.
3814     $self->{state} = BOGUS_COMMENT_STATE;
3815     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3816     redo A;
3817     } elsif ($self->{state} == MD_E_STATE) {
3818 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3819     $self->{nc} == 0x006E) { # n
3820 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3821     $self->{kwd} .= chr $self->{nc};
3822     !!!next-input-character;
3823     redo A;
3824 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3825     $self->{nc} == 0x006C) { # l
3826 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3827     $self->{state} = MD_ELEMENT_STATE;
3828     $self->{kwd} .= chr $self->{nc};
3829     !!!next-input-character;
3830     redo A;
3831     } else {
3832     ## XML5: No parse error.
3833     !!!parse-error (type => 'bogus comment',
3834     line => $self->{line_prev},
3835     column => $self->{column_prev} - 2
3836     + 1 * ($self->{nc} == -1));
3837     ## Reconsume.
3838     $self->{state} = BOGUS_COMMENT_STATE;
3839     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3840     redo A;
3841     }
3842     } elsif ($self->{state} == MD_ENTITY_STATE) {
3843 wakaba 1.17 if ($self->{nc} == [
3844     undef,
3845     undef,
3846     0x0054, # T
3847     0x0049, # I
3848     0x0054, # T
3849     ]->[length $self->{kwd}] or
3850     $self->{nc} == [
3851     undef,
3852     undef,
3853     0x0074, # t
3854     0x0069, # i
3855     0x0074, # t
3856     ]->[length $self->{kwd}]) {
3857 wakaba 1.14 ## Stay in the state.
3858     $self->{kwd} .= chr $self->{nc};
3859     !!!next-input-character;
3860     redo A;
3861 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3862     ($self->{nc} == 0x0059 or # Y
3863     $self->{nc} == 0x0079)) { # y
3864     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3865     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3866     text => 'ENTITY',
3867     line => $self->{line_prev},
3868     column => $self->{column_prev} - 4);
3869     }
3870     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3871 wakaba 1.14 line => $self->{line_prev},
3872     column => $self->{column_prev} - 6};
3873     $self->{state} = DOCTYPE_MD_STATE;
3874     !!!next-input-character;
3875     redo A;
3876     } else {
3877     !!!parse-error (type => 'bogus comment',
3878     line => $self->{line_prev},
3879     column => $self->{column_prev} - 1
3880     - (length $self->{kwd})
3881     + 1 * ($self->{nc} == -1));
3882     $self->{state} = BOGUS_COMMENT_STATE;
3883     ## Reconsume.
3884     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3885     redo A;
3886     }
3887     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3888 wakaba 1.17 if ($self->{nc} == [
3889     undef,
3890     undef,
3891     0x0045, # E
3892     0x004D, # M
3893     0x0045, # E
3894     0x004E, # N
3895     ]->[length $self->{kwd}] or
3896     $self->{nc} == [
3897     undef,
3898     undef,
3899     0x0065, # e
3900     0x006D, # m
3901     0x0065, # e
3902     0x006E, # n
3903     ]->[length $self->{kwd}]) {
3904 wakaba 1.14 ## Stay in the state.
3905     $self->{kwd} .= chr $self->{nc};
3906     !!!next-input-character;
3907     redo A;
3908 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3909     ($self->{nc} == 0x0054 or # T
3910     $self->{nc} == 0x0074)) { # t
3911     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3912     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3913     text => 'ELEMENT',
3914     line => $self->{line_prev},
3915     column => $self->{column_prev} - 5);
3916     }
3917 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3918     line => $self->{line_prev},
3919 wakaba 1.23 column => $self->{column_prev} - 7};
3920 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3921     !!!next-input-character;
3922     redo A;
3923     } else {
3924     !!!parse-error (type => 'bogus comment',
3925     line => $self->{line_prev},
3926     column => $self->{column_prev} - 1
3927     - (length $self->{kwd})
3928     + 1 * ($self->{nc} == -1));
3929     $self->{state} = BOGUS_COMMENT_STATE;
3930     ## Reconsume.
3931     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3932     redo A;
3933     }
3934     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3935 wakaba 1.17 if ($self->{nc} == [
3936     undef,
3937     0x0054, # T
3938     0x0054, # T
3939     0x004C, # L
3940     0x0049, # I
3941     0x0053, # S
3942     ]->[length $self->{kwd}] or
3943     $self->{nc} == [
3944     undef,
3945     0x0074, # t
3946     0x0074, # t
3947     0x006C, # l
3948     0x0069, # i
3949     0x0073, # s
3950     ]->[length $self->{kwd}]) {
3951 wakaba 1.14 ## Stay in the state.
3952     $self->{kwd} .= chr $self->{nc};
3953     !!!next-input-character;
3954     redo A;
3955 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3956     ($self->{nc} == 0x0054 or # T
3957     $self->{nc} == 0x0074)) { # t
3958     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3959     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3960     text => 'ATTLIST',
3961     line => $self->{line_prev},
3962     column => $self->{column_prev} - 5);
3963     }
3964 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3965 wakaba 1.15 attrdefs => [],
3966 wakaba 1.14 line => $self->{line_prev},
3967 wakaba 1.23 column => $self->{column_prev} - 7};
3968 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3969     !!!next-input-character;
3970     redo A;
3971     } else {
3972     !!!parse-error (type => 'bogus comment',
3973     line => $self->{line_prev},
3974     column => $self->{column_prev} - 1
3975     - (length $self->{kwd})
3976     + 1 * ($self->{nc} == -1));
3977     $self->{state} = BOGUS_COMMENT_STATE;
3978     ## Reconsume.
3979     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3980     redo A;
3981     }
3982     } elsif ($self->{state} == MD_NOTATION_STATE) {
3983 wakaba 1.17 if ($self->{nc} == [
3984     undef,
3985     0x004F, # O
3986     0x0054, # T
3987     0x0041, # A
3988     0x0054, # T
3989     0x0049, # I
3990     0x004F, # O
3991     ]->[length $self->{kwd}] or
3992     $self->{nc} == [
3993     undef,
3994     0x006F, # o
3995     0x0074, # t
3996     0x0061, # a
3997     0x0074, # t
3998     0x0069, # i
3999     0x006F, # o
4000     ]->[length $self->{kwd}]) {
4001 wakaba 1.14 ## Stay in the state.
4002     $self->{kwd} .= chr $self->{nc};
4003     !!!next-input-character;
4004     redo A;
4005 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4006     ($self->{nc} == 0x004E or # N
4007     $self->{nc} == 0x006E)) { # n
4008     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4009     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4010     text => 'NOTATION',
4011     line => $self->{line_prev},
4012     column => $self->{column_prev} - 6);
4013     }
4014 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4015     line => $self->{line_prev},
4016 wakaba 1.23 column => $self->{column_prev} - 8};
4017 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4018     !!!next-input-character;
4019     redo A;
4020     } else {
4021     !!!parse-error (type => 'bogus comment',
4022     line => $self->{line_prev},
4023     column => $self->{column_prev} - 1
4024     - (length $self->{kwd})
4025     + 1 * ($self->{nc} == -1));
4026     $self->{state} = BOGUS_COMMENT_STATE;
4027     ## Reconsume.
4028     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4029     redo A;
4030     }
4031     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4032     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4033     ## "DOCTYPE NOTATION state".
4034    
4035     if ($is_space->{$self->{nc}}) {
4036     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4037     $self->{state} = BEFORE_MD_NAME_STATE;
4038     !!!next-input-character;
4039     redo A;
4040     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4041     $self->{nc} == 0x0025) { # %
4042     ## XML5: Switch to the "DOCTYPE bogus comment state".
4043     !!!parse-error (type => 'no space before md name'); ## TODO: type
4044     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4045     !!!next-input-character;
4046     redo A;
4047     } elsif ($self->{nc} == -1) {
4048     !!!parse-error (type => 'unclosed md'); ## TODO: type
4049     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4050     ## Reconsume.
4051     redo A;
4052     } elsif ($self->{nc} == 0x003E) { # >
4053     ## XML5: Switch to the "DOCTYPE bogus comment state".
4054     !!!parse-error (type => 'no md name'); ## TODO: type
4055     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4056     !!!next-input-character;
4057     redo A;
4058     } else {
4059     ## XML5: Switch to the "DOCTYPE bogus comment state".
4060     !!!parse-error (type => 'no space before md name'); ## TODO: type
4061     $self->{state} = BEFORE_MD_NAME_STATE;
4062     redo A;
4063     }
4064     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4065     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4066     ## before state", "DOCTYPE ATTLIST name before state".
4067    
4068     if ($is_space->{$self->{nc}}) {
4069     ## Stay in the state.
4070     !!!next-input-character;
4071     redo A;
4072     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4073     $self->{nc} == 0x0025) { # %
4074     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4075     !!!next-input-character;
4076     redo A;
4077     } elsif ($self->{nc} == 0x003E) { # >
4078     ## XML5: Same as "Anything else".
4079     !!!parse-error (type => 'no md name'); ## TODO: type
4080     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4081     !!!next-input-character;
4082     redo A;
4083     } elsif ($self->{nc} == -1) {
4084     !!!parse-error (type => 'unclosed md'); ## TODO: type
4085     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4086     ## Reconsume.
4087     redo A;
4088     } else {
4089     ## XML5: [ATTLIST] Not defined yet.
4090     $self->{ct}->{name} .= chr $self->{nc};
4091     $self->{state} = MD_NAME_STATE;
4092     !!!next-input-character;
4093     redo A;
4094     }
4095     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4096     if ($is_space->{$self->{nc}}) {
4097     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4098     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4099     $self->{state} = BEFORE_MD_NAME_STATE;
4100     !!!next-input-character;
4101     redo A;
4102     } elsif ($self->{nc} == 0x003E) { # >
4103     ## XML5: Same as "Anything else".
4104     !!!parse-error (type => 'no md name'); ## TODO: type
4105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4106     !!!next-input-character;
4107     redo A;
4108     } elsif ($self->{nc} == -1) {
4109     !!!parse-error (type => 'unclosed md');
4110     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4111     ## Reconsume.
4112     redo A;
4113     } else {
4114     ## XML5: No parse error.
4115     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4116     $self->{state} = BOGUS_COMMENT_STATE;
4117     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4118     ## Reconsume.
4119     redo A;
4120     }
4121     } elsif ($self->{state} == MD_NAME_STATE) {
4122     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4123    
4124     if ($is_space->{$self->{nc}}) {
4125 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4126     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4127     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4128 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4129 wakaba 1.16 } else { # ENTITY/NOTATION
4130     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4131     }
4132 wakaba 1.14 !!!next-input-character;
4133     redo A;
4134     } elsif ($self->{nc} == 0x003E) { # >
4135     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4136     #
4137     } else {
4138 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4139 wakaba 1.14 }
4140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4141     !!!next-input-character;
4142     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4143     redo A;
4144     } elsif ($self->{nc} == -1) {
4145     ## XML5: [ATTLIST] No parse error.
4146     !!!parse-error (type => 'unclosed md');
4147     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4148     ## Reconsume.
4149     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4150     redo A;
4151     } else {
4152     ## XML5: [ATTLIST] Not defined yet.
4153     $self->{ct}->{name} .= chr $self->{nc};
4154     ## Stay in the state.
4155     !!!next-input-character;
4156     redo A;
4157     }
4158     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4159     if ($is_space->{$self->{nc}}) {
4160     ## Stay in the state.
4161     !!!next-input-character;
4162     redo A;
4163     } elsif ($self->{nc} == 0x003E) { # >
4164     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4165     !!!next-input-character;
4166     !!!emit ($self->{ct}); # ATTLIST
4167     redo A;
4168     } elsif ($self->{nc} == -1) {
4169     ## XML5: No parse error.
4170     !!!parse-error (type => 'unclosed md'); ## TODO: type
4171     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4172 wakaba 1.15 !!!emit ($self->{ct});
4173     redo A;
4174     } else {
4175     ## XML5: Not defined yet.
4176     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4177     tokens => [],
4178     line => $self->{line}, column => $self->{column}};
4179     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4180     !!!next-input-character;
4181     redo A;
4182     }
4183     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4184     if ($is_space->{$self->{nc}}) {
4185     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4186     !!!next-input-character;
4187     redo A;
4188     } elsif ($self->{nc} == 0x003E) { # >
4189     ## XML5: Same as "anything else".
4190     !!!parse-error (type => 'no attr type'); ## TODO: type
4191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4192     !!!next-input-character;
4193     !!!emit ($self->{ct}); # ATTLIST
4194     redo A;
4195     } elsif ($self->{nc} == 0x0028) { # (
4196     ## XML5: Same as "anything else".
4197     !!!parse-error (type => 'no space before paren'); ## TODO: type
4198     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4199     !!!next-input-character;
4200     redo A;
4201     } elsif ($self->{nc} == -1) {
4202     ## XML5: No parse error.
4203     !!!parse-error (type => 'unclosed md'); ## TODO: type
4204     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4205     !!!next-input-character;
4206     !!!emit ($self->{ct}); # ATTLIST
4207     redo A;
4208     } else {
4209     ## XML5: Not defined yet.
4210     $self->{ca}->{name} .= chr $self->{nc};
4211     ## Stay in the state.
4212     !!!next-input-character;
4213     redo A;
4214     }
4215     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4216     if ($is_space->{$self->{nc}}) {
4217     ## Stay in the state.
4218     !!!next-input-character;
4219     redo A;
4220     } elsif ($self->{nc} == 0x003E) { # >
4221     ## XML5: Same as "anything else".
4222     !!!parse-error (type => 'no attr type'); ## TODO: type
4223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224     !!!next-input-character;
4225     !!!emit ($self->{ct}); # ATTLIST
4226     redo A;
4227     } elsif ($self->{nc} == 0x0028) { # (
4228     ## XML5: Same as "anything else".
4229     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4230     !!!next-input-character;
4231     redo A;
4232     } elsif ($self->{nc} == -1) {
4233     ## XML5: No parse error.
4234     !!!parse-error (type => 'unclosed md'); ## TODO: type
4235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4236     !!!next-input-character;
4237     !!!emit ($self->{ct});
4238 wakaba 1.14 redo A;
4239     } else {
4240     ## XML5: Not defined yet.
4241 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4242     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4243     !!!next-input-character;
4244     redo A;
4245     }
4246     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4247     if ($is_space->{$self->{nc}}) {
4248     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4249     !!!next-input-character;
4250     redo A;
4251     } elsif ($self->{nc} == 0x0023) { # #
4252     ## XML5: Same as "anything else".
4253     !!!parse-error (type => 'no space before default value'); ## TODO: type
4254     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4255     !!!next-input-character;
4256     redo A;
4257     } elsif ($self->{nc} == 0x0022) { # "
4258     ## XML5: Same as "anything else".
4259     !!!parse-error (type => 'no space before default value'); ## TODO: type
4260     $self->{ca}->{value} = '';
4261     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4262     !!!next-input-character;
4263     redo A;
4264     } elsif ($self->{nc} == 0x0027) { # '
4265     ## XML5: Same as "anything else".
4266     !!!parse-error (type => 'no space before default value'); ## TODO: type
4267     $self->{ca}->{value} = '';
4268     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4269     !!!next-input-character;
4270     redo A;
4271     } elsif ($self->{nc} == 0x003E) { # >
4272     ## XML5: Same as "anything else".
4273     !!!parse-error (type => 'no attr default'); ## TODO: type
4274     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275     !!!next-input-character;
4276     !!!emit ($self->{ct}); # ATTLIST
4277     redo A;
4278     } elsif ($self->{nc} == 0x0028) { # (
4279     ## XML5: Same as "anything else".
4280     !!!parse-error (type => 'no space before paren'); ## TODO: type
4281     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4282     !!!next-input-character;
4283     redo A;
4284     } elsif ($self->{nc} == -1) {
4285     ## XML5: No parse error.
4286     !!!parse-error (type => 'unclosed md'); ## TODO: type
4287     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4288     !!!next-input-character;
4289     !!!emit ($self->{ct});
4290     redo A;
4291     } else {
4292     ## XML5: Not defined yet.
4293     $self->{ca}->{type} .= chr $self->{nc};
4294     ## Stay in the state.
4295     !!!next-input-character;
4296     redo A;
4297     }
4298     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4299     if ($is_space->{$self->{nc}}) {
4300     ## Stay in the state.
4301     !!!next-input-character;
4302     redo A;
4303     } elsif ($self->{nc} == 0x0028) { # (
4304     ## XML5: Same as "anything else".
4305     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306     !!!next-input-character;
4307     redo A;
4308     } elsif ($self->{nc} == 0x0023) { # #
4309     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4310     !!!next-input-character;
4311     redo A;
4312     } elsif ($self->{nc} == 0x0022) { # "
4313     ## XML5: Same as "anything else".
4314     $self->{ca}->{value} = '';
4315     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4316     !!!next-input-character;
4317     redo A;
4318     } elsif ($self->{nc} == 0x0027) { # '
4319     ## XML5: Same as "anything else".
4320     $self->{ca}->{value} = '';
4321     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4322     !!!next-input-character;
4323     redo A;
4324     } elsif ($self->{nc} == 0x003E) { # >
4325     ## XML5: Same as "anything else".
4326     !!!parse-error (type => 'no attr default'); ## TODO: type
4327     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4328     !!!next-input-character;
4329     !!!emit ($self->{ct}); # ATTLIST
4330     redo A;
4331     } elsif ($self->{nc} == -1) {
4332     ## XML5: No parse error.
4333     !!!parse-error (type => 'unclosed md'); ## TODO: type
4334     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4335     !!!next-input-character;
4336     !!!emit ($self->{ct});
4337     redo A;
4338     } else {
4339     ## XML5: Switch to the "DOCTYPE bogus comment state".
4340     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4341     $self->{ca}->{value} = '';
4342     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4343     ## Reconsume.
4344     redo A;
4345     }
4346     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4347     if ($is_space->{$self->{nc}}) {
4348     ## Stay in the state.
4349     !!!next-input-character;
4350     redo A;
4351     } elsif ($self->{nc} == 0x007C) { # |
4352     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4353     ## Stay in the state.
4354     !!!next-input-character;
4355     redo A;
4356     } elsif ($self->{nc} == 0x0029) { # )
4357     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4358     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4359     !!!next-input-character;
4360     redo A;
4361     } elsif ($self->{nc} == 0x003E) { # >
4362     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364     !!!next-input-character;
4365     !!!emit ($self->{ct}); # ATTLIST
4366     redo A;
4367     } elsif ($self->{nc} == -1) {
4368     ## XML5: No parse error.
4369     !!!parse-error (type => 'unclosed md'); ## TODO: type
4370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4371     !!!next-input-character;
4372     !!!emit ($self->{ct});
4373     redo A;
4374     } else {
4375     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4376     $self->{state} = ALLOWED_TOKEN_STATE;
4377     !!!next-input-character;
4378     redo A;
4379     }
4380     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4381     if ($is_space->{$self->{nc}}) {
4382     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4383     !!!next-input-character;
4384     redo A;
4385     } elsif ($self->{nc} == 0x007C) { # |
4386     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4387     !!!next-input-character;
4388     redo A;
4389     } elsif ($self->{nc} == 0x0029) { # )
4390     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4391     !!!next-input-character;
4392     redo A;
4393     } elsif ($self->{nc} == 0x003E) { # >
4394     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4395     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4396     !!!next-input-character;
4397     !!!emit ($self->{ct}); # ATTLIST
4398     redo A;
4399     } elsif ($self->{nc} == -1) {
4400     ## XML5: No parse error.
4401     !!!parse-error (type => 'unclosed md'); ## TODO: type
4402     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4403     !!!next-input-character;
4404     !!!emit ($self->{ct});
4405     redo A;
4406     } else {
4407     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4408     ## Stay in the state.
4409     !!!next-input-character;
4410     redo A;
4411     }
4412     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4413     if ($is_space->{$self->{nc}}) {
4414     ## Stay in the state.
4415     !!!next-input-character;
4416     redo A;
4417     } elsif ($self->{nc} == 0x007C) { # |
4418     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4419     !!!next-input-character;
4420     redo A;
4421     } elsif ($self->{nc} == 0x0029) { # )
4422     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4423     !!!next-input-character;
4424     redo A;
4425     } elsif ($self->{nc} == 0x003E) { # >
4426     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4427     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428     !!!next-input-character;
4429     !!!emit ($self->{ct}); # ATTLIST
4430     redo A;
4431     } elsif ($self->{nc} == -1) {
4432     ## XML5: No parse error.
4433     !!!parse-error (type => 'unclosed md'); ## TODO: type
4434     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4435     !!!next-input-character;
4436     !!!emit ($self->{ct});
4437     redo A;
4438     } else {
4439     !!!parse-error (type => 'space in allowed token', ## TODO: type
4440     line => $self->{line_prev},
4441     column => $self->{column_prev});
4442     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4443     $self->{state} = ALLOWED_TOKEN_STATE;
4444     !!!next-input-character;
4445     redo A;
4446     }
4447     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4448     if ($is_space->{$self->{nc}}) {
4449     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4450     !!!next-input-character;
4451     redo A;
4452     } elsif ($self->{nc} == 0x0023) { # #
4453     !!!parse-error (type => 'no space before default value'); ## TODO: type
4454     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4455     !!!next-input-character;
4456     redo A;
4457     } elsif ($self->{nc} == 0x0022) { # "
4458     !!!parse-error (type => 'no space before default value'); ## TODO: type
4459     $self->{ca}->{value} = '';
4460     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4461     !!!next-input-character;
4462     redo A;
4463     } elsif ($self->{nc} == 0x0027) { # '
4464     !!!parse-error (type => 'no space before default value'); ## TODO: type
4465     $self->{ca}->{value} = '';
4466     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4467     !!!next-input-character;
4468     redo A;
4469     } elsif ($self->{nc} == 0x003E) { # >
4470     !!!parse-error (type => 'no attr default'); ## TODO: type
4471     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4472     !!!next-input-character;
4473     !!!emit ($self->{ct}); # ATTLIST
4474     redo A;
4475     } elsif ($self->{nc} == -1) {
4476     !!!parse-error (type => 'unclosed md'); ## TODO: type
4477     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4478     !!!next-input-character;
4479     !!!emit ($self->{ct});
4480     redo A;
4481     } else {
4482     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4483     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4484     ## Reconsume.
4485     redo A;
4486     }
4487     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4488     if ($is_space->{$self->{nc}}) {
4489     ## Stay in the state.
4490     !!!next-input-character;
4491     redo A;
4492     } elsif ($self->{nc} == 0x0023) { # #
4493     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4494     !!!next-input-character;
4495     redo A;
4496     } elsif ($self->{nc} == 0x0022) { # "
4497     $self->{ca}->{value} = '';
4498     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4499     !!!next-input-character;
4500     redo A;
4501     } elsif ($self->{nc} == 0x0027) { # '
4502     $self->{ca}->{value} = '';
4503     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4504     !!!next-input-character;
4505     redo A;
4506     } elsif ($self->{nc} == 0x003E) { # >
4507     !!!parse-error (type => 'no attr default'); ## TODO: type
4508     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509     !!!next-input-character;
4510     !!!emit ($self->{ct}); # ATTLIST
4511     redo A;
4512     } elsif ($self->{nc} == -1) {
4513     !!!parse-error (type => 'unclosed md'); ## TODO: type
4514     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4515     !!!next-input-character;
4516     !!!emit ($self->{ct});
4517     redo A;
4518     } else {
4519     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4520     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4521     ## Reconsume.
4522     redo A;
4523     }
4524     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4525     if ($is_space->{$self->{nc}}) {
4526     ## XML5: No parse error.
4527     !!!parse-error (type => 'no default type'); ## TODO: type
4528 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4529 wakaba 1.14 ## Reconsume.
4530     redo A;
4531 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4532     ## XML5: Same as "anything else".
4533     $self->{ca}->{value} = '';
4534     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4535     !!!next-input-character;
4536     redo A;
4537     } elsif ($self->{nc} == 0x0027) { # '
4538     ## XML5: Same as "anything else".
4539     $self->{ca}->{value} = '';
4540     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4541     !!!next-input-character;
4542     redo A;
4543     } elsif ($self->{nc} == 0x003E) { # >
4544     ## XML5: Same as "anything else".
4545     !!!parse-error (type => 'no attr default'); ## TODO: type
4546     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4547     !!!next-input-character;
4548     !!!emit ($self->{ct}); # ATTLIST
4549     redo A;
4550     } elsif ($self->{nc} == -1) {
4551     ## XML5: No parse error.
4552     !!!parse-error (type => 'unclosed md'); ## TODO: type
4553     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4554     !!!next-input-character;
4555     !!!emit ($self->{ct});
4556     redo A;
4557     } else {
4558     $self->{ca}->{default} = chr $self->{nc};
4559     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4560     !!!next-input-character;
4561     redo A;
4562 wakaba 1.14 }
4563 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4564     if ($is_space->{$self->{nc}}) {
4565     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4566     !!!next-input-character;
4567     redo A;
4568     } elsif ($self->{nc} == 0x0022) { # "
4569     ## XML5: Same as "anything else".
4570     !!!parse-error (type => 'no space before default value'); ## TODO: type
4571     $self->{ca}->{value} = '';
4572     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4573     !!!next-input-character;
4574     redo A;
4575     } elsif ($self->{nc} == 0x0027) { # '
4576     ## XML5: Same as "anything else".
4577     !!!parse-error (type => 'no space before default value'); ## TODO: type
4578     $self->{ca}->{value} = '';
4579     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4580     !!!next-input-character;
4581     redo A;
4582     } elsif ($self->{nc} == 0x003E) { # >
4583     ## XML5: Same as "anything else".
4584     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4585     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4586     !!!next-input-character;
4587     !!!emit ($self->{ct}); # ATTLIST
4588     redo A;
4589     } elsif ($self->{nc} == -1) {
4590     ## XML5: No parse error.
4591     !!!parse-error (type => 'unclosed md'); ## TODO: type
4592     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4593     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4594     !!!next-input-character;
4595     !!!emit ($self->{ct});
4596     redo A;
4597     } else {
4598     $self->{ca}->{default} .= chr $self->{nc};
4599     ## Stay in the state.
4600     !!!next-input-character;
4601     redo A;
4602     }
4603     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4604     if ($is_space->{$self->{nc}}) {
4605     ## Stay in the state.
4606     !!!next-input-character;
4607     redo A;
4608     } elsif ($self->{nc} == 0x0022) { # "
4609     $self->{ca}->{value} = '';
4610     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4611     !!!next-input-character;
4612     redo A;
4613     } elsif ($self->{nc} == 0x0027) { # '
4614     $self->{ca}->{value} = '';
4615     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4616     !!!next-input-character;
4617     redo A;
4618     } elsif ($self->{nc} == 0x003E) { # >
4619     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4620     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4621     !!!next-input-character;
4622     !!!emit ($self->{ct}); # ATTLIST
4623     redo A;
4624     } elsif ($self->{nc} == -1) {
4625     ## XML5: No parse error.
4626     !!!parse-error (type => 'unclosed md'); ## TODO: type
4627     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4628     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4629     !!!next-input-character;
4630     !!!emit ($self->{ct});
4631     redo A;
4632     } else {
4633     ## XML5: Not defined yet.
4634     if ($self->{ca}->{default} eq 'FIXED') {
4635     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4636     } else {
4637     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4638     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4639     }
4640     ## Reconsume.
4641     redo A;
4642     }
4643     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4644     if ($is_space->{$self->{nc}} or
4645     $self->{nc} == -1 or
4646     $self->{nc} == 0x003E) { # >
4647     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4648     ## Reconsume.
4649     redo A;
4650     } else {
4651     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4652     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4653     ## Reconsume.
4654     redo A;
4655 wakaba 1.16 }
4656 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4657     ## ASCII case-insensitive
4658     if ($self->{nc} == [
4659     undef,
4660     0x0044, # D
4661     0x0041, # A
4662     0x0054, # T
4663     ]->[length $self->{kwd}] or
4664     $self->{nc} == [
4665     undef,
4666     0x0064, # d
4667     0x0061, # a
4668     0x0074, # t
4669     ]->[length $self->{kwd}]) {
4670     !!!cp (172.2);
4671     ## Stay in the state.
4672     $self->{kwd} .= chr $self->{nc};
4673     !!!next-input-character;
4674     redo A;
4675     } elsif ((length $self->{kwd}) == 4 and
4676     ($self->{nc} == 0x0041 or # A
4677     $self->{nc} == 0x0061)) { # a
4678     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4679     !!!cp (172.3);
4680     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4681     text => 'NDATA',
4682     line => $self->{line_prev},
4683     column => $self->{column_prev} - 4);
4684     } else {
4685     !!!cp (172.4);
4686     }
4687     $self->{state} = AFTER_NDATA_STATE;
4688     !!!next-input-character;
4689     redo A;
4690     } else {
4691     !!!parse-error (type => 'string after literal', ## TODO: type
4692     line => $self->{line_prev},
4693     column => $self->{column_prev} + 1
4694     - length $self->{kwd});
4695     !!!cp (172.5);
4696     $self->{state} = BOGUS_MD_STATE;
4697     ## Reconsume.
4698     redo A;
4699     }
4700     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4701     if ($is_space->{$self->{nc}}) {
4702     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4703     !!!next-input-character;
4704     redo A;
4705     } elsif ($self->{nc} == 0x003E) { # >
4706     !!!parse-error (type => 'no notation name'); ## TODO: type
4707     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4708     !!!next-input-character;
4709     !!!emit ($self->{ct}); # ENTITY
4710     redo A;
4711     } elsif ($self->{nc} == -1) {
4712     !!!parse-error (type => 'unclosed md'); ## TODO: type
4713     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4714     !!!next-input-character;
4715     !!!emit ($self->{ct}); # ENTITY
4716     redo A;
4717     } else {
4718     !!!parse-error (type => 'string after literal', ## TODO: type
4719     line => $self->{line_prev},
4720     column => $self->{column_prev} + 1
4721     - length $self->{kwd});
4722     $self->{state} = BOGUS_MD_STATE;
4723     ## Reconsume.
4724     redo A;
4725     }
4726     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4727     if ($is_space->{$self->{nc}}) {
4728     ## Stay in the state.
4729     !!!next-input-character;
4730     redo A;
4731     } elsif ($self->{nc} == 0x003E) { # >
4732     !!!parse-error (type => 'no notation name'); ## TODO: type
4733     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4734     !!!next-input-character;
4735     !!!emit ($self->{ct}); # ENTITY
4736     redo A;
4737     } elsif ($self->{nc} == -1) {
4738     !!!parse-error (type => 'unclosed md'); ## TODO: type
4739     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4740     !!!next-input-character;
4741     !!!emit ($self->{ct}); # ENTITY
4742     redo A;
4743     } else {
4744     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4745     $self->{state} = NOTATION_NAME_STATE;
4746     !!!next-input-character;
4747     redo A;
4748     }
4749     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4750     if ($is_space->{$self->{nc}}) {
4751 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4752 wakaba 1.18 !!!next-input-character;
4753     redo A;
4754     } elsif ($self->{nc} == 0x003E) { # >
4755     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4756     !!!next-input-character;
4757     !!!emit ($self->{ct}); # ENTITY
4758     redo A;
4759     } elsif ($self->{nc} == -1) {
4760     !!!parse-error (type => 'unclosed md'); ## TODO: type
4761     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4762     !!!next-input-character;
4763     !!!emit ($self->{ct}); # ENTITY
4764     redo A;
4765     } else {
4766     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4767     ## Stay in the state.
4768     !!!next-input-character;
4769     redo A;
4770     }
4771 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4772     if ($self->{nc} == 0x0022) { # "
4773 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4774 wakaba 1.19 !!!next-input-character;
4775     redo A;
4776     } elsif ($self->{nc} == 0x0026) { # &
4777     $self->{prev_state} = $self->{state};
4778     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4779     $self->{entity_add} = 0x0022; # "
4780     !!!next-input-character;
4781     redo A;
4782     ## TODO: %
4783     } elsif ($self->{nc} == -1) {
4784     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4785     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4786     ## Reconsume.
4787     !!!emit ($self->{ct}); # ENTITY
4788     redo A;
4789     } else {
4790     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4791     !!!next-input-character;
4792     redo A;
4793     }
4794     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4795     if ($self->{nc} == 0x0027) { # '
4796 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4797 wakaba 1.19 !!!next-input-character;
4798     redo A;
4799     } elsif ($self->{nc} == 0x0026) { # &
4800     $self->{prev_state} = $self->{state};
4801     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4802     $self->{entity_add} = 0x0027; # '
4803     !!!next-input-character;
4804     redo A;
4805     ## TODO: %
4806     } elsif ($self->{nc} == -1) {
4807     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4808     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4809     ## Reconsume.
4810     !!!emit ($self->{ct}); # ENTITY
4811     redo A;
4812     } else {
4813     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4814     !!!next-input-character;
4815     redo A;
4816     }
4817     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4818     if ($is_space->{$self->{nc}} or
4819     {
4820     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4821     $self->{entity_add} => 1,
4822     }->{$self->{nc}}) {
4823 wakaba 1.22 !!!parse-error (type => 'bare ero',
4824     line => $self->{line_prev},
4825     column => $self->{column_prev}
4826     + ($self->{nc} == -1 ? 1 : 0));
4827 wakaba 1.19 ## Don't consume
4828     ## Return nothing.
4829     #
4830     } elsif ($self->{nc} == 0x0023) { # #
4831     $self->{ca} = $self->{ct};
4832     $self->{state} = ENTITY_HASH_STATE;
4833     $self->{kwd} = '#';
4834     !!!next-input-character;
4835     redo A;
4836     } else {
4837     #
4838     }
4839    
4840     $self->{ct}->{value} .= '&';
4841     $self->{state} = $self->{prev_state};
4842     ## Reconsume.
4843     redo A;
4844 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4845     if ($is_space->{$self->{nc}}) {
4846     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4847     !!!next-input-character;
4848     redo A;
4849     } elsif ($self->{nc} == 0x0028) { # (
4850     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4851     $self->{ct}->{content} = ['('];
4852     $self->{group_depth} = 1;
4853     !!!next-input-character;
4854     redo A;
4855     } elsif ($self->{nc} == 0x003E) { # >
4856     !!!parse-error (type => 'no md def'); ## TODO: type
4857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4858     !!!next-input-character;
4859     !!!emit ($self->{ct}); # ELEMENT
4860     redo A;
4861     } elsif ($self->{nc} == -1) {
4862     !!!parse-error (type => 'unclosed md'); ## TODO: type
4863     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4864     !!!next-input-character;
4865     !!!emit ($self->{ct}); # ELEMENT
4866     redo A;
4867     } else {
4868     $self->{ct}->{content} = [chr $self->{nc}];
4869     $self->{state} = CONTENT_KEYWORD_STATE;
4870     !!!next-input-character;
4871     redo A;
4872     }
4873     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4874     if ($is_space->{$self->{nc}}) {
4875     $self->{state} = AFTER_MD_DEF_STATE;
4876     !!!next-input-character;
4877     redo A;
4878     } elsif ($self->{nc} == 0x003E) { # >
4879     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4880     !!!next-input-character;
4881     !!!emit ($self->{ct}); # ELEMENT
4882     redo A;
4883     } elsif ($self->{nc} == -1) {
4884     !!!parse-error (type => 'unclosed md'); ## TODO: type
4885     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4886     !!!next-input-character;
4887     !!!emit ($self->{ct}); # ELEMENT
4888     redo A;
4889     } else {
4890     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4891     ## Stay in the state.
4892     !!!next-input-character;
4893     redo A;
4894     }
4895     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4896     if ($is_space->{$self->{nc}}) {
4897     ## Stay in the state.
4898     !!!next-input-character;
4899     redo A;
4900     } elsif ($self->{nc} == 0x0028) { # (
4901     $self->{group_depth}++;
4902     push @{$self->{ct}->{content}}, chr $self->{nc};
4903     ## Stay in the state.
4904     !!!next-input-character;
4905     redo A;
4906     } elsif ($self->{nc} == 0x007C or # |
4907     $self->{nc} == 0x002C) { # ,
4908     !!!parse-error (type => 'empty element name'); ## TODO: type
4909     ## Stay in the state.
4910     !!!next-input-character;
4911     redo A;
4912     } elsif ($self->{nc} == 0x0029) { # )
4913     !!!parse-error (type => 'empty element name'); ## TODO: type
4914     push @{$self->{ct}->{content}}, chr $self->{nc};
4915     $self->{group_depth}--;
4916     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4917     !!!next-input-character;
4918     redo A;
4919     } elsif ($self->{nc} == 0x003E) { # >
4920     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4921     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4922     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4923     !!!next-input-character;
4924     !!!emit ($self->{ct}); # ELEMENT
4925     redo A;
4926     } elsif ($self->{nc} == -1) {
4927     !!!parse-error (type => 'unclosed md'); ## TODO: type
4928     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4929     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4930     !!!next-input-character;
4931     !!!emit ($self->{ct}); # ELEMENT
4932     redo A;
4933     } else {
4934     push @{$self->{ct}->{content}}, chr $self->{nc};
4935     $self->{state} = CM_ELEMENT_NAME_STATE;
4936     !!!next-input-character;
4937     redo A;
4938     }
4939     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4940     if ($is_space->{$self->{nc}}) {
4941     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4942     !!!next-input-character;
4943     redo A;
4944     } elsif ($self->{nc} == 0x002A or # *
4945     $self->{nc} == 0x002B or # +
4946     $self->{nc} == 0x003F) { # ?
4947     push @{$self->{ct}->{content}}, chr $self->{nc};
4948     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4949     !!!next-input-character;
4950     redo A;
4951     } elsif ($self->{nc} == 0x007C or # |
4952     $self->{nc} == 0x002C) { # ,
4953     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4954     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4955     !!!next-input-character;
4956     redo A;
4957     } elsif ($self->{nc} == 0x0029) { # )
4958     $self->{group_depth}--;
4959     push @{$self->{ct}->{content}}, chr $self->{nc};
4960     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4961     !!!next-input-character;
4962     redo A;
4963     } elsif ($self->{nc} == 0x003E) { # >
4964     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4965     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4966     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4967     !!!next-input-character;
4968     !!!emit ($self->{ct}); # ELEMENT
4969     redo A;
4970     } elsif ($self->{nc} == -1) {
4971     !!!parse-error (type => 'unclosed md'); ## TODO: type
4972     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4973     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4974     !!!next-input-character;
4975     !!!emit ($self->{ct}); # ELEMENT
4976     redo A;
4977     } else {
4978     $self->{ct}->{content}->[-1] .= chr $self->{nc};
4979     ## Stay in the state.
4980     !!!next-input-character;
4981     redo A;
4982     }
4983     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4984     if ($is_space->{$self->{nc}}) {
4985     ## Stay in the state.
4986     !!!next-input-character;
4987     redo A;
4988     } elsif ($self->{nc} == 0x007C or # |
4989     $self->{nc} == 0x002C) { # ,
4990     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4991     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4992     !!!next-input-character;
4993     redo A;
4994     } elsif ($self->{nc} == 0x0029) { # )
4995     $self->{group_depth}--;
4996     push @{$self->{ct}->{content}}, chr $self->{nc};
4997     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4998     !!!next-input-character;
4999     redo A;
5000     } elsif ($self->{nc} == 0x003E) { # >
5001     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5002     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5003     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5004     !!!next-input-character;
5005     !!!emit ($self->{ct}); # ELEMENT
5006     redo A;
5007     } elsif ($self->{nc} == -1) {
5008     !!!parse-error (type => 'unclosed md'); ## TODO: type
5009     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5010     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5011     !!!next-input-character;
5012     !!!emit ($self->{ct}); # ELEMENT
5013     redo A;
5014     } else {
5015     !!!parse-error (type => 'after element name'); ## TODO: type
5016     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5017     $self->{state} = BOGUS_MD_STATE;
5018     !!!next-input-character;
5019     redo A;
5020     }
5021     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5022     if ($is_space->{$self->{nc}}) {
5023     if ($self->{group_depth}) {
5024     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5025     } else {
5026     $self->{state} = AFTER_MD_DEF_STATE;
5027     }
5028     !!!next-input-character;
5029     redo A;
5030     } elsif ($self->{nc} == 0x002A or # *
5031     $self->{nc} == 0x002B or # +
5032     $self->{nc} == 0x003F) { # ?
5033     push @{$self->{ct}->{content}}, chr $self->{nc};
5034     if ($self->{group_depth}) {
5035     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5036     } else {
5037     $self->{state} = AFTER_MD_DEF_STATE;
5038     }
5039     !!!next-input-character;
5040     redo A;
5041     } elsif ($self->{nc} == 0x0029) { # )
5042     if ($self->{group_depth}) {
5043     $self->{group_depth}--;
5044     push @{$self->{ct}->{content}}, chr $self->{nc};
5045     ## Stay in the state.
5046     !!!next-input-character;
5047     redo A;
5048     } else {
5049     !!!parse-error (type => 'string after md def'); ## TODO: type
5050     $self->{state} = BOGUS_MD_STATE;
5051     ## Reconsume.
5052     redo A;
5053     }
5054     } elsif ($self->{nc} == 0x003E) { # >
5055     if ($self->{group_depth}) {
5056     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5057     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5058     }
5059     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5060     !!!next-input-character;
5061     !!!emit ($self->{ct}); # ELEMENT
5062     redo A;
5063     } elsif ($self->{nc} == -1) {
5064     !!!parse-error (type => 'unclosed md'); ## TODO: type
5065     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5066     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5067     !!!next-input-character;
5068     !!!emit ($self->{ct}); # ELEMENT
5069     redo A;
5070     } else {
5071     if ($self->{group_depth}) {
5072     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5073     } else {
5074     !!!parse-error (type => 'string after md def'); ## TODO: type
5075     $self->{state} = BOGUS_MD_STATE;
5076     }
5077     ## Reconsume.
5078     redo A;
5079     }
5080     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5081 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5082     ## Stay in the state.
5083     !!!next-input-character;
5084     redo A;
5085     } elsif ($self->{nc} == 0x003E) { # >
5086     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5087     !!!next-input-character;
5088 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5089 wakaba 1.18 redo A;
5090     } elsif ($self->{nc} == -1) {
5091     !!!parse-error (type => 'unclosed md'); ## TODO: type
5092     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5093     !!!next-input-character;
5094 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5095 wakaba 1.18 redo A;
5096     } else {
5097 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5098 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5099     ## Reconsume.
5100     redo A;
5101     }
5102 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5103     if ($self->{nc} == 0x003E) { # >
5104     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5105     !!!next-input-character;
5106     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5107     redo A;
5108     } elsif ($self->{nc} == -1) {
5109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5110     ## Reconsume.
5111     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5112     redo A;
5113     } else {
5114     ## Stay in the state.
5115     !!!next-input-character;
5116     redo A;
5117     }
5118 wakaba 1.1 } else {
5119     die "$0: $self->{state}: Unknown state";
5120     }
5121     } # A
5122    
5123     die "$0: _get_next_token: unexpected case";
5124     } # _get_next_token
5125    
5126     1;
5127 wakaba 1.27 ## $Date: 2009/07/02 21:42:43 $
5128 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24