/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.25 - (hide annotations) (download) (as text)
Sun Oct 19 15:17:01 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.24: +25 -7 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	19 Oct 2008 15:16:55 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat, attrs-1.dat: Normalization tests added.  Test
	results updated.

	* charrefs-1.dat: Character reference parse error/mapping tests
	added.

	* attlists-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 15:13:57 -0000
	* Tokenizer.pm.src: Normalize white space characters in attribute
	value literals in XML documents.  Don't apply character reference
	mapping table for non-NULL non-surrogate code points.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.25 our $VERSION=do{my @r=(q$Revision: 1.24 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82     ## Each content model below is a bitwise OR of the CM_* flags above.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & and limited < markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & and any < markup
87    
88     ## Tokenizer states
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205    
206     my $charref_map = { # code point => replacement code point
207     0x0D => 0x000A, # CARRIAGE RETURN -> LINE FEED
208     0x80 => 0x20AC, # 0x80..0x9F: replaced by the Windows-1252 character
209     0x81 => 0xFFFD, # for that byte, or by U+FFFD where Windows-1252
210     0x82 => 0x201A, # leaves the byte unassigned
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
241     $charref_map->{$_} = 0xFFFD # every code point listed below maps to U+FFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run through U+FDEF; this range stops at U+FDDF)
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine described in the spec.
251    
252     sub _initialize_tokenizer ($) {
253     my $self = shift;
254     ## Resets the tokenizer-related fields of |$self| to their initial values.
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST (initial tokenizer state)
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing}; # no self-closing flag pending
272     $self->{char_buffer} = ''; # NOTE(review): presumably the input character buffer; its use is outside this excerpt
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character (-1 is tested as end of input in |_get_next_token|)
275     #$self->{next_nc}
276     !!!next-input-character; # macro expanded elsewhere; presumably loads the first character into |$self->{nc}| -- confirm
277     $self->{token} = []; # pending tokens; |_get_next_token| returns these first
278     # $self->{escape} is not reset here
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## A token's |->{self_closing}| is used to save the value of
302     ## |$self->{self_closing}| while the token is pushed back to |$self->{token}|.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
315     my $is_space = { # code point => true for characters tested as white space by the tokenizer states
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR) -- NOTE(review): left out, presumably because CR is normalized before tokenization; confirm
321     0x0020 => 1, # SPACE (SP)
322     }; # $is_space
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec does not explicitly
736     ## state that the next input character is to be reconsumed,
737     ## it will be included in the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951     0x003D => 1, # =
952     }->{$self->{nc}}) {
953     !!!cp (55);
954 wakaba 1.11 ## XML5: Not a parse error.
955 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
956     } else {
957     !!!cp (56);
958 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
959 wakaba 1.1 }
960     $self->{ca}
961     = {name => chr ($self->{nc}),
962     value => '',
963     line => $self->{line}, column => $self->{column}};
964     $self->{state} = ATTRIBUTE_NAME_STATE;
965     !!!next-input-character;
966     redo A;
967     }
968     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 wakaba 1.11 ## XML5: "Tag attribute name state".
970    
971 wakaba 1.1 my $before_leave = sub {
972     if (exists $self->{ct}->{attributes} # start tag or end tag
973     ->{$self->{ca}->{name}}) { # MUST
974     !!!cp (57);
975     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976     ## Discard $self->{ca} # MUST
977     } else {
978     !!!cp (58);
979     $self->{ct}->{attributes}->{$self->{ca}->{name}}
980     = $self->{ca};
981 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 wakaba 1.1 }
983     }; # $before_leave
984    
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (59);
987     $before_leave->();
988     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003D) { # =
992     !!!cp (60);
993     $before_leave->();
994     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003E) { # >
998 wakaba 1.11 if ($self->{is_xml}) {
999     !!!cp (60.1);
1000     ## XML5: Not a parse error.
1001     !!!parse-error (type => 'no attr value'); ## TODO: type
1002     } else {
1003     !!!cp (60.2);
1004     }
1005    
1006 wakaba 1.1 $before_leave->();
1007     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008     !!!cp (61);
1009     $self->{last_stag_name} = $self->{ct}->{tag_name};
1010     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011     !!!cp (62);
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013     if ($self->{ct}->{attributes}) {
1014     !!!parse-error (type => 'end tag attribute');
1015     }
1016     } else {
1017     die "$0: $self->{ct}->{type}: Unknown token type";
1018     }
1019     $self->{state} = DATA_STATE;
1020 wakaba 1.5 $self->{s_kwd} = '';
1021 wakaba 1.1 !!!next-input-character;
1022    
1023     !!!emit ($self->{ct}); # start tag or end tag
1024    
1025     redo A;
1026     } elsif (0x0041 <= $self->{nc} and
1027     $self->{nc} <= 0x005A) { # A..Z
1028     !!!cp (63);
1029 wakaba 1.4 $self->{ca}->{name}
1030     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 wakaba 1.1 ## Stay in the state
1032     !!!next-input-character;
1033     redo A;
1034     } elsif ($self->{nc} == 0x002F) { # /
1035 wakaba 1.11 if ($self->{is_xml}) {
1036     !!!cp (64);
1037     ## XML5: Not a parse error.
1038     !!!parse-error (type => 'no attr value'); ## TODO: type
1039     } else {
1040     !!!cp (64.1);
1041     }
1042    
1043 wakaba 1.1 $before_leave->();
1044     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045     !!!next-input-character;
1046     redo A;
1047     } elsif ($self->{nc} == -1) {
1048     !!!parse-error (type => 'unclosed tag');
1049     $before_leave->();
1050     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051     !!!cp (66);
1052     $self->{last_stag_name} = $self->{ct}->{tag_name};
1053     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055     if ($self->{ct}->{attributes}) {
1056     !!!cp (67);
1057     !!!parse-error (type => 'end tag attribute');
1058     } else {
1059     ## NOTE: This state should never be reached.
1060     !!!cp (68);
1061     }
1062     } else {
1063     die "$0: $self->{ct}->{type}: Unknown token type";
1064     }
1065     $self->{state} = DATA_STATE;
1066 wakaba 1.5 $self->{s_kwd} = '';
1067 wakaba 1.1 # reconsume
1068    
1069     !!!emit ($self->{ct}); # start tag or end tag
1070    
1071     redo A;
1072     } else {
1073     if ($self->{nc} == 0x0022 or # "
1074     $self->{nc} == 0x0027) { # '
1075     !!!cp (69);
1076 wakaba 1.11 ## XML5: Not a parse error.
1077 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1078     } else {
1079     !!!cp (70);
1080     }
1081     $self->{ca}->{name} .= chr ($self->{nc});
1082     ## Stay in the state
1083     !!!next-input-character;
1084     redo A;
1085     }
1086     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 wakaba 1.11 ## XML5: "Tag attribute name after state".
1088    
1089 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1090     !!!cp (71);
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003D) { # =
1095     !!!cp (72);
1096     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003E) { # >
1100 wakaba 1.11 if ($self->{is_xml}) {
1101     !!!cp (72.1);
1102     ## XML5: Not a parse error.
1103     !!!parse-error (type => 'no attr value'); ## TODO: type
1104     } else {
1105     !!!cp (72.2);
1106     }
1107    
1108 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109     !!!cp (73);
1110     $self->{last_stag_name} = $self->{ct}->{tag_name};
1111     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     !!!cp (74);
1115     !!!parse-error (type => 'end tag attribute');
1116     } else {
1117     ## NOTE: This state should never be reached.
1118     !!!cp (75);
1119     }
1120     } else {
1121     die "$0: $self->{ct}->{type}: Unknown token type";
1122     }
1123     $self->{state} = DATA_STATE;
1124 wakaba 1.5 $self->{s_kwd} = '';
1125 wakaba 1.1 !!!next-input-character;
1126    
1127     !!!emit ($self->{ct}); # start tag or end tag
1128    
1129     redo A;
1130     } elsif (0x0041 <= $self->{nc} and
1131     $self->{nc} <= 0x005A) { # A..Z
1132     !!!cp (76);
1133     $self->{ca}
1134 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 wakaba 1.1 value => '',
1136     line => $self->{line}, column => $self->{column}};
1137     $self->{state} = ATTRIBUTE_NAME_STATE;
1138     !!!next-input-character;
1139     redo A;
1140     } elsif ($self->{nc} == 0x002F) { # /
1141 wakaba 1.11 if ($self->{is_xml}) {
1142     !!!cp (77);
1143     ## XML5: Not a parse error.
1144     !!!parse-error (type => 'no attr value'); ## TODO: type
1145     } else {
1146     !!!cp (77.1);
1147     }
1148    
1149 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150     !!!next-input-character;
1151     redo A;
1152     } elsif ($self->{nc} == -1) {
1153     !!!parse-error (type => 'unclosed tag');
1154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155     !!!cp (79);
1156     $self->{last_stag_name} = $self->{ct}->{tag_name};
1157     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159     if ($self->{ct}->{attributes}) {
1160     !!!cp (80);
1161     !!!parse-error (type => 'end tag attribute');
1162     } else {
1163     ## NOTE: This state should never be reached.
1164     !!!cp (81);
1165     }
1166     } else {
1167     die "$0: $self->{ct}->{type}: Unknown token type";
1168     }
1169 wakaba 1.5 $self->{s_kwd} = '';
1170 wakaba 1.1 $self->{state} = DATA_STATE;
1171     # reconsume
1172    
1173     !!!emit ($self->{ct}); # start tag or end tag
1174    
1175     redo A;
1176     } else {
1177 wakaba 1.11 if ($self->{is_xml}) {
1178     !!!cp (78.1);
1179     ## XML5: Not a parse error.
1180     !!!parse-error (type => 'no attr value'); ## TODO: type
1181     } else {
1182     !!!cp (78.2);
1183     }
1184    
1185 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1186     $self->{nc} == 0x0027) { # '
1187     !!!cp (78);
1188 wakaba 1.11 ## XML5: Not a parse error.
1189 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1190     } else {
1191     !!!cp (82);
1192     }
1193     $self->{ca}
1194     = {name => chr ($self->{nc}),
1195     value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198     !!!next-input-character;
1199     redo A;
1200     }
1201     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 wakaba 1.11 ## XML5: "Tag attribute value before state".
1203    
1204 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1205     !!!cp (83);
1206     ## Stay in the state
1207     !!!next-input-character;
1208     redo A;
1209     } elsif ($self->{nc} == 0x0022) { # "
1210     !!!cp (84);
1211     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x0026) { # &
1215     !!!cp (85);
1216     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217     ## reconsume
1218     redo A;
1219     } elsif ($self->{nc} == 0x0027) { # '
1220     !!!cp (86);
1221     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222     !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{nc} == 0x003E) { # >
1225     !!!parse-error (type => 'empty unquoted attribute value');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227     !!!cp (87);
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232     !!!cp (88);
1233     !!!parse-error (type => 'end tag attribute');
1234     } else {
1235     ## NOTE: This state should never be reached.
1236     !!!cp (89);
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 !!!next-input-character;
1244    
1245     !!!emit ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } elsif ($self->{nc} == -1) {
1249     !!!parse-error (type => 'unclosed tag');
1250     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251     !!!cp (90);
1252     $self->{last_stag_name} = $self->{ct}->{tag_name};
1253     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255     if ($self->{ct}->{attributes}) {
1256     !!!cp (91);
1257     !!!parse-error (type => 'end tag attribute');
1258     } else {
1259     ## NOTE: This state should never be reached.
1260     !!!cp (92);
1261     }
1262     } else {
1263     die "$0: $self->{ct}->{type}: Unknown token type";
1264     }
1265     $self->{state} = DATA_STATE;
1266 wakaba 1.5 $self->{s_kwd} = '';
1267 wakaba 1.1 ## reconsume
1268    
1269     !!!emit ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } else {
1273     if ($self->{nc} == 0x003D) { # =
1274     !!!cp (93);
1275 wakaba 1.11 ## XML5: Not a parse error.
1276 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1277 wakaba 1.11 } elsif ($self->{is_xml}) {
1278     !!!cp (93.1);
1279     ## XML5: No parse error.
1280     !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 wakaba 1.1 } else {
1282     !!!cp (94);
1283     }
1284     $self->{ca}->{value} .= chr ($self->{nc});
1285     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286     !!!next-input-character;
1287     redo A;
1288     }
1289     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291     ## ATTLIST attribute value double quoted state".
1292 wakaba 1.11
1293 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1294 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295     !!!cp (95.1);
1296     ## XML5: "DOCTYPE ATTLIST name after state".
1297     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299     } else {
1300     !!!cp (95);
1301     ## XML5: "Tag attribute name before state".
1302     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     }
1304 wakaba 1.1 !!!next-input-character;
1305     redo A;
1306     } elsif ($self->{nc} == 0x0026) { # &
1307     !!!cp (96);
1308 wakaba 1.11 ## XML5: Not defined yet.
1309    
1310 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1311     ## "entity in attribute value state". In this implementation, the
1312     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313     ## implementation of the "consume a character reference" algorithm.
1314     $self->{prev_state} = $self->{state};
1315     $self->{entity_add} = 0x0022; # "
1316     $self->{state} = ENTITY_STATE;
1317     !!!next-input-character;
1318     redo A;
1319 wakaba 1.25 } elsif ($self->{is_xml} and
1320     $is_space->{$self->{nc}}) {
1321     !!!cp (97.1);
1322     $self->{ca}->{value} .= ' ';
1323     ## Stay in the state.
1324     !!!next-input-character;
1325     redo A;
1326 wakaba 1.1 } elsif ($self->{nc} == -1) {
1327     !!!parse-error (type => 'unclosed attribute value');
1328     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329     !!!cp (97);
1330     $self->{last_stag_name} = $self->{ct}->{tag_name};
1331 wakaba 1.15
1332     $self->{state} = DATA_STATE;
1333     $self->{s_kwd} = '';
1334     ## reconsume
1335     !!!emit ($self->{ct}); # start tag
1336     redo A;
1337 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339     if ($self->{ct}->{attributes}) {
1340     !!!cp (98);
1341     !!!parse-error (type => 'end tag attribute');
1342     } else {
1343     ## NOTE: This state should never be reached.
1344     !!!cp (99);
1345     }
1346 wakaba 1.15
1347     $self->{state} = DATA_STATE;
1348     $self->{s_kwd} = '';
1349     ## reconsume
1350     !!!emit ($self->{ct}); # end tag
1351     redo A;
1352     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353     ## XML5: No parse error above; not defined yet.
1354     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356     ## Reconsume.
1357     !!!emit ($self->{ct}); # ATTLIST
1358     redo A;
1359 wakaba 1.1 } else {
1360     die "$0: $self->{ct}->{type}: Unknown token type";
1361     }
1362     } else {
1363 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1364 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365     !!!cp (100);
1366     ## XML5: Not a parse error.
1367     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368     } else {
1369     !!!cp (100.1);
1370     }
1371 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1372     $self->{read_until}->($self->{ca}->{value},
1373 wakaba 1.25 qq["&<\x09\x0C\x20],
1374 wakaba 1.1 length $self->{ca}->{value});
1375    
1376     ## Stay in the state
1377     !!!next-input-character;
1378     redo A;
1379     }
1380     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1381 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382     ## ATTLIST attribute value single quoted state".
1383 wakaba 1.11
1384 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1385 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386     !!!cp (101.1);
1387     ## XML5: "DOCTYPE ATTLIST name after state".
1388     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390     } else {
1391     !!!cp (101);
1392     ## XML5: "Before attribute name state" (sic).
1393     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394     }
1395 wakaba 1.1 !!!next-input-character;
1396     redo A;
1397     } elsif ($self->{nc} == 0x0026) { # &
1398     !!!cp (102);
1399 wakaba 1.11 ## XML5: Not defined yet.
1400    
1401 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1402     ## "entity in attribute value state". In this implementation, the
1403     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404     ## implementation of the "consume a character reference" algorithm.
1405     $self->{entity_add} = 0x0027; # '
1406     $self->{prev_state} = $self->{state};
1407     $self->{state} = ENTITY_STATE;
1408     !!!next-input-character;
1409     redo A;
1410 wakaba 1.25 } elsif ($self->{is_xml} and
1411     $is_space->{$self->{nc}}) {
1412     !!!cp (103.1);
1413     $self->{ca}->{value} .= ' ';
1414     ## Stay in the state.
1415     !!!next-input-character;
1416     redo A;
1417 wakaba 1.1 } elsif ($self->{nc} == -1) {
1418     !!!parse-error (type => 'unclosed attribute value');
1419     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420     !!!cp (103);
1421     $self->{last_stag_name} = $self->{ct}->{tag_name};
1422 wakaba 1.15
1423     $self->{state} = DATA_STATE;
1424     $self->{s_kwd} = '';
1425     ## reconsume
1426     !!!emit ($self->{ct}); # start tag
1427     redo A;
1428 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430     if ($self->{ct}->{attributes}) {
1431     !!!cp (104);
1432     !!!parse-error (type => 'end tag attribute');
1433     } else {
1434     ## NOTE: This state should never be reached.
1435     !!!cp (105);
1436     }
1437 wakaba 1.15
1438     $self->{state} = DATA_STATE;
1439     $self->{s_kwd} = '';
1440     ## reconsume
1441     !!!emit ($self->{ct}); # end tag
1442     redo A;
1443     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444     ## XML5: No parse error above; not defined yet.
1445     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447     ## Reconsume.
1448     !!!emit ($self->{ct}); # ATTLIST
1449     redo A;
1450 wakaba 1.1 } else {
1451     die "$0: $self->{ct}->{type}: Unknown token type";
1452     }
1453     } else {
1454 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1455 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456     !!!cp (106);
1457     ## XML5: Not a parse error.
1458     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459     } else {
1460     !!!cp (106.1);
1461     }
1462 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1463     $self->{read_until}->($self->{ca}->{value},
1464 wakaba 1.25 qq['&<\x09\x0C\x20],
1465 wakaba 1.1 length $self->{ca}->{value});
1466    
1467     ## Stay in the state
1468     !!!next-input-character;
1469     redo A;
1470     }
1471     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1472 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1473    
1474 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1475 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476     !!!cp (107.1);
1477     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479     } else {
1480     !!!cp (107);
1481     ## XML5: "Tag attribute name before state".
1482     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483     }
1484 wakaba 1.1 !!!next-input-character;
1485     redo A;
1486     } elsif ($self->{nc} == 0x0026) { # &
1487     !!!cp (108);
1488 wakaba 1.11
1489     ## XML5: Not defined yet.
1490    
1491 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1492     ## "entity in attribute value state". In this implementation, the
1493     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494     ## implementation of the "consume a character reference" algorithm.
1495     $self->{entity_add} = -1;
1496     $self->{prev_state} = $self->{state};
1497     $self->{state} = ENTITY_STATE;
1498     !!!next-input-character;
1499     redo A;
1500     } elsif ($self->{nc} == 0x003E) { # >
1501     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502     !!!cp (109);
1503     $self->{last_stag_name} = $self->{ct}->{tag_name};
1504 wakaba 1.15
1505     $self->{state} = DATA_STATE;
1506     $self->{s_kwd} = '';
1507     !!!next-input-character;
1508     !!!emit ($self->{ct}); # start tag
1509     redo A;
1510 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512     if ($self->{ct}->{attributes}) {
1513     !!!cp (110);
1514     !!!parse-error (type => 'end tag attribute');
1515     } else {
1516     ## NOTE: This state should never be reached.
1517     !!!cp (111);
1518     }
1519 wakaba 1.15
1520     $self->{state} = DATA_STATE;
1521     $self->{s_kwd} = '';
1522     !!!next-input-character;
1523     !!!emit ($self->{ct}); # end tag
1524     redo A;
1525     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528     !!!next-input-character;
1529     !!!emit ($self->{ct}); # ATTLIST
1530     redo A;
1531 wakaba 1.1 } else {
1532     die "$0: $self->{ct}->{type}: Unknown token type";
1533     }
1534     } elsif ($self->{nc} == -1) {
1535     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536     !!!cp (112);
1537 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1538 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539 wakaba 1.15
1540     $self->{state} = DATA_STATE;
1541     $self->{s_kwd} = '';
1542     ## reconsume
1543     !!!emit ($self->{ct}); # start tag
1544     redo A;
1545 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1547 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548     if ($self->{ct}->{attributes}) {
1549     !!!cp (113);
1550     !!!parse-error (type => 'end tag attribute');
1551     } else {
1552     ## NOTE: This state should never be reached.
1553     !!!cp (114);
1554     }
1555 wakaba 1.15
1556     $self->{state} = DATA_STATE;
1557     $self->{s_kwd} = '';
1558     ## reconsume
1559     !!!emit ($self->{ct}); # end tag
1560     redo A;
1561     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562     !!!parse-error (type => 'unclosed md'); ## TODO: type
1563     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565     ## Reconsume.
1566     !!!emit ($self->{ct}); # ATTLIST
1567     redo A;
1568 wakaba 1.1 } else {
1569     die "$0: $self->{ct}->{type}: Unknown token type";
1570     }
1571     } else {
1572     if ({
1573     0x0022 => 1, # "
1574     0x0027 => 1, # '
1575     0x003D => 1, # =
1576     }->{$self->{nc}}) {
1577     !!!cp (115);
1578 wakaba 1.11 ## XML5: Not a parse error.
1579 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1580     } else {
1581     !!!cp (116);
1582     }
1583     $self->{ca}->{value} .= chr ($self->{nc});
1584     $self->{read_until}->($self->{ca}->{value},
1585 wakaba 1.25 qq["'=& \x09\x0C>],
1586 wakaba 1.1 length $self->{ca}->{value});
1587    
1588     ## Stay in the state
1589     !!!next-input-character;
1590     redo A;
1591     }
1592     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1593     if ($is_space->{$self->{nc}}) {
1594     !!!cp (118);
1595     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1596     !!!next-input-character;
1597     redo A;
1598     } elsif ($self->{nc} == 0x003E) { # >
1599     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1600     !!!cp (119);
1601     $self->{last_stag_name} = $self->{ct}->{tag_name};
1602     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1603     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1604     if ($self->{ct}->{attributes}) {
1605     !!!cp (120);
1606     !!!parse-error (type => 'end tag attribute');
1607     } else {
1608     ## NOTE: This state should never be reached.
1609     !!!cp (121);
1610     }
1611     } else {
1612     die "$0: $self->{ct}->{type}: Unknown token type";
1613     }
1614     $self->{state} = DATA_STATE;
1615 wakaba 1.5 $self->{s_kwd} = '';
1616 wakaba 1.1 !!!next-input-character;
1617    
1618     !!!emit ($self->{ct}); # start tag or end tag
1619    
1620     redo A;
1621     } elsif ($self->{nc} == 0x002F) { # /
1622     !!!cp (122);
1623     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1624     !!!next-input-character;
1625     redo A;
1626     } elsif ($self->{nc} == -1) {
1627     !!!parse-error (type => 'unclosed tag');
1628     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1629     !!!cp (122.3);
1630     $self->{last_stag_name} = $self->{ct}->{tag_name};
1631     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1632     if ($self->{ct}->{attributes}) {
1633     !!!cp (122.1);
1634     !!!parse-error (type => 'end tag attribute');
1635     } else {
1636     ## NOTE: This state should never be reached.
1637     !!!cp (122.2);
1638     }
1639     } else {
1640     die "$0: $self->{ct}->{type}: Unknown token type";
1641     }
1642     $self->{state} = DATA_STATE;
1643 wakaba 1.5 $self->{s_kwd} = '';
1644 wakaba 1.1 ## Reconsume.
1645     !!!emit ($self->{ct}); # start tag or end tag
1646     redo A;
1647     } else {
1648     !!!cp ('124.1');
1649     !!!parse-error (type => 'no space between attributes');
1650     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1651     ## reconsume
1652     redo A;
1653     }
1654     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1655 wakaba 1.11 ## XML5: "Empty tag state".
1656    
1657 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1658     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1659     !!!cp ('124.2');
1660     !!!parse-error (type => 'nestc', token => $self->{ct});
1661     ## TODO: Different type than slash in start tag
1662     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1663     if ($self->{ct}->{attributes}) {
1664     !!!cp ('124.4');
1665     !!!parse-error (type => 'end tag attribute');
1666     } else {
1667     !!!cp ('124.5');
1668     }
1669     ## TODO: Test |<title></title/>|
1670     } else {
1671     !!!cp ('124.3');
1672     $self->{self_closing} = 1;
1673     }
1674    
1675     $self->{state} = DATA_STATE;
1676 wakaba 1.5 $self->{s_kwd} = '';
1677 wakaba 1.1 !!!next-input-character;
1678    
1679     !!!emit ($self->{ct}); # start tag or end tag
1680    
1681     redo A;
1682     } elsif ($self->{nc} == -1) {
1683     !!!parse-error (type => 'unclosed tag');
1684     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1685     !!!cp (124.7);
1686     $self->{last_stag_name} = $self->{ct}->{tag_name};
1687     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1688     if ($self->{ct}->{attributes}) {
1689     !!!cp (124.5);
1690     !!!parse-error (type => 'end tag attribute');
1691     } else {
1692     ## NOTE: This state should never be reached.
1693     !!!cp (124.6);
1694     }
1695     } else {
1696     die "$0: $self->{ct}->{type}: Unknown token type";
1697     }
1698 wakaba 1.11 ## XML5: "Tag attribute name before state".
1699 wakaba 1.1 $self->{state} = DATA_STATE;
1700 wakaba 1.5 $self->{s_kwd} = '';
1701 wakaba 1.1 ## Reconsume.
1702     !!!emit ($self->{ct}); # start tag or end tag
1703     redo A;
1704     } else {
1705     !!!cp ('124.4');
1706     !!!parse-error (type => 'nestc');
1707     ## TODO: This error type is wrong.
1708     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1709     ## Reconsume.
1710     redo A;
1711     }
1712     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1713 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1714    
1715 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1716     ## consumes characters on a one-by-one basis.
1717    
1718     if ($self->{nc} == 0x003E) { # >
1719 wakaba 1.13 if ($self->{in_subset}) {
1720     !!!cp (123);
1721     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1722     } else {
1723     !!!cp (124);
1724     $self->{state} = DATA_STATE;
1725     $self->{s_kwd} = '';
1726     }
1727 wakaba 1.1 !!!next-input-character;
1728    
1729     !!!emit ($self->{ct}); # comment
1730     redo A;
1731     } elsif ($self->{nc} == -1) {
1732 wakaba 1.13 if ($self->{in_subset}) {
1733     !!!cp (125.1);
1734     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1735     } else {
1736     !!!cp (125);
1737     $self->{state} = DATA_STATE;
1738     $self->{s_kwd} = '';
1739     }
1740 wakaba 1.1 ## reconsume
1741    
1742     !!!emit ($self->{ct}); # comment
1743     redo A;
1744     } else {
1745     !!!cp (126);
1746     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1747     $self->{read_until}->($self->{ct}->{data},
1748     q[>],
1749     length $self->{ct}->{data});
1750    
1751     ## Stay in the state.
1752     !!!next-input-character;
1753     redo A;
1754     }
1755     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1756 wakaba 1.14 ## XML5: "Markup declaration state".
1757 wakaba 1.1
1758     if ($self->{nc} == 0x002D) { # -
1759     !!!cp (133);
1760     $self->{state} = MD_HYPHEN_STATE;
1761     !!!next-input-character;
1762     redo A;
1763     } elsif ($self->{nc} == 0x0044 or # D
1764     $self->{nc} == 0x0064) { # d
1765     ## ASCII case-insensitive.
1766     !!!cp (130);
1767     $self->{state} = MD_DOCTYPE_STATE;
1768 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1769 wakaba 1.1 !!!next-input-character;
1770     redo A;
1771 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1772     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1773     $self->{is_xml}) and
1774 wakaba 1.1 $self->{nc} == 0x005B) { # [
1775     !!!cp (135.4);
1776     $self->{state} = MD_CDATA_STATE;
1777 wakaba 1.12 $self->{kwd} = '[';
1778 wakaba 1.1 !!!next-input-character;
1779     redo A;
1780     } else {
1781     !!!cp (136);
1782     }
1783    
1784     !!!parse-error (type => 'bogus comment',
1785     line => $self->{line_prev},
1786     column => $self->{column_prev} - 1);
1787     ## Reconsume.
1788     $self->{state} = BOGUS_COMMENT_STATE;
1789     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1790     line => $self->{line_prev},
1791     column => $self->{column_prev} - 1,
1792     };
1793     redo A;
1794     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1795     if ($self->{nc} == 0x002D) { # -
1796     !!!cp (127);
1797     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1798     line => $self->{line_prev},
1799     column => $self->{column_prev} - 2,
1800     };
1801 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1802 wakaba 1.1 !!!next-input-character;
1803     redo A;
1804     } else {
1805     !!!cp (128);
1806     !!!parse-error (type => 'bogus comment',
1807     line => $self->{line_prev},
1808     column => $self->{column_prev} - 2);
1809     $self->{state} = BOGUS_COMMENT_STATE;
1810     ## Reconsume.
1811     $self->{ct} = {type => COMMENT_TOKEN,
1812     data => '-',
1813     line => $self->{line_prev},
1814     column => $self->{column_prev} - 2,
1815     };
1816     redo A;
1817     }
1818     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1819     ## ASCII case-insensitive.
1820     if ($self->{nc} == [
1821     undef,
1822     0x004F, # O
1823     0x0043, # C
1824     0x0054, # T
1825     0x0059, # Y
1826     0x0050, # P
1827 wakaba 1.12 ]->[length $self->{kwd}] or
1828 wakaba 1.1 $self->{nc} == [
1829     undef,
1830     0x006F, # o
1831     0x0063, # c
1832     0x0074, # t
1833     0x0079, # y
1834     0x0070, # p
1835 wakaba 1.12 ]->[length $self->{kwd}]) {
1836 wakaba 1.1 !!!cp (131);
1837     ## Stay in the state.
1838 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1839 wakaba 1.1 !!!next-input-character;
1840     redo A;
1841 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1842 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1843     $self->{nc} == 0x0065)) { # e
1844 wakaba 1.12 if ($self->{is_xml} and
1845     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1846 wakaba 1.10 !!!cp (129);
1847     ## XML5: case-sensitive.
1848     !!!parse-error (type => 'lowercase keyword', ## TODO
1849     text => 'DOCTYPE',
1850     line => $self->{line_prev},
1851     column => $self->{column_prev} - 5);
1852     } else {
1853     !!!cp (129.1);
1854     }
1855 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1856     $self->{ct} = {type => DOCTYPE_TOKEN,
1857     quirks => 1,
1858     line => $self->{line_prev},
1859     column => $self->{column_prev} - 7,
1860     };
1861     !!!next-input-character;
1862     redo A;
1863     } else {
1864     !!!cp (132);
1865     !!!parse-error (type => 'bogus comment',
1866     line => $self->{line_prev},
1867 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1868 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1869     ## Reconsume.
1870     $self->{ct} = {type => COMMENT_TOKEN,
1871 wakaba 1.12 data => $self->{kwd},
1872 wakaba 1.1 line => $self->{line_prev},
1873 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1874 wakaba 1.1 };
1875     redo A;
1876     }
1877     } elsif ($self->{state} == MD_CDATA_STATE) {
1878     if ($self->{nc} == {
1879     '[' => 0x0043, # C
1880     '[C' => 0x0044, # D
1881     '[CD' => 0x0041, # A
1882     '[CDA' => 0x0054, # T
1883     '[CDAT' => 0x0041, # A
1884 wakaba 1.12 }->{$self->{kwd}}) {
1885 wakaba 1.1 !!!cp (135.1);
1886     ## Stay in the state.
1887 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1888 wakaba 1.1 !!!next-input-character;
1889     redo A;
1890 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1891 wakaba 1.1 $self->{nc} == 0x005B) { # [
1892 wakaba 1.6 if ($self->{is_xml} and
1893     not $self->{tainted} and
1894     @{$self->{open_elements} or []} == 0) {
1895 wakaba 1.8 !!!cp (135.2);
1896 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1897     line => $self->{line_prev},
1898     column => $self->{column_prev} - 7);
1899     $self->{tainted} = 1;
1900 wakaba 1.8 } else {
1901     !!!cp (135.21);
1902 wakaba 1.6 }
1903    
1904 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1905     data => '',
1906     line => $self->{line_prev},
1907     column => $self->{column_prev} - 7};
1908     $self->{state} = CDATA_SECTION_STATE;
1909     !!!next-input-character;
1910     redo A;
1911     } else {
1912     !!!cp (135.3);
1913     !!!parse-error (type => 'bogus comment',
1914     line => $self->{line_prev},
1915 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1916 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1917     ## Reconsume.
1918     $self->{ct} = {type => COMMENT_TOKEN,
1919 wakaba 1.12 data => $self->{kwd},
1920 wakaba 1.1 line => $self->{line_prev},
1921 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1922 wakaba 1.1 };
1923     redo A;
1924     }
1925     } elsif ($self->{state} == COMMENT_START_STATE) {
1926     if ($self->{nc} == 0x002D) { # -
1927     !!!cp (137);
1928     $self->{state} = COMMENT_START_DASH_STATE;
1929     !!!next-input-character;
1930     redo A;
1931     } elsif ($self->{nc} == 0x003E) { # >
1932     !!!parse-error (type => 'bogus comment');
1933 wakaba 1.13 if ($self->{in_subset}) {
1934     !!!cp (138.1);
1935     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1936     } else {
1937     !!!cp (138);
1938     $self->{state} = DATA_STATE;
1939     $self->{s_kwd} = '';
1940     }
1941 wakaba 1.1 !!!next-input-character;
1942    
1943     !!!emit ($self->{ct}); # comment
1944    
1945     redo A;
1946     } elsif ($self->{nc} == -1) {
1947     !!!parse-error (type => 'unclosed comment');
1948 wakaba 1.13 if ($self->{in_subset}) {
1949     !!!cp (139.1);
1950     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1951     } else {
1952     !!!cp (139);
1953     $self->{state} = DATA_STATE;
1954     $self->{s_kwd} = '';
1955     }
1956 wakaba 1.1 ## reconsume
1957    
1958     !!!emit ($self->{ct}); # comment
1959    
1960     redo A;
1961     } else {
1962     !!!cp (140);
1963     $self->{ct}->{data} # comment
1964     .= chr ($self->{nc});
1965     $self->{state} = COMMENT_STATE;
1966     !!!next-input-character;
1967     redo A;
1968     }
1969     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1970     if ($self->{nc} == 0x002D) { # -
1971     !!!cp (141);
1972     $self->{state} = COMMENT_END_STATE;
1973     !!!next-input-character;
1974     redo A;
1975     } elsif ($self->{nc} == 0x003E) { # >
1976     !!!parse-error (type => 'bogus comment');
1977 wakaba 1.13 if ($self->{in_subset}) {
1978     !!!cp (142.1);
1979     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1980     } else {
1981     !!!cp (142);
1982     $self->{state} = DATA_STATE;
1983     $self->{s_kwd} = '';
1984     }
1985 wakaba 1.1 !!!next-input-character;
1986    
1987     !!!emit ($self->{ct}); # comment
1988    
1989     redo A;
1990     } elsif ($self->{nc} == -1) {
1991     !!!parse-error (type => 'unclosed comment');
1992 wakaba 1.13 if ($self->{in_subset}) {
1993     !!!cp (143.1);
1994     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1995     } else {
1996     !!!cp (143);
1997     $self->{state} = DATA_STATE;
1998     $self->{s_kwd} = '';
1999     }
2000 wakaba 1.1 ## reconsume
2001    
2002     !!!emit ($self->{ct}); # comment
2003    
2004     redo A;
2005     } else {
2006     !!!cp (144);
2007     $self->{ct}->{data} # comment
2008     .= '-' . chr ($self->{nc});
2009     $self->{state} = COMMENT_STATE;
2010     !!!next-input-character;
2011     redo A;
2012     }
2013     } elsif ($self->{state} == COMMENT_STATE) {
2014 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2015    
2016 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2017     !!!cp (145);
2018     $self->{state} = COMMENT_END_DASH_STATE;
2019     !!!next-input-character;
2020     redo A;
2021     } elsif ($self->{nc} == -1) {
2022     !!!parse-error (type => 'unclosed comment');
2023 wakaba 1.13 if ($self->{in_subset}) {
2024     !!!cp (146.1);
2025     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2026     } else {
2027     !!!cp (146);
2028     $self->{state} = DATA_STATE;
2029     $self->{s_kwd} = '';
2030     }
2031 wakaba 1.1 ## reconsume
2032    
2033     !!!emit ($self->{ct}); # comment
2034    
2035     redo A;
2036     } else {
2037     !!!cp (147);
2038     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2039     $self->{read_until}->($self->{ct}->{data},
2040     q[-],
2041     length $self->{ct}->{data});
2042    
2043     ## Stay in the state
2044     !!!next-input-character;
2045     redo A;
2046     }
2047     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2048 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2049 wakaba 1.10
2050 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2051     !!!cp (148);
2052     $self->{state} = COMMENT_END_STATE;
2053     !!!next-input-character;
2054     redo A;
2055     } elsif ($self->{nc} == -1) {
2056     !!!parse-error (type => 'unclosed comment');
2057 wakaba 1.13 if ($self->{in_subset}) {
2058     !!!cp (149.1);
2059     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2060     } else {
2061     !!!cp (149);
2062     $self->{state} = DATA_STATE;
2063     $self->{s_kwd} = '';
2064     }
2065 wakaba 1.1 ## reconsume
2066    
2067     !!!emit ($self->{ct}); # comment
2068    
2069     redo A;
2070     } else {
2071     !!!cp (150);
2072     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2073     $self->{state} = COMMENT_STATE;
2074     !!!next-input-character;
2075     redo A;
2076     }
2077     } elsif ($self->{state} == COMMENT_END_STATE) {
2078 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2079    
2080 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2081 wakaba 1.13 if ($self->{in_subset}) {
2082     !!!cp (151.1);
2083     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2084     } else {
2085     !!!cp (151);
2086     $self->{state} = DATA_STATE;
2087     $self->{s_kwd} = '';
2088     }
2089 wakaba 1.1 !!!next-input-character;
2090    
2091     !!!emit ($self->{ct}); # comment
2092    
2093     redo A;
2094     } elsif ($self->{nc} == 0x002D) { # -
2095     !!!cp (152);
2096 wakaba 1.10 ## XML5: Not a parse error.
2097 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2098     line => $self->{line_prev},
2099     column => $self->{column_prev});
2100     $self->{ct}->{data} .= '-'; # comment
2101     ## Stay in the state
2102     !!!next-input-character;
2103     redo A;
2104     } elsif ($self->{nc} == -1) {
2105     !!!parse-error (type => 'unclosed comment');
2106 wakaba 1.13 if ($self->{in_subset}) {
2107     !!!cp (153.1);
2108     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2109     } else {
2110     !!!cp (153);
2111     $self->{state} = DATA_STATE;
2112     $self->{s_kwd} = '';
2113     }
2114 wakaba 1.1 ## reconsume
2115    
2116     !!!emit ($self->{ct}); # comment
2117    
2118     redo A;
2119     } else {
2120     !!!cp (154);
2121 wakaba 1.10 ## XML5: Not a parse error.
2122 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2123     line => $self->{line_prev},
2124     column => $self->{column_prev});
2125     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2126     $self->{state} = COMMENT_STATE;
2127     !!!next-input-character;
2128     redo A;
2129     }
2130     } elsif ($self->{state} == DOCTYPE_STATE) {
2131     if ($is_space->{$self->{nc}}) {
2132     !!!cp (155);
2133     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2134     !!!next-input-character;
2135     redo A;
2136     } else {
2137     !!!cp (156);
2138 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2139 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2140     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2141     ## reconsume
2142     redo A;
2143     }
2144     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2145 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2146    
2147 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2148     !!!cp (157);
2149     ## Stay in the state
2150     !!!next-input-character;
2151     redo A;
2152     } elsif ($self->{nc} == 0x003E) { # >
2153     !!!cp (158);
2154 wakaba 1.12 ## XML5: No parse error.
2155 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2156     $self->{state} = DATA_STATE;
2157 wakaba 1.5 $self->{s_kwd} = '';
2158 wakaba 1.1 !!!next-input-character;
2159    
2160     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2161    
2162     redo A;
2163     } elsif ($self->{nc} == -1) {
2164     !!!cp (159);
2165     !!!parse-error (type => 'no DOCTYPE name');
2166     $self->{state} = DATA_STATE;
2167 wakaba 1.5 $self->{s_kwd} = '';
2168 wakaba 1.1 ## reconsume
2169    
2170     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2171    
2172     redo A;
2173 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2174     !!!cp (159.1);
2175     !!!parse-error (type => 'no DOCTYPE name');
2176     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2177 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2178     $self->{in_subset} = 1;
2179 wakaba 1.12 !!!next-input-character;
2180 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2181 wakaba 1.12 redo A;
2182 wakaba 1.1 } else {
2183     !!!cp (160);
2184     $self->{ct}->{name} = chr $self->{nc};
2185     delete $self->{ct}->{quirks};
2186     $self->{state} = DOCTYPE_NAME_STATE;
2187     !!!next-input-character;
2188     redo A;
2189     }
2190     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2191 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2192    
2193     ## ISSUE: Redundant "First," in the spec.
2194    
2195 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2196     !!!cp (161);
2197     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2198     !!!next-input-character;
2199     redo A;
2200     } elsif ($self->{nc} == 0x003E) { # >
2201     !!!cp (162);
2202     $self->{state} = DATA_STATE;
2203 wakaba 1.5 $self->{s_kwd} = '';
2204 wakaba 1.1 !!!next-input-character;
2205    
2206     !!!emit ($self->{ct}); # DOCTYPE
2207    
2208     redo A;
2209     } elsif ($self->{nc} == -1) {
2210     !!!cp (163);
2211     !!!parse-error (type => 'unclosed DOCTYPE');
2212     $self->{state} = DATA_STATE;
2213 wakaba 1.5 $self->{s_kwd} = '';
2214 wakaba 1.1 ## reconsume
2215    
2216     $self->{ct}->{quirks} = 1;
2217     !!!emit ($self->{ct}); # DOCTYPE
2218    
2219     redo A;
2220 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2221     !!!cp (163.1);
2222     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2223 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2224     $self->{in_subset} = 1;
2225 wakaba 1.12 !!!next-input-character;
2226 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2227 wakaba 1.12 redo A;
2228 wakaba 1.1 } else {
2229     !!!cp (164);
2230     $self->{ct}->{name}
2231     .= chr ($self->{nc}); # DOCTYPE
2232     ## Stay in the state
2233     !!!next-input-character;
2234     redo A;
2235     }
2236     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2237 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2238     ## state", but implemented differently.
2239    
2240 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2241     !!!cp (165);
2242     ## Stay in the state
2243     !!!next-input-character;
2244     redo A;
2245     } elsif ($self->{nc} == 0x003E) { # >
2246 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2247     !!!cp (166);
2248     $self->{state} = DATA_STATE;
2249     $self->{s_kwd} = '';
2250     } else {
2251     !!!cp (166.1);
2252     !!!parse-error (type => 'no md def'); ## TODO: type
2253     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2254     }
2255    
2256 wakaba 1.1 !!!next-input-character;
2257 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2258 wakaba 1.1 redo A;
2259     } elsif ($self->{nc} == -1) {
2260 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2261     !!!cp (167);
2262     !!!parse-error (type => 'unclosed DOCTYPE');
2263     $self->{state} = DATA_STATE;
2264     $self->{s_kwd} = '';
2265     $self->{ct}->{quirks} = 1;
2266     } else {
2267     !!!cp (167.12);
2268     !!!parse-error (type => 'unclosed md'); ## TODO: type
2269     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2270     }
2271    
2272     ## Reconsume.
2273     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2274 wakaba 1.1 redo A;
2275     } elsif ($self->{nc} == 0x0050 or # P
2276     $self->{nc} == 0x0070) { # p
2277 wakaba 1.12 !!!cp (167.1);
2278 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2279 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2280 wakaba 1.1 !!!next-input-character;
2281     redo A;
2282     } elsif ($self->{nc} == 0x0053 or # S
2283     $self->{nc} == 0x0073) { # s
2284 wakaba 1.12 !!!cp (167.2);
2285 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2286 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2287     !!!next-input-character;
2288     redo A;
2289 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2290     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2291     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2292     !!!cp (167.21);
2293     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2294     $self->{ct}->{value} = ''; # ENTITY
2295     !!!next-input-character;
2296     redo A;
2297     } elsif ($self->{nc} == 0x0027 and # '
2298     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2299     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2300     !!!cp (167.22);
2301     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2302     $self->{ct}->{value} = ''; # ENTITY
2303     !!!next-input-character;
2304     redo A;
2305 wakaba 1.16 } elsif ($self->{is_xml} and
2306     $self->{ct}->{type} == DOCTYPE_TOKEN and
2307     $self->{nc} == 0x005B) { # [
2308 wakaba 1.12 !!!cp (167.3);
2309     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2310     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2311 wakaba 1.13 $self->{in_subset} = 1;
2312 wakaba 1.1 !!!next-input-character;
2313 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2314 wakaba 1.1 redo A;
2315     } else {
2316 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2317    
2318     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2319     !!!cp (180);
2320     $self->{ct}->{quirks} = 1;
2321     $self->{state} = BOGUS_DOCTYPE_STATE;
2322     } else {
2323     !!!cp (180.1);
2324     $self->{state} = BOGUS_MD_STATE;
2325     }
2326 wakaba 1.1
2327     !!!next-input-character;
2328     redo A;
2329     }
2330     } elsif ($self->{state} == PUBLIC_STATE) {
2331     ## ASCII case-insensitive
2332     if ($self->{nc} == [
2333     undef,
2334     0x0055, # U
2335     0x0042, # B
2336     0x004C, # L
2337     0x0049, # I
2338 wakaba 1.12 ]->[length $self->{kwd}] or
2339 wakaba 1.1 $self->{nc} == [
2340     undef,
2341     0x0075, # u
2342     0x0062, # b
2343     0x006C, # l
2344     0x0069, # i
2345 wakaba 1.12 ]->[length $self->{kwd}]) {
2346 wakaba 1.1 !!!cp (175);
2347     ## Stay in the state.
2348 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2349 wakaba 1.1 !!!next-input-character;
2350     redo A;
2351 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2352 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2353     $self->{nc} == 0x0063)) { # c
2354 wakaba 1.12 if ($self->{is_xml} and
2355     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2356     !!!cp (168.1);
2357     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2358     text => 'PUBLIC',
2359     line => $self->{line_prev},
2360     column => $self->{column_prev} - 4);
2361     } else {
2362     !!!cp (168);
2363     }
2364 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2365     !!!next-input-character;
2366     redo A;
2367     } else {
2368 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2369 wakaba 1.1 line => $self->{line_prev},
2370 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2371 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2372     !!!cp (169);
2373     $self->{ct}->{quirks} = 1;
2374     $self->{state} = BOGUS_DOCTYPE_STATE;
2375     } else {
2376     !!!cp (169.1);
2377     $self->{state} = BOGUS_MD_STATE;
2378     }
2379 wakaba 1.1 ## Reconsume.
2380     redo A;
2381     }
2382     } elsif ($self->{state} == SYSTEM_STATE) {
2383     ## ASCII case-insensitive
2384     if ($self->{nc} == [
2385     undef,
2386     0x0059, # Y
2387     0x0053, # S
2388     0x0054, # T
2389     0x0045, # E
2390 wakaba 1.12 ]->[length $self->{kwd}] or
2391 wakaba 1.1 $self->{nc} == [
2392     undef,
2393     0x0079, # y
2394     0x0073, # s
2395     0x0074, # t
2396     0x0065, # e
2397 wakaba 1.12 ]->[length $self->{kwd}]) {
2398 wakaba 1.1 !!!cp (170);
2399     ## Stay in the state.
2400 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2401 wakaba 1.1 !!!next-input-character;
2402     redo A;
2403 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2404 wakaba 1.1 ($self->{nc} == 0x004D or # M
2405     $self->{nc} == 0x006D)) { # m
2406 wakaba 1.12 if ($self->{is_xml} and
2407     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2408     !!!cp (171.1);
2409     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2410     text => 'SYSTEM',
2411     line => $self->{line_prev},
2412     column => $self->{column_prev} - 4);
2413     } else {
2414     !!!cp (171);
2415     }
2416 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2417     !!!next-input-character;
2418     redo A;
2419     } else {
2420 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2421 wakaba 1.1 line => $self->{line_prev},
2422 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2423 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2424     !!!cp (172);
2425     $self->{ct}->{quirks} = 1;
2426     $self->{state} = BOGUS_DOCTYPE_STATE;
2427     } else {
2428     !!!cp (172.1);
2429     $self->{state} = BOGUS_MD_STATE;
2430     }
2431 wakaba 1.1 ## Reconsume.
2432     redo A;
2433     }
2434     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2435     if ($is_space->{$self->{nc}}) {
2436     !!!cp (181);
2437     ## Stay in the state
2438     !!!next-input-character;
2439     redo A;
2440     } elsif ($self->{nc} eq 0x0022) { # "
2441     !!!cp (182);
2442     $self->{ct}->{pubid} = ''; # DOCTYPE
2443     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2444     !!!next-input-character;
2445     redo A;
2446     } elsif ($self->{nc} eq 0x0027) { # '
2447     !!!cp (183);
2448     $self->{ct}->{pubid} = ''; # DOCTYPE
2449     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2450     !!!next-input-character;
2451     redo A;
2452     } elsif ($self->{nc} eq 0x003E) { # >
2453     !!!parse-error (type => 'no PUBLIC literal');
2454 wakaba 1.16
2455     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2456     !!!cp (184);
2457     $self->{state} = DATA_STATE;
2458     $self->{s_kwd} = '';
2459     $self->{ct}->{quirks} = 1;
2460     } else {
2461     !!!cp (184.1);
2462     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2463     }
2464    
2465 wakaba 1.1 !!!next-input-character;
2466 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2467 wakaba 1.1 redo A;
2468     } elsif ($self->{nc} == -1) {
2469 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2470     !!!cp (185);
2471     !!!parse-error (type => 'unclosed DOCTYPE');
2472     $self->{state} = DATA_STATE;
2473     $self->{s_kwd} = '';
2474     $self->{ct}->{quirks} = 1;
2475     } else {
2476     !!!cp (185.1);
2477     !!!parse-error (type => 'unclosed md'); ## TODO: type
2478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2479     }
2480    
2481 wakaba 1.1 ## reconsume
2482     !!!emit ($self->{ct}); # DOCTYPE
2483     redo A;
2484 wakaba 1.16 } elsif ($self->{is_xml} and
2485     $self->{ct}->{type} == DOCTYPE_TOKEN and
2486     $self->{nc} == 0x005B) { # [
2487 wakaba 1.12 !!!cp (186.1);
2488     !!!parse-error (type => 'no PUBLIC literal');
2489     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2490     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2491 wakaba 1.13 $self->{in_subset} = 1;
2492 wakaba 1.12 !!!next-input-character;
2493 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2494 wakaba 1.12 redo A;
2495 wakaba 1.1 } else {
2496     !!!parse-error (type => 'string after PUBLIC');
2497    
2498 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499     !!!cp (186);
2500     $self->{ct}->{quirks} = 1;
2501     $self->{state} = BOGUS_DOCTYPE_STATE;
2502     } else {
2503     !!!cp (186.2);
2504     $self->{state} = BOGUS_MD_STATE;
2505     }
2506    
2507 wakaba 1.1 !!!next-input-character;
2508     redo A;
2509     }
2510     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2511     if ($self->{nc} == 0x0022) { # "
2512     !!!cp (187);
2513     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2514     !!!next-input-character;
2515     redo A;
2516     } elsif ($self->{nc} == 0x003E) { # >
2517     !!!parse-error (type => 'unclosed PUBLIC literal');
2518    
2519 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2520     !!!cp (188);
2521     $self->{state} = DATA_STATE;
2522     $self->{s_kwd} = '';
2523     $self->{ct}->{quirks} = 1;
2524     } else {
2525     !!!cp (188.1);
2526     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2527     }
2528    
2529 wakaba 1.1 !!!next-input-character;
2530 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2531 wakaba 1.1 redo A;
2532     } elsif ($self->{nc} == -1) {
2533     !!!parse-error (type => 'unclosed PUBLIC literal');
2534    
2535 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2536     !!!cp (189);
2537     $self->{state} = DATA_STATE;
2538     $self->{s_kwd} = '';
2539     $self->{ct}->{quirks} = 1;
2540     } else {
2541     !!!cp (189.1);
2542     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2543     }
2544    
2545     ## Reconsume.
2546 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2547     redo A;
2548     } else {
2549     !!!cp (190);
2550 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2551 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2552     length $self->{ct}->{pubid});
2553    
2554     ## Stay in the state
2555     !!!next-input-character;
2556     redo A;
2557     }
2558     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2559     if ($self->{nc} == 0x0027) { # '
2560     !!!cp (191);
2561     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2562     !!!next-input-character;
2563     redo A;
2564     } elsif ($self->{nc} == 0x003E) { # >
2565     !!!parse-error (type => 'unclosed PUBLIC literal');
2566    
2567 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2568     !!!cp (192);
2569     $self->{state} = DATA_STATE;
2570     $self->{s_kwd} = '';
2571     $self->{ct}->{quirks} = 1;
2572     } else {
2573     !!!cp (192.1);
2574     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2575     }
2576    
2577 wakaba 1.1 !!!next-input-character;
2578 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2579 wakaba 1.1 redo A;
2580     } elsif ($self->{nc} == -1) {
2581     !!!parse-error (type => 'unclosed PUBLIC literal');
2582    
2583 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2584     !!!cp (193);
2585     $self->{state} = DATA_STATE;
2586     $self->{s_kwd} = '';
2587     $self->{ct}->{quirks} = 1;
2588     } else {
2589     !!!cp (193.1);
2590     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2591     }
2592    
2593 wakaba 1.1 ## reconsume
2594 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2595 wakaba 1.1 redo A;
2596     } else {
2597     !!!cp (194);
2598 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2599 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2600     length $self->{ct}->{pubid});
2601    
2602     ## Stay in the state
2603     !!!next-input-character;
2604     redo A;
2605     }
2606     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2607     if ($is_space->{$self->{nc}}) {
2608     !!!cp (195);
2609     ## Stay in the state
2610     !!!next-input-character;
2611     redo A;
2612     } elsif ($self->{nc} == 0x0022) { # "
2613     !!!cp (196);
2614 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2615 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2616     !!!next-input-character;
2617     redo A;
2618     } elsif ($self->{nc} == 0x0027) { # '
2619     !!!cp (197);
2620 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2621 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2622     !!!next-input-character;
2623     redo A;
2624     } elsif ($self->{nc} == 0x003E) { # >
2625 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2626     if ($self->{is_xml}) {
2627     !!!cp (198.1);
2628     !!!parse-error (type => 'no SYSTEM literal');
2629     } else {
2630     !!!cp (198);
2631     }
2632     $self->{state} = DATA_STATE;
2633     $self->{s_kwd} = '';
2634 wakaba 1.12 } else {
2635 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2636     !!!cp (198.2);
2637     } else {
2638     !!!cp (198.3);
2639     !!!parse-error (type => 'no SYSTEM literal');
2640     }
2641     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2642 wakaba 1.12 }
2643 wakaba 1.16
2644 wakaba 1.1 !!!next-input-character;
2645 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2646 wakaba 1.1 redo A;
2647     } elsif ($self->{nc} == -1) {
2648 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2649     !!!cp (199);
2650     !!!parse-error (type => 'unclosed DOCTYPE');
2651    
2652     $self->{state} = DATA_STATE;
2653     $self->{s_kwd} = '';
2654     $self->{ct}->{quirks} = 1;
2655     } else {
2656     !!!parse-error (type => 'unclosed md'); ## TODO: type
2657     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2658     }
2659    
2660 wakaba 1.1 ## reconsume
2661 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2662 wakaba 1.1 redo A;
2663 wakaba 1.16 } elsif ($self->{is_xml} and
2664     $self->{ct}->{type} == DOCTYPE_TOKEN and
2665     $self->{nc} == 0x005B) { # [
2666 wakaba 1.12 !!!cp (200.1);
2667     !!!parse-error (type => 'no SYSTEM literal');
2668     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2669     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2670 wakaba 1.13 $self->{in_subset} = 1;
2671 wakaba 1.12 !!!next-input-character;
2672 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2673 wakaba 1.12 redo A;
2674 wakaba 1.1 } else {
2675     !!!parse-error (type => 'string after PUBLIC literal');
2676    
2677 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2678     !!!cp (200);
2679     $self->{ct}->{quirks} = 1;
2680     $self->{state} = BOGUS_DOCTYPE_STATE;
2681     } else {
2682     !!!cp (200.2);
2683     $self->{state} = BOGUS_MD_STATE;
2684     }
2685    
2686 wakaba 1.1 !!!next-input-character;
2687     redo A;
2688     }
2689     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2690     if ($is_space->{$self->{nc}}) {
2691     !!!cp (201);
2692     ## Stay in the state
2693     !!!next-input-character;
2694     redo A;
2695     } elsif ($self->{nc} == 0x0022) { # "
2696     !!!cp (202);
2697     $self->{ct}->{sysid} = ''; # DOCTYPE
2698     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2699     !!!next-input-character;
2700     redo A;
2701     } elsif ($self->{nc} == 0x0027) { # '
2702     !!!cp (203);
2703     $self->{ct}->{sysid} = ''; # DOCTYPE
2704     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2705     !!!next-input-character;
2706     redo A;
2707     } elsif ($self->{nc} == 0x003E) { # >
2708     !!!parse-error (type => 'no SYSTEM literal');
2709     !!!next-input-character;
2710    
2711 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2712     !!!cp (204);
2713     $self->{state} = DATA_STATE;
2714     $self->{s_kwd} = '';
2715     $self->{ct}->{quirks} = 1;
2716     } else {
2717     !!!cp (204.1);
2718     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2719     }
2720 wakaba 1.1
2721 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2722 wakaba 1.1 redo A;
2723     } elsif ($self->{nc} == -1) {
2724 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2725     !!!cp (205);
2726     !!!parse-error (type => 'unclosed DOCTYPE');
2727     $self->{state} = DATA_STATE;
2728     $self->{s_kwd} = '';
2729     $self->{ct}->{quirks} = 1;
2730     } else {
2731     !!!cp (205.1);
2732     !!!parse-error (type => 'unclosed md'); ## TODO: type
2733     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2734     }
2735    
2736 wakaba 1.1 ## reconsume
2737 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2738 wakaba 1.1 redo A;
2739 wakaba 1.16 } elsif ($self->{is_xml} and
2740     $self->{ct}->{type} == DOCTYPE_TOKEN and
2741     $self->{nc} == 0x005B) { # [
2742 wakaba 1.12 !!!cp (206.1);
2743     !!!parse-error (type => 'no SYSTEM literal');
2744    
2745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2747 wakaba 1.13 $self->{in_subset} = 1;
2748 wakaba 1.12 !!!next-input-character;
2749 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2750 wakaba 1.12 redo A;
2751 wakaba 1.1 } else {
2752     !!!parse-error (type => 'string after SYSTEM');
2753    
2754 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2755     !!!cp (206);
2756     $self->{ct}->{quirks} = 1;
2757     $self->{state} = BOGUS_DOCTYPE_STATE;
2758     } else {
2759     !!!cp (206.2);
2760     $self->{state} = BOGUS_MD_STATE;
2761     }
2762    
2763 wakaba 1.1 !!!next-input-character;
2764     redo A;
2765     }
2766     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2767     if ($self->{nc} == 0x0022) { # "
2768     !!!cp (207);
2769     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2770     !!!next-input-character;
2771     redo A;
2772 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2773 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2774    
2775 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2776     !!!cp (208);
2777     $self->{state} = DATA_STATE;
2778     $self->{s_kwd} = '';
2779     $self->{ct}->{quirks} = 1;
2780     } else {
2781     !!!cp (208.1);
2782     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2783     }
2784    
2785 wakaba 1.1 !!!next-input-character;
2786 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2787 wakaba 1.1 redo A;
2788     } elsif ($self->{nc} == -1) {
2789     !!!parse-error (type => 'unclosed SYSTEM literal');
2790    
2791 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2792     !!!cp (209);
2793     $self->{state} = DATA_STATE;
2794     $self->{s_kwd} = '';
2795     $self->{ct}->{quirks} = 1;
2796     } else {
2797     !!!cp (209.1);
2798     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2799     }
2800    
2801 wakaba 1.1 ## reconsume
2802 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2803 wakaba 1.1 redo A;
2804     } else {
2805     !!!cp (210);
2806 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2807 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2808     length $self->{ct}->{sysid});
2809    
2810     ## Stay in the state
2811     !!!next-input-character;
2812     redo A;
2813     }
2814     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2815     if ($self->{nc} == 0x0027) { # '
2816     !!!cp (211);
2817     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2818     !!!next-input-character;
2819     redo A;
2820 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2821 wakaba 1.1 !!!cp (212);
2822     !!!parse-error (type => 'unclosed SYSTEM literal');
2823    
2824     $self->{state} = DATA_STATE;
2825 wakaba 1.5 $self->{s_kwd} = '';
2826 wakaba 1.1 !!!next-input-character;
2827    
2828     $self->{ct}->{quirks} = 1;
2829     !!!emit ($self->{ct}); # DOCTYPE
2830    
2831     redo A;
2832     } elsif ($self->{nc} == -1) {
2833     !!!parse-error (type => 'unclosed SYSTEM literal');
2834    
2835 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2836     !!!cp (213);
2837     $self->{state} = DATA_STATE;
2838     $self->{s_kwd} = '';
2839     $self->{ct}->{quirks} = 1;
2840     } else {
2841     !!!cp (213.1);
2842     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2843     }
2844    
2845 wakaba 1.1 ## reconsume
2846 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2847 wakaba 1.1 redo A;
2848     } else {
2849     !!!cp (214);
2850 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2851 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2852     length $self->{ct}->{sysid});
2853    
2854     ## Stay in the state
2855     !!!next-input-character;
2856     redo A;
2857     }
2858     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2859     if ($is_space->{$self->{nc}}) {
2860 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2861     !!!cp (215.1);
2862     $self->{state} = BEFORE_NDATA_STATE;
2863     } else {
2864     !!!cp (215);
2865     ## Stay in the state
2866     }
2867 wakaba 1.1 !!!next-input-character;
2868     redo A;
2869     } elsif ($self->{nc} == 0x003E) { # >
2870 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2871     !!!cp (216);
2872     $self->{state} = DATA_STATE;
2873     $self->{s_kwd} = '';
2874     } else {
2875     !!!cp (216.1);
2876     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2877     }
2878    
2879 wakaba 1.1 !!!next-input-character;
2880 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2881 wakaba 1.1 redo A;
2882 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2883     ($self->{nc} == 0x004E or # N
2884     $self->{nc} == 0x006E)) { # n
2885     !!!cp (216.2);
2886     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2887     $self->{state} = NDATA_STATE;
2888     $self->{kwd} = chr $self->{nc};
2889     !!!next-input-character;
2890     redo A;
2891 wakaba 1.1 } elsif ($self->{nc} == -1) {
2892 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2893     !!!cp (217);
2894     !!!parse-error (type => 'unclosed DOCTYPE');
2895     $self->{state} = DATA_STATE;
2896     $self->{s_kwd} = '';
2897     $self->{ct}->{quirks} = 1;
2898     } else {
2899     !!!cp (217.1);
2900     !!!parse-error (type => 'unclosed md'); ## TODO: type
2901     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902     }
2903    
2904 wakaba 1.1 ## reconsume
2905 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2906 wakaba 1.1 redo A;
2907 wakaba 1.16 } elsif ($self->{is_xml} and
2908     $self->{ct}->{type} == DOCTYPE_TOKEN and
2909     $self->{nc} == 0x005B) { # [
2910 wakaba 1.12 !!!cp (218.1);
2911     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2912     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2913 wakaba 1.13 $self->{in_subset} = 1;
2914 wakaba 1.12 !!!next-input-character;
2915 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2916 wakaba 1.12 redo A;
2917 wakaba 1.1 } else {
2918     !!!parse-error (type => 'string after SYSTEM literal');
2919    
2920 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2921     !!!cp (218);
2922     #$self->{ct}->{quirks} = 1;
2923     $self->{state} = BOGUS_DOCTYPE_STATE;
2924     } else {
2925     !!!cp (218.2);
2926     $self->{state} = BOGUS_MD_STATE;
2927     }
2928    
2929 wakaba 1.1 !!!next-input-character;
2930     redo A;
2931     }
2932 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2933     if ($is_space->{$self->{nc}}) {
2934     !!!cp (218.3);
2935     ## Stay in the state.
2936     !!!next-input-character;
2937     redo A;
2938     } elsif ($self->{nc} == 0x003E) { # >
2939     !!!cp (218.4);
2940     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941     !!!next-input-character;
2942     !!!emit ($self->{ct}); # ENTITY
2943     redo A;
2944     } elsif ($self->{nc} == 0x004E or # N
2945     $self->{nc} == 0x006E) { # n
2946     !!!cp (218.5);
2947     $self->{state} = NDATA_STATE;
2948     $self->{kwd} = chr $self->{nc};
2949     !!!next-input-character;
2950     redo A;
2951     } elsif ($self->{nc} == -1) {
2952     !!!cp (218.6);
2953     !!!parse-error (type => 'unclosed md'); ## TODO: type
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     ## reconsume
2956     !!!emit ($self->{ct}); # ENTITY
2957     redo A;
2958     } else {
2959     !!!cp (218.7);
2960     !!!parse-error (type => 'string after SYSTEM literal');
2961     $self->{state} = BOGUS_MD_STATE;
2962     !!!next-input-character;
2963     redo A;
2964     }
2965 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2966     if ($self->{nc} == 0x003E) { # >
2967     !!!cp (219);
2968     $self->{state} = DATA_STATE;
2969 wakaba 1.5 $self->{s_kwd} = '';
2970 wakaba 1.1 !!!next-input-character;
2971    
2972     !!!emit ($self->{ct}); # DOCTYPE
2973    
2974     redo A;
2975 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2976 wakaba 1.13 !!!cp (220.1);
2977     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2978     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2979     $self->{in_subset} = 1;
2980     !!!next-input-character;
2981     !!!emit ($self->{ct}); # DOCTYPE
2982     redo A;
2983 wakaba 1.1 } elsif ($self->{nc} == -1) {
2984     !!!cp (220);
2985     $self->{state} = DATA_STATE;
2986 wakaba 1.5 $self->{s_kwd} = '';
2987 wakaba 1.1 ## reconsume
2988    
2989     !!!emit ($self->{ct}); # DOCTYPE
2990    
2991     redo A;
2992     } else {
2993     !!!cp (221);
2994     my $s = '';
2995 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2996 wakaba 1.1
2997     ## Stay in the state
2998     !!!next-input-character;
2999     redo A;
3000     }
3001     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3002     ## NOTE: "CDATA section state" in the spec is jointly implemented
3003     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3004     ## and |CDATA_SECTION_MSE2_STATE|.
3005 wakaba 1.10
3006     ## XML5: "CDATA state".
3007 wakaba 1.1
3008     if ($self->{nc} == 0x005D) { # ]
3009     !!!cp (221.1);
3010     $self->{state} = CDATA_SECTION_MSE1_STATE;
3011     !!!next-input-character;
3012     redo A;
3013     } elsif ($self->{nc} == -1) {
3014 wakaba 1.6 if ($self->{is_xml}) {
3015 wakaba 1.8 !!!cp (221.11);
3016 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3017 wakaba 1.8 } else {
3018     !!!cp (221.12);
3019 wakaba 1.6 }
3020    
3021 wakaba 1.1 $self->{state} = DATA_STATE;
3022 wakaba 1.5 $self->{s_kwd} = '';
3023 wakaba 1.10 ## Reconsume.
3024 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3025     !!!cp (221.2);
3026     !!!emit ($self->{ct}); # character
3027     } else {
3028     !!!cp (221.3);
3029     ## No token to emit. $self->{ct} is discarded.
3030     }
3031     redo A;
3032     } else {
3033     !!!cp (221.4);
3034     $self->{ct}->{data} .= chr $self->{nc};
3035     $self->{read_until}->($self->{ct}->{data},
3036     q<]>,
3037     length $self->{ct}->{data});
3038    
3039     ## Stay in the state.
3040     !!!next-input-character;
3041     redo A;
3042     }
3043    
3044     ## ISSUE: "text tokens" in spec.
3045     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3046 wakaba 1.10 ## XML5: "CDATA bracket state".
3047    
3048 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3049     !!!cp (221.5);
3050     $self->{state} = CDATA_SECTION_MSE2_STATE;
3051     !!!next-input-character;
3052     redo A;
3053     } else {
3054     !!!cp (221.6);
3055 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3056 wakaba 1.1 $self->{ct}->{data} .= ']';
3057 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3058 wakaba 1.1 ## Reconsume.
3059     redo A;
3060     }
3061     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3062 wakaba 1.10 ## XML5: "CDATA end state".
3063    
3064 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3065     $self->{state} = DATA_STATE;
3066 wakaba 1.5 $self->{s_kwd} = '';
3067 wakaba 1.1 !!!next-input-character;
3068     if (length $self->{ct}->{data}) { # character
3069     !!!cp (221.7);
3070     !!!emit ($self->{ct}); # character
3071     } else {
3072     !!!cp (221.8);
3073     ## No token to emit. $self->{ct} is discarded.
3074     }
3075     redo A;
3076     } elsif ($self->{nc} == 0x005D) { # ]
3077     !!!cp (221.9); # character
3078     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3079     ## Stay in the state.
3080     !!!next-input-character;
3081     redo A;
3082     } else {
3083     !!!cp (221.11);
3084     $self->{ct}->{data} .= ']]'; # character
3085     $self->{state} = CDATA_SECTION_STATE;
3086 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3087 wakaba 1.1 redo A;
3088     }
3089     } elsif ($self->{state} == ENTITY_STATE) {
3090     if ($is_space->{$self->{nc}} or
3091     {
3092     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3093     $self->{entity_add} => 1,
3094     }->{$self->{nc}}) {
3095 wakaba 1.22 if ($self->{is_xml}) {
3096     !!!cp (1001.1);
3097     !!!parse-error (type => 'bare ero',
3098     line => $self->{line_prev},
3099     column => $self->{column_prev}
3100     + ($self->{nc} == -1 ? 1 : 0));
3101     } else {
3102     !!!cp (1001);
3103     ## No error
3104     }
3105 wakaba 1.1 ## Don't consume
3106     ## Return nothing.
3107     #
3108     } elsif ($self->{nc} == 0x0023) { # #
3109     !!!cp (999);
3110     $self->{state} = ENTITY_HASH_STATE;
3111 wakaba 1.12 $self->{kwd} = '#';
3112 wakaba 1.1 !!!next-input-character;
3113     redo A;
3114 wakaba 1.22 } elsif ($self->{is_xml} or
3115     (0x0041 <= $self->{nc} and
3116 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3117     (0x0061 <= $self->{nc} and
3118     $self->{nc} <= 0x007A)) { # a..z
3119     !!!cp (998);
3120     require Whatpm::_NamedEntityList;
3121     $self->{state} = ENTITY_NAME_STATE;
3122 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3123     $self->{entity__value} = $self->{kwd};
3124 wakaba 1.1 $self->{entity__match} = 0;
3125     !!!next-input-character;
3126     redo A;
3127     } else {
3128     !!!cp (1027);
3129     !!!parse-error (type => 'bare ero');
3130     ## Return nothing.
3131     #
3132     }
3133    
3134     ## NOTE: No character is consumed by the "consume a character
3135     ## reference" algorithm. In other words, there is an "&" character
3136     ## that does not introduce a character reference, which would be
3137     ## appended to the parent element or the attribute value in later
3138     ## processing by the tokenizer.
3139    
3140     if ($self->{prev_state} == DATA_STATE) {
3141     !!!cp (997);
3142     $self->{state} = $self->{prev_state};
3143 wakaba 1.5 $self->{s_kwd} = '';
3144 wakaba 1.1 ## Reconsume.
3145     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3146     line => $self->{line_prev},
3147     column => $self->{column_prev},
3148     });
3149     redo A;
3150     } else {
3151     !!!cp (996);
3152     $self->{ca}->{value} .= '&';
3153     $self->{state} = $self->{prev_state};
3154 wakaba 1.5 $self->{s_kwd} = '';
3155 wakaba 1.1 ## Reconsume.
3156     redo A;
3157     }
3158     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3159 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3160 wakaba 1.1 !!!cp (995);
3161     $self->{state} = HEXREF_X_STATE;
3162 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3163 wakaba 1.1 !!!next-input-character;
3164     redo A;
3165 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3166     !!!cp (995.1);
3167     if ($self->{is_xml}) {
3168     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3169     }
3170     $self->{state} = HEXREF_X_STATE;
3171     $self->{kwd} .= chr $self->{nc};
3172     !!!next-input-character;
3173     redo A;
3174 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3175     $self->{nc} <= 0x0039) { # 0..9
3176     !!!cp (994);
3177     $self->{state} = NCR_NUM_STATE;
3178 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3179 wakaba 1.1 !!!next-input-character;
3180     redo A;
3181     } else {
3182     !!!parse-error (type => 'bare nero',
3183     line => $self->{line_prev},
3184     column => $self->{column_prev} - 1);
3185    
3186     ## NOTE: According to the spec algorithm, nothing is returned,
3187     ## and then "&#" is appended to the parent element or the attribute
3188     ## value in the later processing.
3189    
3190     if ($self->{prev_state} == DATA_STATE) {
3191     !!!cp (1019);
3192     $self->{state} = $self->{prev_state};
3193 wakaba 1.5 $self->{s_kwd} = '';
3194 wakaba 1.1 ## Reconsume.
3195     !!!emit ({type => CHARACTER_TOKEN,
3196     data => '&#',
3197     line => $self->{line_prev},
3198     column => $self->{column_prev} - 1,
3199     });
3200     redo A;
3201     } else {
3202     !!!cp (993);
3203     $self->{ca}->{value} .= '&#';
3204     $self->{state} = $self->{prev_state};
3205 wakaba 1.5 $self->{s_kwd} = '';
3206 wakaba 1.1 ## Reconsume.
3207     redo A;
3208     }
3209     }
3210     } elsif ($self->{state} == NCR_NUM_STATE) {
3211     if (0x0030 <= $self->{nc} and
3212     $self->{nc} <= 0x0039) { # 0..9
3213     !!!cp (1012);
3214 wakaba 1.12 $self->{kwd} *= 10;
3215     $self->{kwd} += $self->{nc} - 0x0030;
3216 wakaba 1.1
3217     ## Stay in the state.
3218     !!!next-input-character;
3219     redo A;
3220     } elsif ($self->{nc} == 0x003B) { # ;
3221     !!!cp (1013);
3222     !!!next-input-character;
3223     #
3224     } else {
3225     !!!cp (1014);
3226     !!!parse-error (type => 'no refc');
3227     ## Reconsume.
3228     #
3229     }
3230    
3231 wakaba 1.12 my $code = $self->{kwd};
3232 wakaba 1.1 my $l = $self->{line_prev};
3233     my $c = $self->{column_prev};
3234 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3235     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3236     ($self->{is_xml} and $code == 0x0000)) {
3237 wakaba 1.1 !!!cp (1015);
3238     !!!parse-error (type => 'invalid character reference',
3239     text => (sprintf 'U+%04X', $code),
3240     line => $l, column => $c);
3241     $code = $charref_map->{$code};
3242     } elsif ($code > 0x10FFFF) {
3243     !!!cp (1016);
3244     !!!parse-error (type => 'invalid character reference',
3245     text => (sprintf 'U-%08X', $code),
3246     line => $l, column => $c);
3247     $code = 0xFFFD;
3248     }
3249    
3250     if ($self->{prev_state} == DATA_STATE) {
3251     !!!cp (992);
3252     $self->{state} = $self->{prev_state};
3253 wakaba 1.5 $self->{s_kwd} = '';
3254 wakaba 1.1 ## Reconsume.
3255     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3256 wakaba 1.7 has_reference => 1,
3257 wakaba 1.1 line => $l, column => $c,
3258     });
3259     redo A;
3260     } else {
3261     !!!cp (991);
3262     $self->{ca}->{value} .= chr $code;
3263     $self->{ca}->{has_reference} = 1;
3264     $self->{state} = $self->{prev_state};
3265 wakaba 1.5 $self->{s_kwd} = '';
3266 wakaba 1.1 ## Reconsume.
3267     redo A;
3268     }
3269     } elsif ($self->{state} == HEXREF_X_STATE) {
3270     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3271     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3272     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3273     # 0..9, A..F, a..f
3274     !!!cp (990);
3275     $self->{state} = HEXREF_HEX_STATE;
3276 wakaba 1.12 $self->{kwd} = 0;
3277 wakaba 1.1 ## Reconsume.
3278     redo A;
3279     } else {
3280     !!!parse-error (type => 'bare hcro',
3281     line => $self->{line_prev},
3282     column => $self->{column_prev} - 2);
3283    
3284     ## NOTE: According to the spec algorithm, nothing is returned,
3285     ## and then "&#" followed by "X" or "x" is appended to the parent
3286     ## element or the attribute value in the later processing.
3287    
3288     if ($self->{prev_state} == DATA_STATE) {
3289     !!!cp (1005);
3290     $self->{state} = $self->{prev_state};
3291 wakaba 1.5 $self->{s_kwd} = '';
3292 wakaba 1.1 ## Reconsume.
3293     !!!emit ({type => CHARACTER_TOKEN,
3294 wakaba 1.12 data => '&' . $self->{kwd},
3295 wakaba 1.1 line => $self->{line_prev},
3296 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3297 wakaba 1.1 });
3298     redo A;
3299     } else {
3300     !!!cp (989);
3301 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3302 wakaba 1.1 $self->{state} = $self->{prev_state};
3303 wakaba 1.5 $self->{s_kwd} = '';
3304 wakaba 1.1 ## Reconsume.
3305     redo A;
3306     }
3307     }
3308     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3309     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3310     # 0..9
3311     !!!cp (1002);
3312 wakaba 1.12 $self->{kwd} *= 0x10;
3313     $self->{kwd} += $self->{nc} - 0x0030;
3314 wakaba 1.1 ## Stay in the state.
3315     !!!next-input-character;
3316     redo A;
3317     } elsif (0x0061 <= $self->{nc} and
3318     $self->{nc} <= 0x0066) { # a..f
3319     !!!cp (1003);
3320 wakaba 1.12 $self->{kwd} *= 0x10;
3321     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3322 wakaba 1.1 ## Stay in the state.
3323     !!!next-input-character;
3324     redo A;
3325     } elsif (0x0041 <= $self->{nc} and
3326     $self->{nc} <= 0x0046) { # A..F
3327     !!!cp (1004);
3328 wakaba 1.12 $self->{kwd} *= 0x10;
3329     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3330 wakaba 1.1 ## Stay in the state.
3331     !!!next-input-character;
3332     redo A;
3333     } elsif ($self->{nc} == 0x003B) { # ;
3334     !!!cp (1006);
3335     !!!next-input-character;
3336     #
3337     } else {
3338     !!!cp (1007);
3339     !!!parse-error (type => 'no refc',
3340     line => $self->{line},
3341     column => $self->{column});
3342     ## Reconsume.
3343     #
3344     }
3345    
3346 wakaba 1.12 my $code = $self->{kwd};
3347 wakaba 1.1 my $l = $self->{line_prev};
3348     my $c = $self->{column_prev};
3349 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3350     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3351     ($self->{is_xml} and $code == 0x0000)) {
3352 wakaba 1.1 !!!cp (1008);
3353     !!!parse-error (type => 'invalid character reference',
3354     text => (sprintf 'U+%04X', $code),
3355     line => $l, column => $c);
3356     $code = $charref_map->{$code};
3357     } elsif ($code > 0x10FFFF) {
3358     !!!cp (1009);
3359     !!!parse-error (type => 'invalid character reference',
3360     text => (sprintf 'U-%08X', $code),
3361     line => $l, column => $c);
3362     $code = 0xFFFD;
3363     }
3364    
3365     if ($self->{prev_state} == DATA_STATE) {
3366     !!!cp (988);
3367     $self->{state} = $self->{prev_state};
3368 wakaba 1.5 $self->{s_kwd} = '';
3369 wakaba 1.1 ## Reconsume.
3370     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3371 wakaba 1.7 has_reference => 1,
3372 wakaba 1.1 line => $l, column => $c,
3373     });
3374     redo A;
3375     } else {
3376     !!!cp (987);
3377     $self->{ca}->{value} .= chr $code;
3378     $self->{ca}->{has_reference} = 1;
3379     $self->{state} = $self->{prev_state};
3380 wakaba 1.5 $self->{s_kwd} = '';
3381 wakaba 1.1 ## Reconsume.
3382     redo A;
3383     }
3384     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3385 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3386     $self->{nc} <= 0x005A) or # Z
3387     (0x0061 <= $self->{nc} and # a
3388     $self->{nc} <= 0x007A) or # z
3389     (0x0030 <= $self->{nc} and # 0
3390     $self->{nc} <= 0x0039) or # 9
3391 wakaba 1.22 $self->{nc} == 0x003B or # ;
3392     ($self->{is_xml} and
3393     not ($is_space->{$self->{nc}} or
3394     {
3395     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3396     $self->{entity_add} => 1,
3397     }->{$self->{nc}}))) {
3398 wakaba 1.1 our $EntityChar;
3399 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3400 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3401     $self->{ge}->{$self->{kwd}}) {
3402 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3403 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3404     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3405     !!!cp (1020.1);
3406     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3407     } else {
3408     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3409     !!!cp (1020.2);
3410     !!!parse-error (type => 'unparsed entity', ## TODO: type
3411     value => $self->{kwd});
3412     } else {
3413     !!!cp (1020.3);
3414     }
3415     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3416     }
3417     } else {
3418     if ($self->{is_xml}) {
3419     !!!cp (1020.4);
3420     !!!parse-error (type => 'entity not declared', ## TODO: type
3421     value => $self->{kwd},
3422     level => {
3423     'amp;' => $self->{level}->{warn},
3424     'quot;' => $self->{level}->{warn},
3425     'lt;' => $self->{level}->{warn},
3426     'gt;' => $self->{level}->{warn},
3427     'apos;' => $self->{level}->{warn},
3428     }->{$self->{kwd}} ||
3429     $self->{level}->{must});
3430     } else {
3431     !!!cp (1020);
3432     }
3433     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3434     }
3435 wakaba 1.1 $self->{entity__match} = 1;
3436     !!!next-input-character;
3437     #
3438     } else {
3439     !!!cp (1021);
3440 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3441 wakaba 1.1 $self->{entity__match} = -1;
3442     ## Stay in the state.
3443     !!!next-input-character;
3444     redo A;
3445     }
3446     } else {
3447     !!!cp (1022);
3448     $self->{entity__value} .= chr $self->{nc};
3449     $self->{entity__match} *= 2;
3450     ## Stay in the state.
3451     !!!next-input-character;
3452     redo A;
3453     }
3454     }
3455    
3456     my $data;
3457     my $has_ref;
3458     if ($self->{entity__match} > 0) {
3459     !!!cp (1023);
3460     $data = $self->{entity__value};
3461     $has_ref = 1;
3462     #
3463     } elsif ($self->{entity__match} < 0) {
3464     !!!parse-error (type => 'no refc');
3465     if ($self->{prev_state} != DATA_STATE and # in attribute
3466     $self->{entity__match} < -1) {
3467     !!!cp (1024);
3468 wakaba 1.12 $data = '&' . $self->{kwd};
3469 wakaba 1.1 #
3470     } else {
3471     !!!cp (1025);
3472     $data = $self->{entity__value};
3473     $has_ref = 1;
3474     #
3475     }
3476     } else {
3477     !!!cp (1026);
3478     !!!parse-error (type => 'bare ero',
3479     line => $self->{line_prev},
3480 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3481     $data = '&' . $self->{kwd};
3482 wakaba 1.1 #
3483     }
3484    
3485     ## NOTE: In these cases, when a character reference is found,
3486     ## it is consumed and a character token is returned, or, otherwise,
3487     ## nothing is consumed and returned, according to the spec algorithm.
3488     ## In this implementation, anything that has been examined by the
3489     ## tokenizer is appended to the parent element or the attribute value
3490     ## as string, either literal string when no character reference or
3491     ## entity-replaced string otherwise, in this stage, since any characters
3492     ## that would not be consumed are appended in the data state or in an
3493     ## appropriate attribute value state anyway.
3494    
3495     if ($self->{prev_state} == DATA_STATE) {
3496     !!!cp (986);
3497     $self->{state} = $self->{prev_state};
3498 wakaba 1.5 $self->{s_kwd} = '';
3499 wakaba 1.1 ## Reconsume.
3500     !!!emit ({type => CHARACTER_TOKEN,
3501     data => $data,
3502 wakaba 1.7 has_reference => $has_ref,
3503 wakaba 1.1 line => $self->{line_prev},
3504 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3505 wakaba 1.1 });
3506     redo A;
3507     } else {
3508     !!!cp (985);
3509     $self->{ca}->{value} .= $data;
3510     $self->{ca}->{has_reference} = 1 if $has_ref;
3511     $self->{state} = $self->{prev_state};
3512 wakaba 1.5 $self->{s_kwd} = '';
3513 wakaba 1.1 ## Reconsume.
3514     redo A;
3515     }
3516 wakaba 1.8
3517     ## XML-only states
3518    
3519     } elsif ($self->{state} == PI_STATE) {
3520 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3521    
3522 wakaba 1.8 if ($is_space->{$self->{nc}} or
3523 wakaba 1.14 $self->{nc} == 0x003F or # ?
3524 wakaba 1.8 $self->{nc} == -1) {
3525 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3526     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3527     ## "DOCTYPE pi state": Parse error, switch to the "data
3528     ## state".
3529 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3530     line => $self->{line_prev},
3531     column => $self->{column_prev}
3532     - 1 * ($self->{nc} != -1));
3533     $self->{state} = BOGUS_COMMENT_STATE;
3534     ## Reconsume.
3535     $self->{ct} = {type => COMMENT_TOKEN,
3536     data => '?',
3537     line => $self->{line_prev},
3538     column => $self->{column_prev}
3539     - 1 * ($self->{nc} != -1),
3540     };
3541     redo A;
3542     } else {
3543 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3544 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3545     target => chr $self->{nc},
3546     data => '',
3547     line => $self->{line_prev},
3548     column => $self->{column_prev} - 1,
3549     };
3550     $self->{state} = PI_TARGET_STATE;
3551     !!!next-input-character;
3552     redo A;
3553     }
3554     } elsif ($self->{state} == PI_TARGET_STATE) {
3555     if ($is_space->{$self->{nc}}) {
3556     $self->{state} = PI_TARGET_AFTER_STATE;
3557     !!!next-input-character;
3558     redo A;
3559     } elsif ($self->{nc} == -1) {
3560     !!!parse-error (type => 'no pic'); ## TODO: type
3561 wakaba 1.13 if ($self->{in_subset}) {
3562     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3563     } else {
3564     $self->{state} = DATA_STATE;
3565     $self->{s_kwd} = '';
3566     }
3567 wakaba 1.8 ## Reconsume.
3568     !!!emit ($self->{ct}); # pi
3569     redo A;
3570     } elsif ($self->{nc} == 0x003F) { # ?
3571     $self->{state} = PI_AFTER_STATE;
3572     !!!next-input-character;
3573     redo A;
3574     } else {
3575     ## XML5: typo ("tag name" -> "target")
3576     $self->{ct}->{target} .= chr $self->{nc}; # pi
3577     !!!next-input-character;
3578     redo A;
3579     }
3580     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3581     if ($is_space->{$self->{nc}}) {
3582     ## Stay in the state.
3583     !!!next-input-character;
3584     redo A;
3585     } else {
3586     $self->{state} = PI_DATA_STATE;
3587     ## Reprocess.
3588     redo A;
3589     }
3590     } elsif ($self->{state} == PI_DATA_STATE) {
3591     if ($self->{nc} == 0x003F) { # ?
3592     $self->{state} = PI_DATA_AFTER_STATE;
3593     !!!next-input-character;
3594     redo A;
3595     } elsif ($self->{nc} == -1) {
3596     !!!parse-error (type => 'no pic'); ## TODO: type
3597 wakaba 1.13 if ($self->{in_subset}) {
3598 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3599 wakaba 1.13 } else {
3600     $self->{state} = DATA_STATE;
3601     $self->{s_kwd} = '';
3602     }
3603 wakaba 1.8 ## Reprocess.
3604     !!!emit ($self->{ct}); # pi
3605     redo A;
3606     } else {
3607     $self->{ct}->{data} .= chr $self->{nc}; # pi
3608     $self->{read_until}->($self->{ct}->{data}, q[?],
3609     length $self->{ct}->{data});
3610     ## Stay in the state.
3611     !!!next-input-character;
3612     ## Reprocess.
3613     redo A;
3614     }
3615     } elsif ($self->{state} == PI_AFTER_STATE) {
3616 wakaba 1.14 ## XML5: Part of "Pi after state".
3617    
3618 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3619 wakaba 1.13 if ($self->{in_subset}) {
3620     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3621     } else {
3622     $self->{state} = DATA_STATE;
3623     $self->{s_kwd} = '';
3624     }
3625 wakaba 1.8 !!!next-input-character;
3626     !!!emit ($self->{ct}); # pi
3627     redo A;
3628     } elsif ($self->{nc} == 0x003F) { # ?
3629     !!!parse-error (type => 'no s after target', ## TODO: type
3630     line => $self->{line_prev},
3631     column => $self->{column_prev}); ## XML5: no error
3632     $self->{ct}->{data} .= '?';
3633     $self->{state} = PI_DATA_AFTER_STATE;
3634     !!!next-input-character;
3635     redo A;
3636     } else {
3637     !!!parse-error (type => 'no s after target', ## TODO: type
3638     line => $self->{line_prev},
3639     column => $self->{column_prev}
3640     + 1 * ($self->{nc} == -1)); ## XML5: no error
3641     $self->{ct}->{data} .= '?'; ## XML5: not appended
3642     $self->{state} = PI_DATA_STATE;
3643     ## Reprocess.
3644     redo A;
3645     }
3646     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3647 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3648    
3649 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3650 wakaba 1.13 if ($self->{in_subset}) {
3651     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3652     } else {
3653     $self->{state} = DATA_STATE;
3654     $self->{s_kwd} = '';
3655     }
3656 wakaba 1.8 !!!next-input-character;
3657     !!!emit ($self->{ct}); # pi
3658     redo A;
3659     } elsif ($self->{nc} == 0x003F) { # ?
3660     $self->{ct}->{data} .= '?';
3661     ## Stay in the state.
3662     !!!next-input-character;
3663     redo A;
3664     } else {
3665     $self->{ct}->{data} .= '?'; ## XML5: not appended
3666     $self->{state} = PI_DATA_STATE;
3667     ## Reprocess.
3668     redo A;
3669     }
3670 wakaba 1.12
3671     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3672     if ($self->{nc} == 0x003C) { # <
3673 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3674 wakaba 1.12 !!!next-input-character;
3675     redo A;
3676     } elsif ($self->{nc} == 0x0025) { # %
3677     ## XML5: Not defined yet.
3678    
3679     ## TODO:
3680 wakaba 1.24
3681     if (not $self->{stop_processing} and
3682     not $self->{document}->xml_standalone) {
3683     !!!parse-error (type => 'stop processing', ## TODO: type
3684     level => $self->{level}->{info});
3685     $self->{stop_processing} = 1;
3686     }
3687    
3688 wakaba 1.12 !!!next-input-character;
3689     redo A;
3690     } elsif ($self->{nc} == 0x005D) { # ]
3691 wakaba 1.13 delete $self->{in_subset};
3692 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3693     !!!next-input-character;
3694     redo A;
3695     } elsif ($is_space->{$self->{nc}}) {
3696     ## Stay in the state.
3697     !!!next-input-character;
3698     redo A;
3699     } elsif ($self->{nc} == -1) {
3700     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3701 wakaba 1.13 delete $self->{in_subset};
3702 wakaba 1.12 $self->{state} = DATA_STATE;
3703     $self->{s_kwd} = '';
3704     ## Reconsume.
3705 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3706 wakaba 1.12 redo A;
3707     } else {
3708     unless ($self->{internal_subset_tainted}) {
3709     ## XML5: No parse error.
3710     !!!parse-error (type => 'string in internal subset');
3711     $self->{internal_subset_tainted} = 1;
3712     }
3713     ## Stay in the state.
3714     !!!next-input-character;
3715     redo A;
3716     }
3717     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3718     if ($self->{nc} == 0x003E) { # >
3719     $self->{state} = DATA_STATE;
3720     $self->{s_kwd} = '';
3721     !!!next-input-character;
3722 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3723 wakaba 1.12 redo A;
3724     } elsif ($self->{nc} == -1) {
3725     !!!parse-error (type => 'unclosed DOCTYPE');
3726     $self->{state} = DATA_STATE;
3727     $self->{s_kwd} = '';
3728     ## Reconsume.
3729 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3730 wakaba 1.12 redo A;
3731     } else {
3732     ## XML5: No parse error and stay in the state.
3733     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3734    
3735 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3736     !!!next-input-character;
3737     redo A;
3738     }
3739     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3740     if ($self->{nc} == 0x003E) { # >
3741     $self->{state} = DATA_STATE;
3742     $self->{s_kwd} = '';
3743     !!!next-input-character;
3744     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3745     redo A;
3746     } elsif ($self->{nc} == -1) {
3747     $self->{state} = DATA_STATE;
3748     $self->{s_kwd} = '';
3749     ## Reconsume.
3750     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3751     redo A;
3752     } else {
3753     ## Stay in the state.
3754     !!!next-input-character;
3755     redo A;
3756     }
3757     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3758     if ($self->{nc} == 0x0021) { # !
3759 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3760 wakaba 1.13 !!!next-input-character;
3761     redo A;
3762     } elsif ($self->{nc} == 0x003F) { # ?
3763     $self->{state} = PI_STATE;
3764     !!!next-input-character;
3765     redo A;
3766     } elsif ($self->{nc} == -1) {
3767     !!!parse-error (type => 'bare stago');
3768     $self->{state} = DATA_STATE;
3769     $self->{s_kwd} = '';
3770     ## Reconsume.
3771     redo A;
3772     } else {
3773     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3774     line => $self->{line_prev},
3775     column => $self->{column_prev});
3776     $self->{state} = BOGUS_COMMENT_STATE;
3777     $self->{ct} = {type => COMMENT_TOKEN,
3778     data => '',
3779     }; ## NOTE: Will be discarded.
3780 wakaba 1.12 !!!next-input-character;
3781     redo A;
3782     }
3783 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3784     ## XML5: "DOCTYPE markup declaration state".
3785    
3786     if ($self->{nc} == 0x002D) { # -
3787     $self->{state} = MD_HYPHEN_STATE;
3788     !!!next-input-character;
3789     redo A;
3790 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3791     $self->{nc} == 0x0065) { # e
3792 wakaba 1.14 $self->{state} = MD_E_STATE;
3793     $self->{kwd} = chr $self->{nc};
3794     !!!next-input-character;
3795     redo A;
3796 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3797     $self->{nc} == 0x0061) { # a
3798 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3799     $self->{kwd} = chr $self->{nc};
3800     !!!next-input-character;
3801     redo A;
3802 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3803     $self->{nc} == 0x006E) { # n
3804 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3805     $self->{kwd} = chr $self->{nc};
3806     !!!next-input-character;
3807     redo A;
3808     } else {
3809     #
3810     }
3811    
3812     ## XML5: No parse error.
3813     !!!parse-error (type => 'bogus comment',
3814     line => $self->{line_prev},
3815     column => $self->{column_prev} - 1);
3816     ## Reconsume.
3817     $self->{state} = BOGUS_COMMENT_STATE;
3818     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3819     redo A;
3820     } elsif ($self->{state} == MD_E_STATE) {
3821 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3822     $self->{nc} == 0x006E) { # n
3823 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3824     $self->{kwd} .= chr $self->{nc};
3825     !!!next-input-character;
3826     redo A;
3827 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3828     $self->{nc} == 0x006C) { # l
3829 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3830     $self->{state} = MD_ELEMENT_STATE;
3831     $self->{kwd} .= chr $self->{nc};
3832     !!!next-input-character;
3833     redo A;
3834     } else {
3835     ## XML5: No parse error.
3836     !!!parse-error (type => 'bogus comment',
3837     line => $self->{line_prev},
3838     column => $self->{column_prev} - 2
3839     + 1 * ($self->{nc} == -1));
3840     ## Reconsume.
3841     $self->{state} = BOGUS_COMMENT_STATE;
3842     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3843     redo A;
3844     }
3845     } elsif ($self->{state} == MD_ENTITY_STATE) {
3846 wakaba 1.17 if ($self->{nc} == [
3847     undef,
3848     undef,
3849     0x0054, # T
3850     0x0049, # I
3851     0x0054, # T
3852     ]->[length $self->{kwd}] or
3853     $self->{nc} == [
3854     undef,
3855     undef,
3856     0x0074, # t
3857     0x0069, # i
3858     0x0074, # t
3859     ]->[length $self->{kwd}]) {
3860 wakaba 1.14 ## Stay in the state.
3861     $self->{kwd} .= chr $self->{nc};
3862     !!!next-input-character;
3863     redo A;
3864 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3865     ($self->{nc} == 0x0059 or # Y
3866     $self->{nc} == 0x0079)) { # y
3867     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3868     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3869     text => 'ENTITY',
3870     line => $self->{line_prev},
3871     column => $self->{column_prev} - 4);
3872     }
3873     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3874 wakaba 1.14 line => $self->{line_prev},
3875     column => $self->{column_prev} - 6};
3876     $self->{state} = DOCTYPE_MD_STATE;
3877     !!!next-input-character;
3878     redo A;
3879     } else {
3880     !!!parse-error (type => 'bogus comment',
3881     line => $self->{line_prev},
3882     column => $self->{column_prev} - 1
3883     - (length $self->{kwd})
3884     + 1 * ($self->{nc} == -1));
3885     $self->{state} = BOGUS_COMMENT_STATE;
3886     ## Reconsume.
3887     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3888     redo A;
3889     }
3890     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3891 wakaba 1.17 if ($self->{nc} == [
3892     undef,
3893     undef,
3894     0x0045, # E
3895     0x004D, # M
3896     0x0045, # E
3897     0x004E, # N
3898     ]->[length $self->{kwd}] or
3899     $self->{nc} == [
3900     undef,
3901     undef,
3902     0x0065, # e
3903     0x006D, # m
3904     0x0065, # e
3905     0x006E, # n
3906     ]->[length $self->{kwd}]) {
3907 wakaba 1.14 ## Stay in the state.
3908     $self->{kwd} .= chr $self->{nc};
3909     !!!next-input-character;
3910     redo A;
3911 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3912     ($self->{nc} == 0x0054 or # T
3913     $self->{nc} == 0x0074)) { # t
3914     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3915     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3916     text => 'ELEMENT',
3917     line => $self->{line_prev},
3918     column => $self->{column_prev} - 5);
3919     }
3920 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3921     line => $self->{line_prev},
3922 wakaba 1.23 column => $self->{column_prev} - 7};
3923 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3924     !!!next-input-character;
3925     redo A;
3926     } else {
3927     !!!parse-error (type => 'bogus comment',
3928     line => $self->{line_prev},
3929     column => $self->{column_prev} - 1
3930     - (length $self->{kwd})
3931     + 1 * ($self->{nc} == -1));
3932     $self->{state} = BOGUS_COMMENT_STATE;
3933     ## Reconsume.
3934     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3935     redo A;
3936     }
3937     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3938 wakaba 1.17 if ($self->{nc} == [
3939     undef,
3940     0x0054, # T
3941     0x0054, # T
3942     0x004C, # L
3943     0x0049, # I
3944     0x0053, # S
3945     ]->[length $self->{kwd}] or
3946     $self->{nc} == [
3947     undef,
3948     0x0074, # t
3949     0x0074, # t
3950     0x006C, # l
3951     0x0069, # i
3952     0x0073, # s
3953     ]->[length $self->{kwd}]) {
3954 wakaba 1.14 ## Stay in the state.
3955     $self->{kwd} .= chr $self->{nc};
3956     !!!next-input-character;
3957     redo A;
3958 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3959     ($self->{nc} == 0x0054 or # T
3960     $self->{nc} == 0x0074)) { # t
3961     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3962     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3963     text => 'ATTLIST',
3964     line => $self->{line_prev},
3965     column => $self->{column_prev} - 5);
3966     }
3967 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3968 wakaba 1.15 attrdefs => [],
3969 wakaba 1.14 line => $self->{line_prev},
3970 wakaba 1.23 column => $self->{column_prev} - 7};
3971 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3972     !!!next-input-character;
3973     redo A;
3974     } else {
3975     !!!parse-error (type => 'bogus comment',
3976     line => $self->{line_prev},
3977     column => $self->{column_prev} - 1
3978     - (length $self->{kwd})
3979     + 1 * ($self->{nc} == -1));
3980     $self->{state} = BOGUS_COMMENT_STATE;
3981     ## Reconsume.
3982     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3983     redo A;
3984     }
3985     } elsif ($self->{state} == MD_NOTATION_STATE) {
3986 wakaba 1.17 if ($self->{nc} == [
3987     undef,
3988     0x004F, # O
3989     0x0054, # T
3990     0x0041, # A
3991     0x0054, # T
3992     0x0049, # I
3993     0x004F, # O
3994     ]->[length $self->{kwd}] or
3995     $self->{nc} == [
3996     undef,
3997     0x006F, # o
3998     0x0074, # t
3999     0x0061, # a
4000     0x0074, # t
4001     0x0069, # i
4002     0x006F, # o
4003     ]->[length $self->{kwd}]) {
4004 wakaba 1.14 ## Stay in the state.
4005     $self->{kwd} .= chr $self->{nc};
4006     !!!next-input-character;
4007     redo A;
4008 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4009     ($self->{nc} == 0x004E or # N
4010     $self->{nc} == 0x006E)) { # n
4011     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4012     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4013     text => 'NOTATION',
4014     line => $self->{line_prev},
4015     column => $self->{column_prev} - 6);
4016     }
4017 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4018     line => $self->{line_prev},
4019 wakaba 1.23 column => $self->{column_prev} - 8};
4020 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4021     !!!next-input-character;
4022     redo A;
4023     } else {
4024     !!!parse-error (type => 'bogus comment',
4025     line => $self->{line_prev},
4026     column => $self->{column_prev} - 1
4027     - (length $self->{kwd})
4028     + 1 * ($self->{nc} == -1));
4029     $self->{state} = BOGUS_COMMENT_STATE;
4030     ## Reconsume.
4031     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4032     redo A;
4033     }
4034     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4035     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4036     ## "DOCTYPE NOTATION state".
4037    
4038     if ($is_space->{$self->{nc}}) {
4039     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4040     $self->{state} = BEFORE_MD_NAME_STATE;
4041     !!!next-input-character;
4042     redo A;
4043     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4044     $self->{nc} == 0x0025) { # %
4045     ## XML5: Switch to the "DOCTYPE bogus comment state".
4046     !!!parse-error (type => 'no space before md name'); ## TODO: type
4047     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4048     !!!next-input-character;
4049     redo A;
4050     } elsif ($self->{nc} == -1) {
4051     !!!parse-error (type => 'unclosed md'); ## TODO: type
4052     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4053     ## Reconsume.
4054     redo A;
4055     } elsif ($self->{nc} == 0x003E) { # >
4056     ## XML5: Switch to the "DOCTYPE bogus comment state".
4057     !!!parse-error (type => 'no md name'); ## TODO: type
4058     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4059     !!!next-input-character;
4060     redo A;
4061     } else {
4062     ## XML5: Switch to the "DOCTYPE bogus comment state".
4063     !!!parse-error (type => 'no space before md name'); ## TODO: type
4064     $self->{state} = BEFORE_MD_NAME_STATE;
4065     redo A;
4066     }
4067     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4068     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4069     ## before state", "DOCTYPE ATTLIST name before state".
4070    
4071     if ($is_space->{$self->{nc}}) {
4072     ## Stay in the state.
4073     !!!next-input-character;
4074     redo A;
4075     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4076     $self->{nc} == 0x0025) { # %
4077     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4078     !!!next-input-character;
4079     redo A;
4080     } elsif ($self->{nc} == 0x003E) { # >
4081     ## XML5: Same as "Anything else".
4082     !!!parse-error (type => 'no md name'); ## TODO: type
4083     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4084     !!!next-input-character;
4085     redo A;
4086     } elsif ($self->{nc} == -1) {
4087     !!!parse-error (type => 'unclosed md'); ## TODO: type
4088     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4089     ## Reconsume.
4090     redo A;
4091     } else {
4092     ## XML5: [ATTLIST] Not defined yet.
4093     $self->{ct}->{name} .= chr $self->{nc};
4094     $self->{state} = MD_NAME_STATE;
4095     !!!next-input-character;
4096     redo A;
4097     }
4098     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4099     if ($is_space->{$self->{nc}}) {
4100     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4101     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4102     $self->{state} = BEFORE_MD_NAME_STATE;
4103     !!!next-input-character;
4104     redo A;
4105     } elsif ($self->{nc} == 0x003E) { # >
4106     ## XML5: Same as "Anything else".
4107     !!!parse-error (type => 'no md name'); ## TODO: type
4108     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4109     !!!next-input-character;
4110     redo A;
4111     } elsif ($self->{nc} == -1) {
4112     !!!parse-error (type => 'unclosed md');
4113     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4114     ## Reconsume.
4115     redo A;
4116     } else {
4117     ## XML5: No parse error.
4118     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4119     $self->{state} = BOGUS_COMMENT_STATE;
4120     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4121     ## Reconsume.
4122     redo A;
4123     }
4124     } elsif ($self->{state} == MD_NAME_STATE) {
4125     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4126    
4127     if ($is_space->{$self->{nc}}) {
4128 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4129     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4130     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4131 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4132 wakaba 1.16 } else { # ENTITY/NOTATION
4133     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4134     }
4135 wakaba 1.14 !!!next-input-character;
4136     redo A;
4137     } elsif ($self->{nc} == 0x003E) { # >
4138     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4139     #
4140     } else {
4141 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4142 wakaba 1.14 }
4143     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4144     !!!next-input-character;
4145     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4146     redo A;
4147     } elsif ($self->{nc} == -1) {
4148     ## XML5: [ATTLIST] No parse error.
4149     !!!parse-error (type => 'unclosed md');
4150     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4151     ## Reconsume.
4152     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4153     redo A;
4154     } else {
4155     ## XML5: [ATTLIST] Not defined yet.
4156     $self->{ct}->{name} .= chr $self->{nc};
4157     ## Stay in the state.
4158     !!!next-input-character;
4159     redo A;
4160     }
4161     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4162     if ($is_space->{$self->{nc}}) {
4163     ## Stay in the state.
4164     !!!next-input-character;
4165     redo A;
4166     } elsif ($self->{nc} == 0x003E) { # >
4167     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4168     !!!next-input-character;
4169     !!!emit ($self->{ct}); # ATTLIST
4170     redo A;
4171     } elsif ($self->{nc} == -1) {
4172     ## XML5: No parse error.
4173     !!!parse-error (type => 'unclosed md'); ## TODO: type
4174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4175 wakaba 1.15 !!!emit ($self->{ct});
4176     redo A;
4177     } else {
4178     ## XML5: Not defined yet.
4179     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4180     tokens => [],
4181     line => $self->{line}, column => $self->{column}};
4182     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4183     !!!next-input-character;
4184     redo A;
4185     }
4186     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4187     if ($is_space->{$self->{nc}}) {
4188     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4189     !!!next-input-character;
4190     redo A;
4191     } elsif ($self->{nc} == 0x003E) { # >
4192     ## XML5: Same as "anything else".
4193     !!!parse-error (type => 'no attr type'); ## TODO: type
4194     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4195     !!!next-input-character;
4196     !!!emit ($self->{ct}); # ATTLIST
4197     redo A;
4198     } elsif ($self->{nc} == 0x0028) { # (
4199     ## XML5: Same as "anything else".
4200     !!!parse-error (type => 'no space before paren'); ## TODO: type
4201     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4202     !!!next-input-character;
4203     redo A;
4204     } elsif ($self->{nc} == -1) {
4205     ## XML5: No parse error.
4206     !!!parse-error (type => 'unclosed md'); ## TODO: type
4207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4208     !!!next-input-character;
4209     !!!emit ($self->{ct}); # ATTLIST
4210     redo A;
4211     } else {
4212     ## XML5: Not defined yet.
4213     $self->{ca}->{name} .= chr $self->{nc};
4214     ## Stay in the state.
4215     !!!next-input-character;
4216     redo A;
4217     }
4218     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4219     if ($is_space->{$self->{nc}}) {
4220     ## Stay in the state.
4221     !!!next-input-character;
4222     redo A;
4223     } elsif ($self->{nc} == 0x003E) { # >
4224     ## XML5: Same as "anything else".
4225     !!!parse-error (type => 'no attr type'); ## TODO: type
4226     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4227     !!!next-input-character;
4228     !!!emit ($self->{ct}); # ATTLIST
4229     redo A;
4230     } elsif ($self->{nc} == 0x0028) { # (
4231     ## XML5: Same as "anything else".
4232     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4233     !!!next-input-character;
4234     redo A;
4235     } elsif ($self->{nc} == -1) {
4236     ## XML5: No parse error.
4237     !!!parse-error (type => 'unclosed md'); ## TODO: type
4238     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4239     !!!next-input-character;
4240     !!!emit ($self->{ct});
4241 wakaba 1.14 redo A;
4242     } else {
4243     ## XML5: Not defined yet.
4244 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4245     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4246     !!!next-input-character;
4247     redo A;
4248     }
4249     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4250     if ($is_space->{$self->{nc}}) {
4251     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4252     !!!next-input-character;
4253     redo A;
4254     } elsif ($self->{nc} == 0x0023) { # #
4255     ## XML5: Same as "anything else".
4256     !!!parse-error (type => 'no space before default value'); ## TODO: type
4257     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4258     !!!next-input-character;
4259     redo A;
4260     } elsif ($self->{nc} == 0x0022) { # "
4261     ## XML5: Same as "anything else".
4262     !!!parse-error (type => 'no space before default value'); ## TODO: type
4263     $self->{ca}->{value} = '';
4264     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4265     !!!next-input-character;
4266     redo A;
4267     } elsif ($self->{nc} == 0x0027) { # '
4268     ## XML5: Same as "anything else".
4269     !!!parse-error (type => 'no space before default value'); ## TODO: type
4270     $self->{ca}->{value} = '';
4271     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4272     !!!next-input-character;
4273     redo A;
4274     } elsif ($self->{nc} == 0x003E) { # >
4275     ## XML5: Same as "anything else".
4276     !!!parse-error (type => 'no attr default'); ## TODO: type
4277     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4278     !!!next-input-character;
4279     !!!emit ($self->{ct}); # ATTLIST
4280     redo A;
4281     } elsif ($self->{nc} == 0x0028) { # (
4282     ## XML5: Same as "anything else".
4283     !!!parse-error (type => 'no space before paren'); ## TODO: type
4284     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4285     !!!next-input-character;
4286     redo A;
4287     } elsif ($self->{nc} == -1) {
4288     ## XML5: No parse error.
4289     !!!parse-error (type => 'unclosed md'); ## TODO: type
4290     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4291     !!!next-input-character;
4292     !!!emit ($self->{ct});
4293     redo A;
4294     } else {
4295     ## XML5: Not defined yet.
4296     $self->{ca}->{type} .= chr $self->{nc};
4297     ## Stay in the state.
4298     !!!next-input-character;
4299     redo A;
4300     }
4301     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4302     if ($is_space->{$self->{nc}}) {
4303     ## Stay in the state.
4304     !!!next-input-character;
4305     redo A;
4306     } elsif ($self->{nc} == 0x0028) { # (
4307     ## XML5: Same as "anything else".
4308     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4309     !!!next-input-character;
4310     redo A;
4311     } elsif ($self->{nc} == 0x0023) { # #
4312     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4313     !!!next-input-character;
4314     redo A;
4315     } elsif ($self->{nc} == 0x0022) { # "
4316     ## XML5: Same as "anything else".
4317     $self->{ca}->{value} = '';
4318     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4319     !!!next-input-character;
4320     redo A;
4321     } elsif ($self->{nc} == 0x0027) { # '
4322     ## XML5: Same as "anything else".
4323     $self->{ca}->{value} = '';
4324     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4325     !!!next-input-character;
4326     redo A;
4327     } elsif ($self->{nc} == 0x003E) { # >
4328     ## XML5: Same as "anything else".
4329     !!!parse-error (type => 'no attr default'); ## TODO: type
4330     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4331     !!!next-input-character;
4332     !!!emit ($self->{ct}); # ATTLIST
4333     redo A;
4334     } elsif ($self->{nc} == -1) {
4335     ## XML5: No parse error.
4336     !!!parse-error (type => 'unclosed md'); ## TODO: type
4337     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4338     !!!next-input-character;
4339     !!!emit ($self->{ct});
4340     redo A;
4341     } else {
4342     ## XML5: Switch to the "DOCTYPE bogus comment state".
4343     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4344     $self->{ca}->{value} = '';
4345     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4346     ## Reconsume.
4347     redo A;
4348     }
4349     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4350     if ($is_space->{$self->{nc}}) {
4351     ## Stay in the state.
4352     !!!next-input-character;
4353     redo A;
4354     } elsif ($self->{nc} == 0x007C) { # |
4355     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4356     ## Stay in the state.
4357     !!!next-input-character;
4358     redo A;
4359     } elsif ($self->{nc} == 0x0029) { # )
4360     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4361     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4362     !!!next-input-character;
4363     redo A;
4364     } elsif ($self->{nc} == 0x003E) { # >
4365     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367     !!!next-input-character;
4368     !!!emit ($self->{ct}); # ATTLIST
4369     redo A;
4370     } elsif ($self->{nc} == -1) {
4371     ## XML5: No parse error.
4372     !!!parse-error (type => 'unclosed md'); ## TODO: type
4373     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4374     !!!next-input-character;
4375     !!!emit ($self->{ct});
4376     redo A;
4377     } else {
4378     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4379     $self->{state} = ALLOWED_TOKEN_STATE;
4380     !!!next-input-character;
4381     redo A;
4382     }
4383     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4384     if ($is_space->{$self->{nc}}) {
4385     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4386     !!!next-input-character;
4387     redo A;
4388     } elsif ($self->{nc} == 0x007C) { # |
4389     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4390     !!!next-input-character;
4391     redo A;
4392     } elsif ($self->{nc} == 0x0029) { # )
4393     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4394     !!!next-input-character;
4395     redo A;
4396     } elsif ($self->{nc} == 0x003E) { # >
4397     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4399     !!!next-input-character;
4400     !!!emit ($self->{ct}); # ATTLIST
4401     redo A;
4402     } elsif ($self->{nc} == -1) {
4403     ## XML5: No parse error.
4404     !!!parse-error (type => 'unclosed md'); ## TODO: type
4405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4406     !!!next-input-character;
4407     !!!emit ($self->{ct});
4408     redo A;
4409     } else {
4410     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4411     ## Stay in the state.
4412     !!!next-input-character;
4413     redo A;
4414     }
4415     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4416     if ($is_space->{$self->{nc}}) {
4417     ## Stay in the state.
4418     !!!next-input-character;
4419     redo A;
4420     } elsif ($self->{nc} == 0x007C) { # |
4421     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4422     !!!next-input-character;
4423     redo A;
4424     } elsif ($self->{nc} == 0x0029) { # )
4425     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4426     !!!next-input-character;
4427     redo A;
4428     } elsif ($self->{nc} == 0x003E) { # >
4429     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4430     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4431     !!!next-input-character;
4432     !!!emit ($self->{ct}); # ATTLIST
4433     redo A;
4434     } elsif ($self->{nc} == -1) {
4435     ## XML5: No parse error.
4436     !!!parse-error (type => 'unclosed md'); ## TODO: type
4437     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4438     !!!next-input-character;
4439     !!!emit ($self->{ct});
4440     redo A;
4441     } else {
4442     !!!parse-error (type => 'space in allowed token', ## TODO: type
4443     line => $self->{line_prev},
4444     column => $self->{column_prev});
4445     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4446     $self->{state} = ALLOWED_TOKEN_STATE;
4447     !!!next-input-character;
4448     redo A;
4449     }
4450     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4451     if ($is_space->{$self->{nc}}) {
4452     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4453     !!!next-input-character;
4454     redo A;
4455     } elsif ($self->{nc} == 0x0023) { # #
4456     !!!parse-error (type => 'no space before default value'); ## TODO: type
4457     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4458     !!!next-input-character;
4459     redo A;
4460     } elsif ($self->{nc} == 0x0022) { # "
4461     !!!parse-error (type => 'no space before default value'); ## TODO: type
4462     $self->{ca}->{value} = '';
4463     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4464     !!!next-input-character;
4465     redo A;
4466     } elsif ($self->{nc} == 0x0027) { # '
4467     !!!parse-error (type => 'no space before default value'); ## TODO: type
4468     $self->{ca}->{value} = '';
4469     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4470     !!!next-input-character;
4471     redo A;
4472     } elsif ($self->{nc} == 0x003E) { # >
4473     !!!parse-error (type => 'no attr default'); ## TODO: type
4474     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4475     !!!next-input-character;
4476     !!!emit ($self->{ct}); # ATTLIST
4477     redo A;
4478     } elsif ($self->{nc} == -1) {
4479     !!!parse-error (type => 'unclosed md'); ## TODO: type
4480     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4481     !!!next-input-character;
4482     !!!emit ($self->{ct});
4483     redo A;
4484     } else {
4485     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4486     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4487     ## Reconsume.
4488     redo A;
4489     }
4490     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4491     if ($is_space->{$self->{nc}}) {
4492     ## Stay in the state.
4493     !!!next-input-character;
4494     redo A;
4495     } elsif ($self->{nc} == 0x0023) { # #
4496     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4497     !!!next-input-character;
4498     redo A;
4499     } elsif ($self->{nc} == 0x0022) { # "
4500     $self->{ca}->{value} = '';
4501     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4502     !!!next-input-character;
4503     redo A;
4504     } elsif ($self->{nc} == 0x0027) { # '
4505     $self->{ca}->{value} = '';
4506     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4507     !!!next-input-character;
4508     redo A;
4509     } elsif ($self->{nc} == 0x003E) { # >
4510     !!!parse-error (type => 'no attr default'); ## TODO: type
4511     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4512     !!!next-input-character;
4513     !!!emit ($self->{ct}); # ATTLIST
4514     redo A;
4515     } elsif ($self->{nc} == -1) {
4516     !!!parse-error (type => 'unclosed md'); ## TODO: type
4517     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4518     !!!next-input-character;
4519     !!!emit ($self->{ct});
4520     redo A;
4521     } else {
4522     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4523     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4524     ## Reconsume.
4525     redo A;
4526     }
4527     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4528     if ($is_space->{$self->{nc}}) {
4529     ## XML5: No parse error.
4530     !!!parse-error (type => 'no default type'); ## TODO: type
4531 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4532 wakaba 1.14 ## Reconsume.
4533     redo A;
4534 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4535     ## XML5: Same as "anything else".
4536     $self->{ca}->{value} = '';
4537     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4538     !!!next-input-character;
4539     redo A;
4540     } elsif ($self->{nc} == 0x0027) { # '
4541     ## XML5: Same as "anything else".
4542     $self->{ca}->{value} = '';
4543     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4544     !!!next-input-character;
4545     redo A;
4546     } elsif ($self->{nc} == 0x003E) { # >
4547     ## XML5: Same as "anything else".
4548     !!!parse-error (type => 'no attr default'); ## TODO: type
4549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4550     !!!next-input-character;
4551     !!!emit ($self->{ct}); # ATTLIST
4552     redo A;
4553     } elsif ($self->{nc} == -1) {
4554     ## XML5: No parse error.
4555     !!!parse-error (type => 'unclosed md'); ## TODO: type
4556     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4557     !!!next-input-character;
4558     !!!emit ($self->{ct});
4559     redo A;
4560     } else {
4561     $self->{ca}->{default} = chr $self->{nc};
4562     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4563     !!!next-input-character;
4564     redo A;
4565 wakaba 1.14 }
4566 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4567     if ($is_space->{$self->{nc}}) {
4568     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4569     !!!next-input-character;
4570     redo A;
4571     } elsif ($self->{nc} == 0x0022) { # "
4572     ## XML5: Same as "anything else".
4573     !!!parse-error (type => 'no space before default value'); ## TODO: type
4574     $self->{ca}->{value} = '';
4575     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4576     !!!next-input-character;
4577     redo A;
4578     } elsif ($self->{nc} == 0x0027) { # '
4579     ## XML5: Same as "anything else".
4580     !!!parse-error (type => 'no space before default value'); ## TODO: type
4581     $self->{ca}->{value} = '';
4582     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4583     !!!next-input-character;
4584     redo A;
4585     } elsif ($self->{nc} == 0x003E) { # >
4586     ## XML5: Same as "anything else".
4587     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4588     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4589     !!!next-input-character;
4590     !!!emit ($self->{ct}); # ATTLIST
4591     redo A;
4592     } elsif ($self->{nc} == -1) {
4593     ## XML5: No parse error.
4594     !!!parse-error (type => 'unclosed md'); ## TODO: type
4595     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4597     !!!next-input-character;
4598     !!!emit ($self->{ct});
4599     redo A;
4600     } else {
4601     $self->{ca}->{default} .= chr $self->{nc};
4602     ## Stay in the state.
4603     !!!next-input-character;
4604     redo A;
4605     }
4606     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4607     if ($is_space->{$self->{nc}}) {
4608     ## Stay in the state.
4609     !!!next-input-character;
4610     redo A;
4611     } elsif ($self->{nc} == 0x0022) { # "
4612     $self->{ca}->{value} = '';
4613     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4614     !!!next-input-character;
4615     redo A;
4616     } elsif ($self->{nc} == 0x0027) { # '
4617     $self->{ca}->{value} = '';
4618     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4619     !!!next-input-character;
4620     redo A;
4621     } elsif ($self->{nc} == 0x003E) { # >
4622     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4623     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4624     !!!next-input-character;
4625     !!!emit ($self->{ct}); # ATTLIST
4626     redo A;
4627     } elsif ($self->{nc} == -1) {
4628     ## XML5: No parse error.
4629     !!!parse-error (type => 'unclosed md'); ## TODO: type
4630     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4631     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4632     !!!next-input-character;
4633     !!!emit ($self->{ct});
4634     redo A;
4635     } else {
4636     ## XML5: Not defined yet.
4637     if ($self->{ca}->{default} eq 'FIXED') {
4638     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4639     } else {
4640     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4641     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4642     }
4643     ## Reconsume.
4644     redo A;
4645     }
4646     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4647     if ($is_space->{$self->{nc}} or
4648     $self->{nc} == -1 or
4649     $self->{nc} == 0x003E) { # >
4650     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4651     ## Reconsume.
4652     redo A;
4653     } else {
4654     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4655     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4656     ## Reconsume.
4657     redo A;
4658 wakaba 1.16 }
4659 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4660     ## ASCII case-insensitive
4661     if ($self->{nc} == [
4662     undef,
4663     0x0044, # D
4664     0x0041, # A
4665     0x0054, # T
4666     ]->[length $self->{kwd}] or
4667     $self->{nc} == [
4668     undef,
4669     0x0064, # d
4670     0x0061, # a
4671     0x0074, # t
4672     ]->[length $self->{kwd}]) {
4673     !!!cp (172.2);
4674     ## Stay in the state.
4675     $self->{kwd} .= chr $self->{nc};
4676     !!!next-input-character;
4677     redo A;
4678     } elsif ((length $self->{kwd}) == 4 and
4679     ($self->{nc} == 0x0041 or # A
4680     $self->{nc} == 0x0061)) { # a
4681     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4682     !!!cp (172.3);
4683     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4684     text => 'NDATA',
4685     line => $self->{line_prev},
4686     column => $self->{column_prev} - 4);
4687     } else {
4688     !!!cp (172.4);
4689     }
4690     $self->{state} = AFTER_NDATA_STATE;
4691     !!!next-input-character;
4692     redo A;
4693     } else {
4694     !!!parse-error (type => 'string after literal', ## TODO: type
4695     line => $self->{line_prev},
4696     column => $self->{column_prev} + 1
4697     - length $self->{kwd});
4698     !!!cp (172.5);
4699     $self->{state} = BOGUS_MD_STATE;
4700     ## Reconsume.
4701     redo A;
4702     }
4703     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4704     if ($is_space->{$self->{nc}}) {
4705     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4706     !!!next-input-character;
4707     redo A;
4708     } elsif ($self->{nc} == 0x003E) { # >
4709     !!!parse-error (type => 'no notation name'); ## TODO: type
4710     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4711     !!!next-input-character;
4712     !!!emit ($self->{ct}); # ENTITY
4713     redo A;
4714     } elsif ($self->{nc} == -1) {
4715     !!!parse-error (type => 'unclosed md'); ## TODO: type
4716     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4717     !!!next-input-character;
4718     !!!emit ($self->{ct}); # ENTITY
4719     redo A;
4720     } else {
4721     !!!parse-error (type => 'string after literal', ## TODO: type
4722     line => $self->{line_prev},
4723     column => $self->{column_prev} + 1
4724     - length $self->{kwd});
4725     $self->{state} = BOGUS_MD_STATE;
4726     ## Reconsume.
4727     redo A;
4728     }
4729     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4730     if ($is_space->{$self->{nc}}) {
4731     ## Stay in the state.
4732     !!!next-input-character;
4733     redo A;
4734     } elsif ($self->{nc} == 0x003E) { # >
4735     !!!parse-error (type => 'no notation name'); ## TODO: type
4736     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4737     !!!next-input-character;
4738     !!!emit ($self->{ct}); # ENTITY
4739     redo A;
4740     } elsif ($self->{nc} == -1) {
4741     !!!parse-error (type => 'unclosed md'); ## TODO: type
4742     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4743     !!!next-input-character;
4744     !!!emit ($self->{ct}); # ENTITY
4745     redo A;
4746     } else {
4747     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4748     $self->{state} = NOTATION_NAME_STATE;
4749     !!!next-input-character;
4750     redo A;
4751     }
4752     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4753     if ($is_space->{$self->{nc}}) {
4754 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4755 wakaba 1.18 !!!next-input-character;
4756     redo A;
4757     } elsif ($self->{nc} == 0x003E) { # >
4758     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4759     !!!next-input-character;
4760     !!!emit ($self->{ct}); # ENTITY
4761     redo A;
4762     } elsif ($self->{nc} == -1) {
4763     !!!parse-error (type => 'unclosed md'); ## TODO: type
4764     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4765     !!!next-input-character;
4766     !!!emit ($self->{ct}); # ENTITY
4767     redo A;
4768     } else {
4769     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4770     ## Stay in the state.
4771     !!!next-input-character;
4772     redo A;
4773     }
4774 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4775     if ($self->{nc} == 0x0022) { # "
4776 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4777 wakaba 1.19 !!!next-input-character;
4778     redo A;
4779     } elsif ($self->{nc} == 0x0026) { # &
4780     $self->{prev_state} = $self->{state};
4781     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4782     $self->{entity_add} = 0x0022; # "
4783     !!!next-input-character;
4784     redo A;
4785     ## TODO: %
4786     } elsif ($self->{nc} == -1) {
4787     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4788     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4789     ## Reconsume.
4790     !!!emit ($self->{ct}); # ENTITY
4791     redo A;
4792     } else {
4793     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4794     !!!next-input-character;
4795     redo A;
4796     }
4797     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4798     if ($self->{nc} == 0x0027) { # '
4799 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4800 wakaba 1.19 !!!next-input-character;
4801     redo A;
4802     } elsif ($self->{nc} == 0x0026) { # &
4803     $self->{prev_state} = $self->{state};
4804     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4805     $self->{entity_add} = 0x0027; # '
4806     !!!next-input-character;
4807     redo A;
4808     ## TODO: %
4809     } elsif ($self->{nc} == -1) {
4810     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4811     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4812     ## Reconsume.
4813     !!!emit ($self->{ct}); # ENTITY
4814     redo A;
4815     } else {
4816     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4817     !!!next-input-character;
4818     redo A;
4819     }
4820     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4821     if ($is_space->{$self->{nc}} or
4822     {
4823     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4824     $self->{entity_add} => 1,
4825     }->{$self->{nc}}) {
4826 wakaba 1.22 !!!parse-error (type => 'bare ero',
4827     line => $self->{line_prev},
4828     column => $self->{column_prev}
4829     + ($self->{nc} == -1 ? 1 : 0));
4830 wakaba 1.19 ## Don't consume
4831     ## Return nothing.
4832     #
4833     } elsif ($self->{nc} == 0x0023) { # #
4834     $self->{ca} = $self->{ct};
4835     $self->{state} = ENTITY_HASH_STATE;
4836     $self->{kwd} = '#';
4837     !!!next-input-character;
4838     redo A;
4839     } else {
4840     #
4841     }
4842    
4843     $self->{ct}->{value} .= '&';
4844     $self->{state} = $self->{prev_state};
4845     ## Reconsume.
4846     redo A;
4847 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4848     if ($is_space->{$self->{nc}}) {
4849     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4850     !!!next-input-character;
4851     redo A;
4852     } elsif ($self->{nc} == 0x0028) { # (
4853     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4854     $self->{ct}->{content} = ['('];
4855     $self->{group_depth} = 1;
4856     !!!next-input-character;
4857     redo A;
4858     } elsif ($self->{nc} == 0x003E) { # >
4859     !!!parse-error (type => 'no md def'); ## TODO: type
4860     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4861     !!!next-input-character;
4862     !!!emit ($self->{ct}); # ELEMENT
4863     redo A;
4864     } elsif ($self->{nc} == -1) {
4865     !!!parse-error (type => 'unclosed md'); ## TODO: type
4866     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4867     !!!next-input-character;
4868     !!!emit ($self->{ct}); # ELEMENT
4869     redo A;
4870     } else {
4871     $self->{ct}->{content} = [chr $self->{nc}];
4872     $self->{state} = CONTENT_KEYWORD_STATE;
4873     !!!next-input-character;
4874     redo A;
4875     }
4876     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4877     if ($is_space->{$self->{nc}}) {
4878     $self->{state} = AFTER_MD_DEF_STATE;
4879     !!!next-input-character;
4880     redo A;
4881     } elsif ($self->{nc} == 0x003E) { # >
4882     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4883     !!!next-input-character;
4884     !!!emit ($self->{ct}); # ELEMENT
4885     redo A;
4886     } elsif ($self->{nc} == -1) {
4887     !!!parse-error (type => 'unclosed md'); ## TODO: type
4888     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4889     !!!next-input-character;
4890     !!!emit ($self->{ct}); # ELEMENT
4891     redo A;
4892     } else {
4893     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4894     ## Stay in the state.
4895     !!!next-input-character;
4896     redo A;
4897     }
4898     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4899     if ($is_space->{$self->{nc}}) {
4900     ## Stay in the state.
4901     !!!next-input-character;
4902     redo A;
4903     } elsif ($self->{nc} == 0x0028) { # (
4904     $self->{group_depth}++;
4905     push @{$self->{ct}->{content}}, chr $self->{nc};
4906     ## Stay in the state.
4907     !!!next-input-character;
4908     redo A;
4909     } elsif ($self->{nc} == 0x007C or # |
4910     $self->{nc} == 0x002C) { # ,
4911     !!!parse-error (type => 'empty element name'); ## TODO: type
4912     ## Stay in the state.
4913     !!!next-input-character;
4914     redo A;
4915     } elsif ($self->{nc} == 0x0029) { # )
4916     !!!parse-error (type => 'empty element name'); ## TODO: type
4917     push @{$self->{ct}->{content}}, chr $self->{nc};
4918     $self->{group_depth}--;
4919     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4920     !!!next-input-character;
4921     redo A;
4922     } elsif ($self->{nc} == 0x003E) { # >
4923     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4924     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4925     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4926     !!!next-input-character;
4927     !!!emit ($self->{ct}); # ELEMENT
4928     redo A;
4929     } elsif ($self->{nc} == -1) {
4930     !!!parse-error (type => 'unclosed md'); ## TODO: type
4931     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4932     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4933     !!!next-input-character;
4934     !!!emit ($self->{ct}); # ELEMENT
4935     redo A;
4936     } else {
4937     push @{$self->{ct}->{content}}, chr $self->{nc};
4938     $self->{state} = CM_ELEMENT_NAME_STATE;
4939     !!!next-input-character;
4940     redo A;
4941     }
4942     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4943     if ($is_space->{$self->{nc}}) {
4944     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4945     !!!next-input-character;
4946     redo A;
4947     } elsif ($self->{nc} == 0x002A or # *
4948     $self->{nc} == 0x002B or # +
4949     $self->{nc} == 0x003F) { # ?
4950     push @{$self->{ct}->{content}}, chr $self->{nc};
4951     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4952     !!!next-input-character;
4953     redo A;
4954     } elsif ($self->{nc} == 0x007C or # |
4955     $self->{nc} == 0x002C) { # ,
4956     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4957     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4958     !!!next-input-character;
4959     redo A;
4960     } elsif ($self->{nc} == 0x0029) { # )
4961     $self->{group_depth}--;
4962     push @{$self->{ct}->{content}}, chr $self->{nc};
4963     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4964     !!!next-input-character;
4965     redo A;
4966     } elsif ($self->{nc} == 0x003E) { # >
4967     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4968     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4970     !!!next-input-character;
4971     !!!emit ($self->{ct}); # ELEMENT
4972     redo A;
4973     } elsif ($self->{nc} == -1) {
4974     !!!parse-error (type => 'unclosed md'); ## TODO: type
4975     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4976     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4977     !!!next-input-character;
4978     !!!emit ($self->{ct}); # ELEMENT
4979     redo A;
4980     } else {
4981     $self->{ct}->{content}->[-1] .= chr $self->{nc};
4982     ## Stay in the state.
4983     !!!next-input-character;
4984     redo A;
4985     }
4986     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4987     if ($is_space->{$self->{nc}}) {
4988     ## Stay in the state.
4989     !!!next-input-character;
4990     redo A;
4991     } elsif ($self->{nc} == 0x007C or # |
4992     $self->{nc} == 0x002C) { # ,
4993     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4994     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4995     !!!next-input-character;
4996     redo A;
4997     } elsif ($self->{nc} == 0x0029) { # )
4998     $self->{group_depth}--;
4999     push @{$self->{ct}->{content}}, chr $self->{nc};
5000     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5001     !!!next-input-character;
5002     redo A;
5003     } elsif ($self->{nc} == 0x003E) { # >
5004     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5005     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5006     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5007     !!!next-input-character;
5008     !!!emit ($self->{ct}); # ELEMENT
5009     redo A;
5010     } elsif ($self->{nc} == -1) {
5011     !!!parse-error (type => 'unclosed md'); ## TODO: type
5012     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5013     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5014     !!!next-input-character;
5015     !!!emit ($self->{ct}); # ELEMENT
5016     redo A;
5017     } else {
5018     !!!parse-error (type => 'after element name'); ## TODO: type
5019     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5020     $self->{state} = BOGUS_MD_STATE;
5021     !!!next-input-character;
5022     redo A;
5023     }
5024     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5025     if ($is_space->{$self->{nc}}) {
5026     if ($self->{group_depth}) {
5027     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5028     } else {
5029     $self->{state} = AFTER_MD_DEF_STATE;
5030     }
5031     !!!next-input-character;
5032     redo A;
5033     } elsif ($self->{nc} == 0x002A or # *
5034     $self->{nc} == 0x002B or # +
5035     $self->{nc} == 0x003F) { # ?
5036     push @{$self->{ct}->{content}}, chr $self->{nc};
5037     if ($self->{group_depth}) {
5038     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5039     } else {
5040     $self->{state} = AFTER_MD_DEF_STATE;
5041     }
5042     !!!next-input-character;
5043     redo A;
5044     } elsif ($self->{nc} == 0x0029) { # )
5045     if ($self->{group_depth}) {
5046     $self->{group_depth}--;
5047     push @{$self->{ct}->{content}}, chr $self->{nc};
5048     ## Stay in the state.
5049     !!!next-input-character;
5050     redo A;
5051     } else {
5052     !!!parse-error (type => 'string after md def'); ## TODO: type
5053     $self->{state} = BOGUS_MD_STATE;
5054     ## Reconsume.
5055     redo A;
5056     }
5057     } elsif ($self->{nc} == 0x003E) { # >
5058     if ($self->{group_depth}) {
5059     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5060     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5061     }
5062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5063     !!!next-input-character;
5064     !!!emit ($self->{ct}); # ELEMENT
5065     redo A;
5066     } elsif ($self->{nc} == -1) {
5067     !!!parse-error (type => 'unclosed md'); ## TODO: type
5068     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5070     !!!next-input-character;
5071     !!!emit ($self->{ct}); # ELEMENT
5072     redo A;
5073     } else {
5074     if ($self->{group_depth}) {
5075     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5076     } else {
5077     !!!parse-error (type => 'string after md def'); ## TODO: type
5078     $self->{state} = BOGUS_MD_STATE;
5079     }
5080     ## Reconsume.
5081     redo A;
5082     }
5083     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5084 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5085     ## Stay in the state.
5086     !!!next-input-character;
5087     redo A;
5088     } elsif ($self->{nc} == 0x003E) { # >
5089     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5090     !!!next-input-character;
5091 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5092 wakaba 1.18 redo A;
5093     } elsif ($self->{nc} == -1) {
5094     !!!parse-error (type => 'unclosed md'); ## TODO: type
5095     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5096     !!!next-input-character;
5097 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5098 wakaba 1.18 redo A;
5099     } else {
5100 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5101 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5102     ## Reconsume.
5103     redo A;
5104     }
5105 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5106     if ($self->{nc} == 0x003E) { # >
5107     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5108     !!!next-input-character;
5109     !!!emit ($self->{ct}); # ATTLIST/ENTITY/ELEMENT/NOTATION
5110     redo A;
5111     } elsif ($self->{nc} == -1) {
5112     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5113     ## Reconsume.
5114     !!!emit ($self->{ct}); # ATTLIST/ENTITY/ELEMENT/NOTATION
5115     redo A;
5116     } else {
5117     ## Stay in the state.
5118     !!!next-input-character;
5119     redo A;
5120     }
5121 wakaba 1.1 } else {
5122     die "$0: $self->{state}: Unknown state";
5123     }
5124     } # A
5125    
5126     die "$0: _get_next_token: unexpected case";
5127     } # _get_next_token
5128    
5129     1;
5130 wakaba 1.25 ## $Date: 2008/10/19 14:05:20 $
5131 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24