/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (hide annotations) (download) (as text)
Sat Oct 18 11:34:49 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.15: +321 -175 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	18 Oct 2008 11:34:40 -0000
2008-10-18  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/notations-1.dat" added.

++ whatpm/t/xml/ChangeLog	18 Oct 2008 11:25:41 -0000
	* attlists-1.dat: A test result updated.

	* notations-1.dat: New test result file.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	18 Oct 2008 11:31:41 -0000
	* NanoDOM.pm (public_id, system_id): New attributes.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	18 Oct 2008 11:34:04 -0000
	* Tokenizer.pm.src: Modifies PUBLIC/SYSTEM identifier tokenizer
	states such that <!ENTITY> and <!NOTATION> can be tokenized by
	those states as well.
	(BOGUS_MD_STATE): A new state; used for bogus markup declarations,
	in favor of BOGUS_COMMENT_STATE.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	18 Oct 2008 11:34:26 -0000
	* Parser.pm.src: Set public_id and system_id attributes of Entity
	and Notation nodes.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

## Package header for the tokenizer: declares the package, computes
## $VERSION and sets up the Exporter interface for the token type
## constants.
1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
## $VERSION is built from the digits of the CVS |Revision| keyword: the
## first number is printed as-is and each following number is printed
## with two digits (|%02d|).
3 wakaba 1.16 our $VERSION=do{my @r=(q$Revision: 1.15 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## The export lists are set up in a BEGIN block, i.e. at compile time,
## so that an |import| call made from another BEGIN block later in
## this file can already see them.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that can be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag lists the same names as @EXPORT_OK; the two lists
## have to be kept in sync by hand when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
## Each constant below is a sub with an empty prototype that returns a
## distinct small integer. The value is what is stored in the |type|
## field of a token. Values are consecutive (1..14); keep them unique
## when adding a new token type.
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
## From here on, definitions are placed in the |Whatpm::HTML| package
## rather than in |Whatpm::HTML::Tokenizer|.
73     package Whatpm::HTML;
74    
## Import the token type constants (the |:token| export tag) into this
## package at compile time.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
## The three |CM_*| constants are single-bit flags; the four
## |*_CONTENT_MODEL| constants are the bit combinations assigned to
## |$self->{content_model}| and tested with |&| against the flags.
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
## PLAINTEXT: no flag set. CDATA: limited "<" markup only.
## RCDATA: "&" plus limited "<" markup. PCDATA: "&" plus full "<" markup.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
## Each constant is the integer stored in |$self->{state}| (and in
## |$self->{prev_state}|) to identify the current tokenizer state.
## The values 1 and 12 are unused: the two states that had them are
## commented out below because they are implemented by the
## |ENTITY_STATE| family instead.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
## States 36..43 split a single state of the spec into several
## implementation states; the trailing comment names the spec state.
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
## These continue the numbering of the tokenizer state constants
## (51..85). The |PI_*| states cover processing instructions; the
## |DOCTYPE_*|, |MD_*| and |*ALLOWED_TOKEN*| states cover the DOCTYPE
## internal subset and its markup declarations.
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
## Used for bogus markup declarations, in favor of |BOGUS_COMMENT_STATE|
## (per the ChangeLog entry for revision 1.16).
180 wakaba 1.16 sub BOGUS_MD_STATE () { 85 }
181 wakaba 1.8
182 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
183     ## list and descriptions)
## NOTE(review): both constants below evaluate to the same number
## (binary 1 followed by eleven zeros, i.e. 2048); only the spelling
## of the literal differs. Whether they are meant to share a value
## cannot be determined from this file -- confirm against the full
## list in Whatpm::HTML.
184    
185     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
186     sub FOREIGN_EL () { 0b1_00000000000 }
187    
188     ## Character reference mappings
## |$charref_map| maps a code point to the code point that replaces
## it; both keys and values are integers. Presumably it is consulted
## when resolving numeric character references (the lookup itself is
## not in this part of the file).
##
## The literal entries map CARRIAGE RETURN (0x0D) to LINE FEED and
## remap every code point in 0x80..0x9F. The targets appear to be the
## Windows-1252 interpretations of those bytes, with 0xFFFD (REPLACEMENT
## CHARACTER) where no such character exists -- confirm against the
## spec's table before editing.
189    
190     my $charref_map = {
191     0x0D => 0x000A,
192     0x80 => 0x20AC,
193     0x81 => 0xFFFD,
194     0x82 => 0x201A,
195     0x83 => 0x0192,
196     0x84 => 0x201E,
197     0x85 => 0x2026,
198     0x86 => 0x2020,
199     0x87 => 0x2021,
200     0x88 => 0x02C6,
201     0x89 => 0x2030,
202     0x8A => 0x0160,
203     0x8B => 0x2039,
204     0x8C => 0x0152,
205     0x8D => 0xFFFD,
206     0x8E => 0x017D,
207     0x8F => 0xFFFD,
208     0x90 => 0xFFFD,
209     0x91 => 0x2018,
210     0x92 => 0x2019,
211     0x93 => 0x201C,
212     0x94 => 0x201D,
213     0x95 => 0x2022,
214     0x96 => 0x2013,
215     0x97 => 0x2014,
216     0x98 => 0x02DC,
217     0x99 => 0x2122,
218     0x9A => 0x0161,
219     0x9B => 0x203A,
220     0x9C => 0x0153,
221     0x9D => 0xFFFD,
222     0x9E => 0x017E,
223     0x9F => 0x0178,
224     }; # $charref_map
## Every remaining code point listed below is mapped to 0xFFFD: C0
## controls other than HT, LF, FF and CR, DELETE (0x7F), the surrogate
## range 0xD800..0xDFFF, 0xFDD0..0xFDDF, and the last two code points
## of each plane (0x..FFFE / 0x..FFFF).
## The |ISSUE| marker records that the range stops at 0xFDDF, while
## the Unicode noncharacter block extends to 0xFDEF.
225     $charref_map->{$_} = 0xFFFD
226     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
227     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
228     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
229     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
230     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
231     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
232     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
233    
234     ## Implementations MUST act as if they used the state machine in the spec.
235    
## Resets the tokenizer-related fields of the parser object before
## tokenization starts.
##
## Argument: $self -- the parser object (a hash reference).
## Effects: sets the state to |DATA_STATE| and the content model to
## |PCDATA_CONTENT_MODEL|, clears the current token, the current
## attribute, the last start tag name and the self-closing flag,
## empties the character buffer and the pending token list, and then
## invokes the |!!!next-input-character| macro.
## Fields that appear only in commented-out lines are not touched here;
## they are either set by |new| or initialized where they are first
## used, as noted on each line.
236     sub _initialize_tokenizer ($) {
237     my $self = shift;
238    
239     ## NOTE: Fields set by |new| constructor:
240     #$self->{level}
241     #$self->{set_nc}
242     #$self->{parse_error}
243 wakaba 1.3 #$self->{is_xml} (if XML)
244 wakaba 1.1
245     $self->{state} = DATA_STATE; # MUST
246 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
247     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
248 wakaba 1.1 #$self->{entity__value}; # initialized when used
249     #$self->{entity__match}; # initialized when used
250     $self->{content_model} = PCDATA_CONTENT_MODEL; # start in the PCDATA content model
251     undef $self->{ct}; # current token
252     undef $self->{ca}; # current attribute
253     undef $self->{last_stag_name}; # last emitted start tag name
254     #$self->{prev_state}; # initialized when used
255     delete $self->{self_closing};
256     $self->{char_buffer} = '';
257     $self->{char_buffer_pos} = 0;
## -1 is the value the tokenizer compares |nc| against to detect the
## end of the input.
258     $self->{nc} = -1; # next input character
259     #$self->{next_nc}
## |!!!name| lines are macros, not Perl; presumably the build step that
## turns this .src file into the .pm expands this one into code that
## reads the first input character into |$self->{nc}| -- confirm
## against the macro definition.
260     !!!next-input-character;
## List of tokens waiting to be returned; |_get_next_token| shifts
## from it before running the state machine.
261     $self->{token} = [];
262     # $self->{escape}
263     } # _initialize_tokenizer
264    
265     ## A token has:
266     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
267 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
268 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
269     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
270 wakaba 1.11 ## ->{target} (PI_TOKEN)
271 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
272     ## ->{sysid} (DOCTYPE_TOKEN)
273     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
274     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
275     ## ->{name}
276     ## ->{value}
277     ## ->{has_reference} == 1 or 0
278 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
279     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
280 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
281 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
282 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
283    
284 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
285     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
286     ## while the token is pushed back onto |$self->{token}|.
287    
288     ## Emitted token MUST immediately be handled by the tree construction state.
289    
290     ## Before each step, UA MAY check to see if either one of the scripts in
291     ## "list of scripts that will execute as soon as possible" or the first
292     ## script in the "list of scripts that will execute asynchronously",
293     ## has completed loading. If one has, then it MUST be executed
294     ## and removed from the list.
295    
296     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
297     ## (This requirement was dropped from HTML5 spec, unfortunately.)
298    
## Lookup table of the code points treated as space characters by the
## tokenizer: keys are integer code points, values are true. It is
## tested as |$is_space->{$self->{nc}}|, so any code point that is not
## a key (including the commented-out VT and CR entries) is not a space.
299     my $is_space = {
300     0x0009 => 1, # CHARACTER TABULATION (HT)
301     0x000A => 1, # LINE FEED (LF)
302     #0x000B => 0, # LINE TABULATION (VT)
303 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
304 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
305     0x0020 => 1, # SPACE (SP)
306     };
307    
308     sub _get_next_token ($) {
309     my $self = shift;
310    
311     if ($self->{self_closing}) {
312     !!!parse-error (type => 'nestc', token => $self->{ct});
313     ## NOTE: The |self_closing| flag is only set by start tag token.
314     ## In addition, when a start tag token is emitted, it is always set to
315     ## |ct|.
316     delete $self->{self_closing};
317     }
318    
319     if (@{$self->{token}}) {
320     $self->{self_closing} = $self->{token}->[0]->{self_closing};
321     return shift @{$self->{token}};
322     }
323    
324     A: {
325     if ($self->{state} == PCDATA_STATE) {
326     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
327    
328     if ($self->{nc} == 0x0026) { # &
329     !!!cp (0.1);
330     ## NOTE: In the spec, the tokenizer is switched to the
331     ## "entity data state". In this implementation, the tokenizer
332     ## is switched to the |ENTITY_STATE|, which is an implementation
333     ## of the "consume a character reference" algorithm.
334     $self->{entity_add} = -1;
335     $self->{prev_state} = DATA_STATE;
336     $self->{state} = ENTITY_STATE;
337     !!!next-input-character;
338     redo A;
339     } elsif ($self->{nc} == 0x003C) { # <
340     !!!cp (0.2);
341     $self->{state} = TAG_OPEN_STATE;
342     !!!next-input-character;
343     redo A;
344     } elsif ($self->{nc} == -1) {
345     !!!cp (0.3);
346     !!!emit ({type => END_OF_FILE_TOKEN,
347     line => $self->{line}, column => $self->{column}});
348     last A; ## TODO: ok?
349     } else {
350     !!!cp (0.4);
351     #
352     }
353    
354     # Anything else
355     my $token = {type => CHARACTER_TOKEN,
356     data => chr $self->{nc},
357     line => $self->{line}, column => $self->{column},
358     };
359     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
360    
361     ## Stay in the state.
362     !!!next-input-character;
363     !!!emit ($token);
364     redo A;
365     } elsif ($self->{state} == DATA_STATE) {
366     $self->{s_kwd} = '' unless defined $self->{s_kwd};
367     if ($self->{nc} == 0x0026) { # &
368     $self->{s_kwd} = '';
369     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
370     not $self->{escape}) {
371     !!!cp (1);
372     ## NOTE: In the spec, the tokenizer is switched to the
373     ## "entity data state". In this implementation, the tokenizer
374     ## is switched to the |ENTITY_STATE|, which is an implementation
375     ## of the "consume a character reference" algorithm.
376     $self->{entity_add} = -1;
377     $self->{prev_state} = DATA_STATE;
378     $self->{state} = ENTITY_STATE;
379     !!!next-input-character;
380     redo A;
381     } else {
382     !!!cp (2);
383     #
384     }
385     } elsif ($self->{nc} == 0x002D) { # -
386     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
387 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
388 wakaba 1.1 !!!cp (3);
389     $self->{escape} = 1; # unless $self->{escape};
390     $self->{s_kwd} = '--';
391     #
392 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
393 wakaba 1.1 !!!cp (4);
394     $self->{s_kwd} = '--';
395     #
396 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
397     !!!cp (4.1);
398     $self->{s_kwd} .= '-';
399     #
400 wakaba 1.1 } else {
401     !!!cp (5);
402 wakaba 1.5 $self->{s_kwd} = '-';
403 wakaba 1.1 #
404     }
405     }
406    
407     #
408     } elsif ($self->{nc} == 0x0021) { # !
409     if (length $self->{s_kwd}) {
410     !!!cp (5.1);
411     $self->{s_kwd} .= '!';
412     #
413     } else {
414     !!!cp (5.2);
415     #$self->{s_kwd} = '';
416     #
417     }
418     #
419     } elsif ($self->{nc} == 0x003C) { # <
420     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
421     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
422     not $self->{escape})) {
423     !!!cp (6);
424     $self->{state} = TAG_OPEN_STATE;
425     !!!next-input-character;
426     redo A;
427     } else {
428     !!!cp (7);
429     $self->{s_kwd} = '';
430     #
431     }
432     } elsif ($self->{nc} == 0x003E) { # >
433     if ($self->{escape} and
434     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
435     if ($self->{s_kwd} eq '--') {
436     !!!cp (8);
437     delete $self->{escape};
438 wakaba 1.5 #
439 wakaba 1.1 } else {
440     !!!cp (9);
441 wakaba 1.5 #
442 wakaba 1.1 }
443 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
444     !!!cp (9.1);
445     !!!parse-error (type => 'unmatched mse', ## TODO: type
446     line => $self->{line_prev},
447     column => $self->{column_prev} - 1);
448     #
449 wakaba 1.1 } else {
450     !!!cp (10);
451 wakaba 1.5 #
452 wakaba 1.1 }
453    
454     $self->{s_kwd} = '';
455     #
456 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
457     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
458     !!!cp (10.1);
459     $self->{s_kwd} .= ']';
460     } elsif ($self->{s_kwd} eq ']]') {
461     !!!cp (10.2);
462     #
463     } else {
464     !!!cp (10.3);
465     $self->{s_kwd} = '';
466     }
467     #
468 wakaba 1.1 } elsif ($self->{nc} == -1) {
469     !!!cp (11);
470     $self->{s_kwd} = '';
471     !!!emit ({type => END_OF_FILE_TOKEN,
472     line => $self->{line}, column => $self->{column}});
473     last A; ## TODO: ok?
474     } else {
475     !!!cp (12);
476     $self->{s_kwd} = '';
477     #
478     }
479    
480     # Anything else
481     my $token = {type => CHARACTER_TOKEN,
482     data => chr $self->{nc},
483     line => $self->{line}, column => $self->{column},
484     };
485 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
486 wakaba 1.1 length $token->{data})) {
487     $self->{s_kwd} = '';
488     }
489    
490     ## Stay in the data state.
491 wakaba 1.5 if (not $self->{is_xml} and
492     $self->{content_model} == PCDATA_CONTENT_MODEL) {
493 wakaba 1.1 !!!cp (13);
494     $self->{state} = PCDATA_STATE;
495     } else {
496     !!!cp (14);
497     ## Stay in the state.
498     }
499     !!!next-input-character;
500     !!!emit ($token);
501     redo A;
502     } elsif ($self->{state} == TAG_OPEN_STATE) {
503 wakaba 1.10 ## XML5: "tag state".
504    
505 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
506     if ($self->{nc} == 0x002F) { # /
507     !!!cp (15);
508     !!!next-input-character;
509     $self->{state} = CLOSE_TAG_OPEN_STATE;
510     redo A;
511     } elsif ($self->{nc} == 0x0021) { # !
512     !!!cp (15.1);
513 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
514 wakaba 1.1 #
515     } else {
516     !!!cp (16);
517 wakaba 1.12 $self->{s_kwd} = '';
518 wakaba 1.1 #
519     }
520    
521     ## reconsume
522     $self->{state} = DATA_STATE;
523     !!!emit ({type => CHARACTER_TOKEN, data => '<',
524     line => $self->{line_prev},
525     column => $self->{column_prev},
526     });
527     redo A;
528     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
529     if ($self->{nc} == 0x0021) { # !
530     !!!cp (17);
531     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
532     !!!next-input-character;
533     redo A;
534     } elsif ($self->{nc} == 0x002F) { # /
535     !!!cp (18);
536     $self->{state} = CLOSE_TAG_OPEN_STATE;
537     !!!next-input-character;
538     redo A;
539     } elsif (0x0041 <= $self->{nc} and
540     $self->{nc} <= 0x005A) { # A..Z
541     !!!cp (19);
542     $self->{ct}
543     = {type => START_TAG_TOKEN,
544 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
545 wakaba 1.1 line => $self->{line_prev},
546     column => $self->{column_prev}};
547     $self->{state} = TAG_NAME_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif (0x0061 <= $self->{nc} and
551     $self->{nc} <= 0x007A) { # a..z
552     !!!cp (20);
553     $self->{ct} = {type => START_TAG_TOKEN,
554     tag_name => chr ($self->{nc}),
555     line => $self->{line_prev},
556     column => $self->{column_prev}};
557     $self->{state} = TAG_NAME_STATE;
558     !!!next-input-character;
559     redo A;
560     } elsif ($self->{nc} == 0x003E) { # >
561     !!!cp (21);
562     !!!parse-error (type => 'empty start tag',
563     line => $self->{line_prev},
564     column => $self->{column_prev});
565     $self->{state} = DATA_STATE;
566 wakaba 1.5 $self->{s_kwd} = '';
567 wakaba 1.1 !!!next-input-character;
568    
569     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
570     line => $self->{line_prev},
571     column => $self->{column_prev},
572     });
573    
574     redo A;
575     } elsif ($self->{nc} == 0x003F) { # ?
576 wakaba 1.8 if ($self->{is_xml}) {
577     !!!cp (22.1);
578     $self->{state} = PI_STATE;
579     !!!next-input-character;
580     redo A;
581     } else {
582     !!!cp (22);
583     !!!parse-error (type => 'pio',
584     line => $self->{line_prev},
585     column => $self->{column_prev});
586     $self->{state} = BOGUS_COMMENT_STATE;
587     $self->{ct} = {type => COMMENT_TOKEN, data => '',
588     line => $self->{line_prev},
589     column => $self->{column_prev},
590     };
591     ## $self->{nc} is intentionally left as is
592     redo A;
593     }
594 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
595 wakaba 1.1 !!!cp (23);
596     !!!parse-error (type => 'bare stago',
597     line => $self->{line_prev},
598     column => $self->{column_prev});
599     $self->{state} = DATA_STATE;
600 wakaba 1.5 $self->{s_kwd} = '';
601 wakaba 1.1 ## reconsume
602    
603     !!!emit ({type => CHARACTER_TOKEN, data => '<',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     });
607    
608     redo A;
609 wakaba 1.9 } else {
610     ## XML5: "<:" is a parse error.
611     !!!cp (23.1);
612     $self->{ct} = {type => START_TAG_TOKEN,
613     tag_name => chr ($self->{nc}),
614     line => $self->{line_prev},
615     column => $self->{column_prev}};
616     $self->{state} = TAG_NAME_STATE;
617     !!!next-input-character;
618     redo A;
619 wakaba 1.1 }
620     } else {
621     die "$0: $self->{content_model} in tag open";
622     }
623     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
624     ## NOTE: The "close tag open state" in the spec is implemented as
625     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
626    
627 wakaba 1.10 ## XML5: "end tag state".
628    
629 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
630     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
631     if (defined $self->{last_stag_name}) {
632     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
633 wakaba 1.12 $self->{kwd} = '';
634 wakaba 1.1 ## Reconsume.
635     redo A;
636     } else {
637     ## No start tag token has ever been emitted
638     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
639     !!!cp (28);
640     $self->{state} = DATA_STATE;
641 wakaba 1.5 $self->{s_kwd} = '';
642 wakaba 1.1 ## Reconsume.
643     !!!emit ({type => CHARACTER_TOKEN, data => '</',
644     line => $l, column => $c,
645     });
646     redo A;
647     }
648     }
649    
650     if (0x0041 <= $self->{nc} and
651     $self->{nc} <= 0x005A) { # A..Z
652     !!!cp (29);
653     $self->{ct}
654     = {type => END_TAG_TOKEN,
655 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
656 wakaba 1.1 line => $l, column => $c};
657     $self->{state} = TAG_NAME_STATE;
658     !!!next-input-character;
659     redo A;
660     } elsif (0x0061 <= $self->{nc} and
661     $self->{nc} <= 0x007A) { # a..z
662     !!!cp (30);
663     $self->{ct} = {type => END_TAG_TOKEN,
664     tag_name => chr ($self->{nc}),
665     line => $l, column => $c};
666     $self->{state} = TAG_NAME_STATE;
667     !!!next-input-character;
668     redo A;
669     } elsif ($self->{nc} == 0x003E) { # >
670     !!!parse-error (type => 'empty end tag',
671     line => $self->{line_prev}, ## "<" in "</>"
672     column => $self->{column_prev} - 1);
673     $self->{state} = DATA_STATE;
674 wakaba 1.5 $self->{s_kwd} = '';
675 wakaba 1.10 if ($self->{is_xml}) {
676     !!!cp (31);
677     ## XML5: No parse error.
678    
679     ## NOTE: This parser raises a parse error, since it supports
680     ## XML1, not XML5.
681    
682     ## NOTE: A short end tag token.
683     my $ct = {type => END_TAG_TOKEN,
684     tag_name => '',
685     line => $self->{line_prev},
686     column => $self->{column_prev} - 1,
687     };
688     !!!next-input-character;
689     !!!emit ($ct);
690     } else {
691     !!!cp (31.1);
692     !!!next-input-character;
693     }
694 wakaba 1.1 redo A;
695     } elsif ($self->{nc} == -1) {
696     !!!cp (32);
697     !!!parse-error (type => 'bare etago');
698 wakaba 1.5 $self->{s_kwd} = '';
699 wakaba 1.1 $self->{state} = DATA_STATE;
700     # reconsume
701    
702     !!!emit ({type => CHARACTER_TOKEN, data => '</',
703     line => $l, column => $c,
704     });
705    
706     redo A;
707 wakaba 1.10 } elsif (not $self->{is_xml} or
708     $is_space->{$self->{nc}}) {
709 wakaba 1.1 !!!cp (33);
710 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
711     line => $self->{line_prev}, # "<" of "</"
712     column => $self->{column_prev} - 1);
713 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
714     $self->{ct} = {type => COMMENT_TOKEN, data => '',
715     line => $self->{line_prev}, # "<" of "</"
716     column => $self->{column_prev} - 1,
717     };
718     ## NOTE: $self->{nc} is intentionally left as is.
719     ## Although the "anything else" case of the spec not explicitly
720     ## states that the next input character is to be reconsumed,
721     ## it will be included to the |data| of the comment token
722     ## generated from the bogus end tag, as defined in the
723     ## "bogus comment state" entry.
724     redo A;
725 wakaba 1.10 } else {
726     ## XML5: "</:" is a parse error.
727     !!!cp (30.1);
728     $self->{ct} = {type => END_TAG_TOKEN,
729     tag_name => chr ($self->{nc}),
730     line => $l, column => $c};
731     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
732     !!!next-input-character;
733     redo A;
734 wakaba 1.1 }
735     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
736 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
737 wakaba 1.1 if (length $ch) {
738     my $CH = $ch;
739     $ch =~ tr/a-z/A-Z/;
740     my $nch = chr $self->{nc};
741     if ($nch eq $ch or $nch eq $CH) {
742     !!!cp (24);
743     ## Stay in the state.
744 wakaba 1.12 $self->{kwd} .= $nch;
745 wakaba 1.1 !!!next-input-character;
746     redo A;
747     } else {
748     !!!cp (25);
749     $self->{state} = DATA_STATE;
750 wakaba 1.5 $self->{s_kwd} = '';
751 wakaba 1.1 ## Reconsume.
752     !!!emit ({type => CHARACTER_TOKEN,
753 wakaba 1.12 data => '</' . $self->{kwd},
754 wakaba 1.1 line => $self->{line_prev},
755 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
756 wakaba 1.1 });
757     redo A;
758     }
759     } else { # after "<{tag-name}"
760     unless ($is_space->{$self->{nc}} or
761     {
762     0x003E => 1, # >
763     0x002F => 1, # /
764     -1 => 1, # EOF
765     }->{$self->{nc}}) {
766     !!!cp (26);
767     ## Reconsume.
768     $self->{state} = DATA_STATE;
769 wakaba 1.5 $self->{s_kwd} = '';
770 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
771 wakaba 1.12 data => '</' . $self->{kwd},
772 wakaba 1.1 line => $self->{line_prev},
773 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
774 wakaba 1.1 });
775     redo A;
776     } else {
777     !!!cp (27);
778     $self->{ct}
779     = {type => END_TAG_TOKEN,
780     tag_name => $self->{last_stag_name},
781     line => $self->{line_prev},
782 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
783 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
784     ## Reconsume.
785     redo A;
786     }
787     }
788     } elsif ($self->{state} == TAG_NAME_STATE) {
789     if ($is_space->{$self->{nc}}) {
790     !!!cp (34);
791     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
792     !!!next-input-character;
793     redo A;
794     } elsif ($self->{nc} == 0x003E) { # >
795     if ($self->{ct}->{type} == START_TAG_TOKEN) {
796     !!!cp (35);
797     $self->{last_stag_name} = $self->{ct}->{tag_name};
798     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
799     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
800     #if ($self->{ct}->{attributes}) {
801     # ## NOTE: This should never be reached.
802     # !!! cp (36);
803     # !!! parse-error (type => 'end tag attribute');
804     #} else {
805     !!!cp (37);
806     #}
807     } else {
808     die "$0: $self->{ct}->{type}: Unknown token type";
809     }
810     $self->{state} = DATA_STATE;
811 wakaba 1.5 $self->{s_kwd} = '';
812 wakaba 1.1 !!!next-input-character;
813    
814     !!!emit ($self->{ct}); # start tag or end tag
815    
816     redo A;
817     } elsif (0x0041 <= $self->{nc} and
818     $self->{nc} <= 0x005A) { # A..Z
819     !!!cp (38);
820 wakaba 1.4 $self->{ct}->{tag_name}
821     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
822 wakaba 1.1 # start tag or end tag
823     ## Stay in this state
824     !!!next-input-character;
825     redo A;
826     } elsif ($self->{nc} == -1) {
827     !!!parse-error (type => 'unclosed tag');
828     if ($self->{ct}->{type} == START_TAG_TOKEN) {
829     !!!cp (39);
830     $self->{last_stag_name} = $self->{ct}->{tag_name};
831     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
832     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
833     #if ($self->{ct}->{attributes}) {
834     # ## NOTE: This state should never be reached.
835     # !!! cp (40);
836     # !!! parse-error (type => 'end tag attribute');
837     #} else {
838     !!!cp (41);
839     #}
840     } else {
841     die "$0: $self->{ct}->{type}: Unknown token type";
842     }
843     $self->{state} = DATA_STATE;
844 wakaba 1.5 $self->{s_kwd} = '';
845 wakaba 1.1 # reconsume
846    
847     !!!emit ($self->{ct}); # start tag or end tag
848    
849     redo A;
850     } elsif ($self->{nc} == 0x002F) { # /
851     !!!cp (42);
852     $self->{state} = SELF_CLOSING_START_TAG_STATE;
853     !!!next-input-character;
854     redo A;
855     } else {
856     !!!cp (44);
857     $self->{ct}->{tag_name} .= chr $self->{nc};
858     # start tag or end tag
859     ## Stay in the state
860     !!!next-input-character;
861     redo A;
862     }
863     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
864 wakaba 1.11 ## XML5: "Tag attribute name before state".
865    
866 wakaba 1.1 if ($is_space->{$self->{nc}}) {
867     !!!cp (45);
868     ## Stay in the state
869     !!!next-input-character;
870     redo A;
871     } elsif ($self->{nc} == 0x003E) { # >
872     if ($self->{ct}->{type} == START_TAG_TOKEN) {
873     !!!cp (46);
874     $self->{last_stag_name} = $self->{ct}->{tag_name};
875     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
876     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
877     if ($self->{ct}->{attributes}) {
878     !!!cp (47);
879     !!!parse-error (type => 'end tag attribute');
880     } else {
881     !!!cp (48);
882     }
883     } else {
884     die "$0: $self->{ct}->{type}: Unknown token type";
885     }
886     $self->{state} = DATA_STATE;
887 wakaba 1.5 $self->{s_kwd} = '';
888 wakaba 1.1 !!!next-input-character;
889    
890     !!!emit ($self->{ct}); # start tag or end tag
891    
892     redo A;
893     } elsif (0x0041 <= $self->{nc} and
894     $self->{nc} <= 0x005A) { # A..Z
895     !!!cp (49);
896     $self->{ca}
897 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
898 wakaba 1.1 value => '',
899     line => $self->{line}, column => $self->{column}};
900     $self->{state} = ATTRIBUTE_NAME_STATE;
901     !!!next-input-character;
902     redo A;
903     } elsif ($self->{nc} == 0x002F) { # /
904     !!!cp (50);
905     $self->{state} = SELF_CLOSING_START_TAG_STATE;
906     !!!next-input-character;
907     redo A;
908     } elsif ($self->{nc} == -1) {
909     !!!parse-error (type => 'unclosed tag');
910     if ($self->{ct}->{type} == START_TAG_TOKEN) {
911     !!!cp (52);
912     $self->{last_stag_name} = $self->{ct}->{tag_name};
913     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
914     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
915     if ($self->{ct}->{attributes}) {
916     !!!cp (53);
917     !!!parse-error (type => 'end tag attribute');
918     } else {
919     !!!cp (54);
920     }
921     } else {
922     die "$0: $self->{ct}->{type}: Unknown token type";
923     }
924     $self->{state} = DATA_STATE;
925 wakaba 1.5 $self->{s_kwd} = '';
926 wakaba 1.1 # reconsume
927    
928     !!!emit ($self->{ct}); # start tag or end tag
929    
930     redo A;
931     } else {
932     if ({
933     0x0022 => 1, # "
934     0x0027 => 1, # '
935     0x003D => 1, # =
936     }->{$self->{nc}}) {
937     !!!cp (55);
938 wakaba 1.11 ## XML5: Not a parse error.
939 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
940     } else {
941     !!!cp (56);
942 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
943 wakaba 1.1 }
944     $self->{ca}
945     = {name => chr ($self->{nc}),
946     value => '',
947     line => $self->{line}, column => $self->{column}};
948     $self->{state} = ATTRIBUTE_NAME_STATE;
949     !!!next-input-character;
950     redo A;
951     }
952     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
953 wakaba 1.11 ## XML5: "Tag attribute name state".
954    
955 wakaba 1.1 my $before_leave = sub {
956     if (exists $self->{ct}->{attributes} # start tag or end tag
957     ->{$self->{ca}->{name}}) { # MUST
958     !!!cp (57);
959     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
960     ## Discard $self->{ca} # MUST
961     } else {
962     !!!cp (58);
963     $self->{ct}->{attributes}->{$self->{ca}->{name}}
964     = $self->{ca};
965 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
966 wakaba 1.1 }
967     }; # $before_leave
968    
969     if ($is_space->{$self->{nc}}) {
970     !!!cp (59);
971     $before_leave->();
972     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
973     !!!next-input-character;
974     redo A;
975     } elsif ($self->{nc} == 0x003D) { # =
976     !!!cp (60);
977     $before_leave->();
978     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
979     !!!next-input-character;
980     redo A;
981     } elsif ($self->{nc} == 0x003E) { # >
982 wakaba 1.11 if ($self->{is_xml}) {
983     !!!cp (60.1);
984     ## XML5: Not a parse error.
985     !!!parse-error (type => 'no attr value'); ## TODO: type
986     } else {
987     !!!cp (60.2);
988     }
989    
990 wakaba 1.1 $before_leave->();
991     if ($self->{ct}->{type} == START_TAG_TOKEN) {
992     !!!cp (61);
993     $self->{last_stag_name} = $self->{ct}->{tag_name};
994     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
995     !!!cp (62);
996     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
997     if ($self->{ct}->{attributes}) {
998     !!!parse-error (type => 'end tag attribute');
999     }
1000     } else {
1001     die "$0: $self->{ct}->{type}: Unknown token type";
1002     }
1003     $self->{state} = DATA_STATE;
1004 wakaba 1.5 $self->{s_kwd} = '';
1005 wakaba 1.1 !!!next-input-character;
1006    
1007     !!!emit ($self->{ct}); # start tag or end tag
1008    
1009     redo A;
1010     } elsif (0x0041 <= $self->{nc} and
1011     $self->{nc} <= 0x005A) { # A..Z
1012     !!!cp (63);
1013 wakaba 1.4 $self->{ca}->{name}
1014     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1015 wakaba 1.1 ## Stay in the state
1016     !!!next-input-character;
1017     redo A;
1018     } elsif ($self->{nc} == 0x002F) { # /
1019 wakaba 1.11 if ($self->{is_xml}) {
1020     !!!cp (64);
1021     ## XML5: Not a parse error.
1022     !!!parse-error (type => 'no attr value'); ## TODO: type
1023     } else {
1024     !!!cp (64.1);
1025     }
1026    
1027 wakaba 1.1 $before_leave->();
1028     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1029     !!!next-input-character;
1030     redo A;
1031     } elsif ($self->{nc} == -1) {
1032     !!!parse-error (type => 'unclosed tag');
1033     $before_leave->();
1034     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1035     !!!cp (66);
1036     $self->{last_stag_name} = $self->{ct}->{tag_name};
1037     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1038     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1039     if ($self->{ct}->{attributes}) {
1040     !!!cp (67);
1041     !!!parse-error (type => 'end tag attribute');
1042     } else {
1043     ## NOTE: This state should never be reached.
1044     !!!cp (68);
1045     }
1046     } else {
1047     die "$0: $self->{ct}->{type}: Unknown token type";
1048     }
1049     $self->{state} = DATA_STATE;
1050 wakaba 1.5 $self->{s_kwd} = '';
1051 wakaba 1.1 # reconsume
1052    
1053     !!!emit ($self->{ct}); # start tag or end tag
1054    
1055     redo A;
1056     } else {
1057     if ($self->{nc} == 0x0022 or # "
1058     $self->{nc} == 0x0027) { # '
1059     !!!cp (69);
1060 wakaba 1.11 ## XML5: Not a parse error.
1061 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1062     } else {
1063     !!!cp (70);
1064     }
1065     $self->{ca}->{name} .= chr ($self->{nc});
1066     ## Stay in the state
1067     !!!next-input-character;
1068     redo A;
1069     }
1070     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1071 wakaba 1.11 ## XML5: "Tag attribute name after state".
1072    
1073 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1074     !!!cp (71);
1075     ## Stay in the state
1076     !!!next-input-character;
1077     redo A;
1078     } elsif ($self->{nc} == 0x003D) { # =
1079     !!!cp (72);
1080     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1081     !!!next-input-character;
1082     redo A;
1083     } elsif ($self->{nc} == 0x003E) { # >
1084 wakaba 1.11 if ($self->{is_xml}) {
1085     !!!cp (72.1);
1086     ## XML5: Not a parse error.
1087     !!!parse-error (type => 'no attr value'); ## TODO: type
1088     } else {
1089     !!!cp (72.2);
1090     }
1091    
1092 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1093     !!!cp (73);
1094     $self->{last_stag_name} = $self->{ct}->{tag_name};
1095     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1096     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1097     if ($self->{ct}->{attributes}) {
1098     !!!cp (74);
1099     !!!parse-error (type => 'end tag attribute');
1100     } else {
1101     ## NOTE: This state should never be reached.
1102     !!!cp (75);
1103     }
1104     } else {
1105     die "$0: $self->{ct}->{type}: Unknown token type";
1106     }
1107     $self->{state} = DATA_STATE;
1108 wakaba 1.5 $self->{s_kwd} = '';
1109 wakaba 1.1 !!!next-input-character;
1110    
1111     !!!emit ($self->{ct}); # start tag or end tag
1112    
1113     redo A;
1114     } elsif (0x0041 <= $self->{nc} and
1115     $self->{nc} <= 0x005A) { # A..Z
1116     !!!cp (76);
1117     $self->{ca}
1118 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1119 wakaba 1.1 value => '',
1120     line => $self->{line}, column => $self->{column}};
1121     $self->{state} = ATTRIBUTE_NAME_STATE;
1122     !!!next-input-character;
1123     redo A;
1124     } elsif ($self->{nc} == 0x002F) { # /
1125 wakaba 1.11 if ($self->{is_xml}) {
1126     !!!cp (77);
1127     ## XML5: Not a parse error.
1128     !!!parse-error (type => 'no attr value'); ## TODO: type
1129     } else {
1130     !!!cp (77.1);
1131     }
1132    
1133 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1134     !!!next-input-character;
1135     redo A;
1136     } elsif ($self->{nc} == -1) {
1137     !!!parse-error (type => 'unclosed tag');
1138     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1139     !!!cp (79);
1140     $self->{last_stag_name} = $self->{ct}->{tag_name};
1141     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1142     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1143     if ($self->{ct}->{attributes}) {
1144     !!!cp (80);
1145     !!!parse-error (type => 'end tag attribute');
1146     } else {
1147     ## NOTE: This state should never be reached.
1148     !!!cp (81);
1149     }
1150     } else {
1151     die "$0: $self->{ct}->{type}: Unknown token type";
1152     }
1153 wakaba 1.5 $self->{s_kwd} = '';
1154 wakaba 1.1 $self->{state} = DATA_STATE;
1155     # reconsume
1156    
1157     !!!emit ($self->{ct}); # start tag or end tag
1158    
1159     redo A;
1160     } else {
1161 wakaba 1.11 if ($self->{is_xml}) {
1162     !!!cp (78.1);
1163     ## XML5: Not a parse error.
1164     !!!parse-error (type => 'no attr value'); ## TODO: type
1165     } else {
1166     !!!cp (78.2);
1167     }
1168    
1169 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1170     $self->{nc} == 0x0027) { # '
1171     !!!cp (78);
1172 wakaba 1.11 ## XML5: Not a parse error.
1173 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1174     } else {
1175     !!!cp (82);
1176     }
1177     $self->{ca}
1178     = {name => chr ($self->{nc}),
1179     value => '',
1180     line => $self->{line}, column => $self->{column}};
1181     $self->{state} = ATTRIBUTE_NAME_STATE;
1182     !!!next-input-character;
1183     redo A;
1184     }
1185     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1186 wakaba 1.11 ## XML5: "Tag attribute value before state".
1187    
1188 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1189     !!!cp (83);
1190     ## Stay in the state
1191     !!!next-input-character;
1192     redo A;
1193     } elsif ($self->{nc} == 0x0022) { # "
1194     !!!cp (84);
1195     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1196     !!!next-input-character;
1197     redo A;
1198     } elsif ($self->{nc} == 0x0026) { # &
1199     !!!cp (85);
1200     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1201     ## reconsume
1202     redo A;
1203     } elsif ($self->{nc} == 0x0027) { # '
1204     !!!cp (86);
1205     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1206     !!!next-input-character;
1207     redo A;
1208     } elsif ($self->{nc} == 0x003E) { # >
1209     !!!parse-error (type => 'empty unquoted attribute value');
1210     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211     !!!cp (87);
1212     $self->{last_stag_name} = $self->{ct}->{tag_name};
1213     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215     if ($self->{ct}->{attributes}) {
1216     !!!cp (88);
1217     !!!parse-error (type => 'end tag attribute');
1218     } else {
1219     ## NOTE: This state should never be reached.
1220     !!!cp (89);
1221     }
1222     } else {
1223     die "$0: $self->{ct}->{type}: Unknown token type";
1224     }
1225     $self->{state} = DATA_STATE;
1226 wakaba 1.5 $self->{s_kwd} = '';
1227 wakaba 1.1 !!!next-input-character;
1228    
1229     !!!emit ($self->{ct}); # start tag or end tag
1230    
1231     redo A;
1232     } elsif ($self->{nc} == -1) {
1233     !!!parse-error (type => 'unclosed tag');
1234     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1235     !!!cp (90);
1236     $self->{last_stag_name} = $self->{ct}->{tag_name};
1237     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1238     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1239     if ($self->{ct}->{attributes}) {
1240     !!!cp (91);
1241     !!!parse-error (type => 'end tag attribute');
1242     } else {
1243     ## NOTE: This state should never be reached.
1244     !!!cp (92);
1245     }
1246     } else {
1247     die "$0: $self->{ct}->{type}: Unknown token type";
1248     }
1249     $self->{state} = DATA_STATE;
1250 wakaba 1.5 $self->{s_kwd} = '';
1251 wakaba 1.1 ## reconsume
1252    
1253     !!!emit ($self->{ct}); # start tag or end tag
1254    
1255     redo A;
1256     } else {
1257     if ($self->{nc} == 0x003D) { # =
1258     !!!cp (93);
1259 wakaba 1.11 ## XML5: Not a parse error.
1260 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1261 wakaba 1.11 } elsif ($self->{is_xml}) {
1262     !!!cp (93.1);
1263     ## XML5: No parse error.
1264     !!!parse-error (type => 'unquoted attr value'); ## TODO
1265 wakaba 1.1 } else {
1266     !!!cp (94);
1267     }
1268     $self->{ca}->{value} .= chr ($self->{nc});
1269     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1270     !!!next-input-character;
1271     redo A;
1272     }
1273     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1274 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1275     ## ATTLIST attribute value double quoted state".
1276 wakaba 1.11
1277 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1278 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1279     !!!cp (95.1);
1280     ## XML5: "DOCTYPE ATTLIST name after state".
1281     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1282     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1283     } else {
1284     !!!cp (95);
1285     ## XML5: "Tag attribute name before state".
1286     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1287     }
1288 wakaba 1.1 !!!next-input-character;
1289     redo A;
1290     } elsif ($self->{nc} == 0x0026) { # &
1291     !!!cp (96);
1292 wakaba 1.11 ## XML5: Not defined yet.
1293    
1294 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1295     ## "entity in attribute value state". In this implementation, the
1296     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1297     ## implementation of the "consume a character reference" algorithm.
1298     $self->{prev_state} = $self->{state};
1299     $self->{entity_add} = 0x0022; # "
1300     $self->{state} = ENTITY_STATE;
1301     !!!next-input-character;
1302     redo A;
1303     } elsif ($self->{nc} == -1) {
1304     !!!parse-error (type => 'unclosed attribute value');
1305     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1306     !!!cp (97);
1307     $self->{last_stag_name} = $self->{ct}->{tag_name};
1308 wakaba 1.15
1309     $self->{state} = DATA_STATE;
1310     $self->{s_kwd} = '';
1311     ## reconsume
1312     !!!emit ($self->{ct}); # start tag
1313     redo A;
1314 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1315     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1316     if ($self->{ct}->{attributes}) {
1317     !!!cp (98);
1318     !!!parse-error (type => 'end tag attribute');
1319     } else {
1320     ## NOTE: This state should never be reached.
1321     !!!cp (99);
1322     }
1323 wakaba 1.15
1324     $self->{state} = DATA_STATE;
1325     $self->{s_kwd} = '';
1326     ## reconsume
1327     !!!emit ($self->{ct}); # end tag
1328     redo A;
1329     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1330     ## XML5: No parse error above; not defined yet.
1331     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1332     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1333     ## Reconsume.
1334     !!!emit ($self->{ct}); # ATTLIST
1335     redo A;
1336 wakaba 1.1 } else {
1337     die "$0: $self->{ct}->{type}: Unknown token type";
1338     }
1339     } else {
1340 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1341 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1342     !!!cp (100);
1343     ## XML5: Not a parse error.
1344     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1345     } else {
1346     !!!cp (100.1);
1347     }
1348 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1349     $self->{read_until}->($self->{ca}->{value},
1350 wakaba 1.11 q["&<],
1351 wakaba 1.1 length $self->{ca}->{value});
1352    
1353     ## Stay in the state
1354     !!!next-input-character;
1355     redo A;
1356     }
1357     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1358 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1359     ## ATTLIST attribute value single quoted state".
1360 wakaba 1.11
1361 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1362 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1363     !!!cp (101.1);
1364     ## XML5: "DOCTYPE ATTLIST name after state".
1365     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1366     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1367     } else {
1368     !!!cp (101);
1369     ## XML5: "Before attribute name state" (sic).
1370     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1371     }
1372 wakaba 1.1 !!!next-input-character;
1373     redo A;
1374     } elsif ($self->{nc} == 0x0026) { # &
1375     !!!cp (102);
1376 wakaba 1.11 ## XML5: Not defined yet.
1377    
1378 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1379     ## "entity in attribute value state". In this implementation, the
1380     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1381     ## implementation of the "consume a character reference" algorithm.
1382     $self->{entity_add} = 0x0027; # '
1383     $self->{prev_state} = $self->{state};
1384     $self->{state} = ENTITY_STATE;
1385     !!!next-input-character;
1386     redo A;
1387     } elsif ($self->{nc} == -1) {
1388     !!!parse-error (type => 'unclosed attribute value');
1389     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1390     !!!cp (103);
1391     $self->{last_stag_name} = $self->{ct}->{tag_name};
1392 wakaba 1.15
1393     $self->{state} = DATA_STATE;
1394     $self->{s_kwd} = '';
1395     ## reconsume
1396     !!!emit ($self->{ct}); # start tag
1397     redo A;
1398 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1399     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1400     if ($self->{ct}->{attributes}) {
1401     !!!cp (104);
1402     !!!parse-error (type => 'end tag attribute');
1403     } else {
1404     ## NOTE: This state should never be reached.
1405     !!!cp (105);
1406     }
1407 wakaba 1.15
1408     $self->{state} = DATA_STATE;
1409     $self->{s_kwd} = '';
1410     ## reconsume
1411     !!!emit ($self->{ct}); # end tag
1412     redo A;
1413     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1414     ## XML5: No parse error above; not defined yet.
1415     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1416     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1417     ## Reconsume.
1418     !!!emit ($self->{ct}); # ATTLIST
1419     redo A;
1420 wakaba 1.1 } else {
1421     die "$0: $self->{ct}->{type}: Unknown token type";
1422     }
1423     } else {
1424 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1425 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1426     !!!cp (106);
1427     ## XML5: Not a parse error.
1428     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1429     } else {
1430     !!!cp (106.1);
1431     }
1432 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1433     $self->{read_until}->($self->{ca}->{value},
1434 wakaba 1.11 q['&<],
1435 wakaba 1.1 length $self->{ca}->{value});
1436    
1437     ## Stay in the state
1438     !!!next-input-character;
1439     redo A;
1440     }
1441     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1442 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1443    
1444 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1445 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1446     !!!cp (107.1);
1447     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1448     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1449     } else {
1450     !!!cp (107);
1451     ## XML5: "Tag attribute name before state".
1452     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1453     }
1454 wakaba 1.1 !!!next-input-character;
1455     redo A;
1456     } elsif ($self->{nc} == 0x0026) { # &
1457     !!!cp (108);
1458 wakaba 1.11
1459     ## XML5: Not defined yet.
1460    
1461 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1462     ## "entity in attribute value state". In this implementation, the
1463     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1464     ## implementation of the "consume a character reference" algorithm.
1465     $self->{entity_add} = -1;
1466     $self->{prev_state} = $self->{state};
1467     $self->{state} = ENTITY_STATE;
1468     !!!next-input-character;
1469     redo A;
1470     } elsif ($self->{nc} == 0x003E) { # >
1471     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1472     !!!cp (109);
1473     $self->{last_stag_name} = $self->{ct}->{tag_name};
1474 wakaba 1.15
1475     $self->{state} = DATA_STATE;
1476     $self->{s_kwd} = '';
1477     !!!next-input-character;
1478     !!!emit ($self->{ct}); # start tag
1479     redo A;
1480 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1481     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1482     if ($self->{ct}->{attributes}) {
1483     !!!cp (110);
1484     !!!parse-error (type => 'end tag attribute');
1485     } else {
1486     ## NOTE: This state should never be reached.
1487     !!!cp (111);
1488     }
1489 wakaba 1.15
1490     $self->{state} = DATA_STATE;
1491     $self->{s_kwd} = '';
1492     !!!next-input-character;
1493     !!!emit ($self->{ct}); # end tag
1494     redo A;
1495     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1496     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1497     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1498     !!!next-input-character;
1499     !!!emit ($self->{ct}); # ATTLIST
1500     redo A;
1501 wakaba 1.1 } else {
1502     die "$0: $self->{ct}->{type}: Unknown token type";
1503     }
1504     } elsif ($self->{nc} == -1) {
1505     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1506     !!!cp (112);
1507 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1508 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1509 wakaba 1.15
1510     $self->{state} = DATA_STATE;
1511     $self->{s_kwd} = '';
1512     ## reconsume
1513     !!!emit ($self->{ct}); # start tag
1514     redo A;
1515 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1516 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1517 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1518     if ($self->{ct}->{attributes}) {
1519     !!!cp (113);
1520     !!!parse-error (type => 'end tag attribute');
1521     } else {
1522     ## NOTE: This state should never be reached.
1523     !!!cp (114);
1524     }
1525 wakaba 1.15
1526     $self->{state} = DATA_STATE;
1527     $self->{s_kwd} = '';
1528     ## reconsume
1529     !!!emit ($self->{ct}); # end tag
1530     redo A;
1531     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1532     !!!parse-error (type => 'unclosed md'); ## TODO: type
1533     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1534     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1535     ## Reconsume.
1536     !!!emit ($self->{ct}); # ATTLIST
1537     redo A;
1538 wakaba 1.1 } else {
1539     die "$0: $self->{ct}->{type}: Unknown token type";
1540     }
1541     } else {
1542     if ({
1543     0x0022 => 1, # "
1544     0x0027 => 1, # '
1545     0x003D => 1, # =
1546     }->{$self->{nc}}) {
1547     !!!cp (115);
1548 wakaba 1.11 ## XML5: Not a parse error.
1549 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1550     } else {
1551     !!!cp (116);
1552     }
1553     $self->{ca}->{value} .= chr ($self->{nc});
1554     $self->{read_until}->($self->{ca}->{value},
1555     q["'=& >],
1556     length $self->{ca}->{value});
1557    
1558     ## Stay in the state
1559     !!!next-input-character;
1560     redo A;
1561     }
1562     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1563     if ($is_space->{$self->{nc}}) {
1564     !!!cp (118);
1565     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1566     !!!next-input-character;
1567     redo A;
1568     } elsif ($self->{nc} == 0x003E) { # >
1569     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1570     !!!cp (119);
1571     $self->{last_stag_name} = $self->{ct}->{tag_name};
1572     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1573     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1574     if ($self->{ct}->{attributes}) {
1575     !!!cp (120);
1576     !!!parse-error (type => 'end tag attribute');
1577     } else {
1578     ## NOTE: This state should never be reached.
1579     !!!cp (121);
1580     }
1581     } else {
1582     die "$0: $self->{ct}->{type}: Unknown token type";
1583     }
1584     $self->{state} = DATA_STATE;
1585 wakaba 1.5 $self->{s_kwd} = '';
1586 wakaba 1.1 !!!next-input-character;
1587    
1588     !!!emit ($self->{ct}); # start tag or end tag
1589    
1590     redo A;
1591     } elsif ($self->{nc} == 0x002F) { # /
1592     !!!cp (122);
1593     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1594     !!!next-input-character;
1595     redo A;
1596     } elsif ($self->{nc} == -1) {
1597     !!!parse-error (type => 'unclosed tag');
1598     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1599     !!!cp (122.3);
1600     $self->{last_stag_name} = $self->{ct}->{tag_name};
1601     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1602     if ($self->{ct}->{attributes}) {
1603     !!!cp (122.1);
1604     !!!parse-error (type => 'end tag attribute');
1605     } else {
1606     ## NOTE: This state should never be reached.
1607     !!!cp (122.2);
1608     }
1609     } else {
1610     die "$0: $self->{ct}->{type}: Unknown token type";
1611     }
1612     $self->{state} = DATA_STATE;
1613 wakaba 1.5 $self->{s_kwd} = '';
1614 wakaba 1.1 ## Reconsume.
1615     !!!emit ($self->{ct}); # start tag or end tag
1616     redo A;
1617     } else {
1618     !!!cp ('124.1');
1619     !!!parse-error (type => 'no space between attributes');
1620     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1621     ## reconsume
1622     redo A;
1623     }
1624     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1625 wakaba 1.11 ## XML5: "Empty tag state".
1626    
1627 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1628     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1629     !!!cp ('124.2');
1630     !!!parse-error (type => 'nestc', token => $self->{ct});
1631     ## TODO: Different type than slash in start tag
1632     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1633     if ($self->{ct}->{attributes}) {
1634     !!!cp ('124.4');
1635     !!!parse-error (type => 'end tag attribute');
1636     } else {
1637     !!!cp ('124.5');
1638     }
1639     ## TODO: Test |<title></title/>|
1640     } else {
1641     !!!cp ('124.3');
1642     $self->{self_closing} = 1;
1643     }
1644    
1645     $self->{state} = DATA_STATE;
1646 wakaba 1.5 $self->{s_kwd} = '';
1647 wakaba 1.1 !!!next-input-character;
1648    
1649     !!!emit ($self->{ct}); # start tag or end tag
1650    
1651     redo A;
1652     } elsif ($self->{nc} == -1) {
1653     !!!parse-error (type => 'unclosed tag');
1654     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1655     !!!cp (124.7);
1656     $self->{last_stag_name} = $self->{ct}->{tag_name};
1657     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1658     if ($self->{ct}->{attributes}) {
1659     !!!cp (124.5);
1660     !!!parse-error (type => 'end tag attribute');
1661     } else {
1662     ## NOTE: This state should never be reached.
1663     !!!cp (124.6);
1664     }
1665     } else {
1666     die "$0: $self->{ct}->{type}: Unknown token type";
1667     }
1668 wakaba 1.11 ## XML5: "Tag attribute name before state".
1669 wakaba 1.1 $self->{state} = DATA_STATE;
1670 wakaba 1.5 $self->{s_kwd} = '';
1671 wakaba 1.1 ## Reconsume.
1672     !!!emit ($self->{ct}); # start tag or end tag
1673     redo A;
1674     } else {
1675     !!!cp ('124.4');
1676     !!!parse-error (type => 'nestc');
1677     ## TODO: This error type is wrong.
1678     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1679     ## Reconsume.
1680     redo A;
1681     }
1682     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1683 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1684    
1685 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1686     ## consumes characters on a one-by-one basis.
1687    
1688     if ($self->{nc} == 0x003E) { # >
1689 wakaba 1.13 if ($self->{in_subset}) {
1690     !!!cp (123);
1691     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1692     } else {
1693     !!!cp (124);
1694     $self->{state} = DATA_STATE;
1695     $self->{s_kwd} = '';
1696     }
1697 wakaba 1.1 !!!next-input-character;
1698    
1699     !!!emit ($self->{ct}); # comment
1700     redo A;
1701     } elsif ($self->{nc} == -1) {
1702 wakaba 1.13 if ($self->{in_subset}) {
1703     !!!cp (125.1);
1704     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1705     } else {
1706     !!!cp (125);
1707     $self->{state} = DATA_STATE;
1708     $self->{s_kwd} = '';
1709     }
1710 wakaba 1.1 ## reconsume
1711    
1712     !!!emit ($self->{ct}); # comment
1713     redo A;
1714     } else {
1715     !!!cp (126);
1716     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1717     $self->{read_until}->($self->{ct}->{data},
1718     q[>],
1719     length $self->{ct}->{data});
1720    
1721     ## Stay in the state.
1722     !!!next-input-character;
1723     redo A;
1724     }
1725     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1726 wakaba 1.14 ## XML5: "Markup declaration state".
1727 wakaba 1.1
1728     if ($self->{nc} == 0x002D) { # -
1729     !!!cp (133);
1730     $self->{state} = MD_HYPHEN_STATE;
1731     !!!next-input-character;
1732     redo A;
1733     } elsif ($self->{nc} == 0x0044 or # D
1734     $self->{nc} == 0x0064) { # d
1735     ## ASCII case-insensitive.
1736     !!!cp (130);
1737     $self->{state} = MD_DOCTYPE_STATE;
1738 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1739 wakaba 1.1 !!!next-input-character;
1740     redo A;
1741 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1742     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1743     $self->{is_xml}) and
1744 wakaba 1.1 $self->{nc} == 0x005B) { # [
1745     !!!cp (135.4);
1746     $self->{state} = MD_CDATA_STATE;
1747 wakaba 1.12 $self->{kwd} = '[';
1748 wakaba 1.1 !!!next-input-character;
1749     redo A;
1750     } else {
1751     !!!cp (136);
1752     }
1753    
1754     !!!parse-error (type => 'bogus comment',
1755     line => $self->{line_prev},
1756     column => $self->{column_prev} - 1);
1757     ## Reconsume.
1758     $self->{state} = BOGUS_COMMENT_STATE;
1759     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1760     line => $self->{line_prev},
1761     column => $self->{column_prev} - 1,
1762     };
1763     redo A;
1764     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1765     if ($self->{nc} == 0x002D) { # -
1766     !!!cp (127);
1767     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1768     line => $self->{line_prev},
1769     column => $self->{column_prev} - 2,
1770     };
1771 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1772 wakaba 1.1 !!!next-input-character;
1773     redo A;
1774     } else {
1775     !!!cp (128);
1776     !!!parse-error (type => 'bogus comment',
1777     line => $self->{line_prev},
1778     column => $self->{column_prev} - 2);
1779     $self->{state} = BOGUS_COMMENT_STATE;
1780     ## Reconsume.
1781     $self->{ct} = {type => COMMENT_TOKEN,
1782     data => '-',
1783     line => $self->{line_prev},
1784     column => $self->{column_prev} - 2,
1785     };
1786     redo A;
1787     }
1788     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1789     ## ASCII case-insensitive.
1790     if ($self->{nc} == [
1791     undef,
1792     0x004F, # O
1793     0x0043, # C
1794     0x0054, # T
1795     0x0059, # Y
1796     0x0050, # P
1797 wakaba 1.12 ]->[length $self->{kwd}] or
1798 wakaba 1.1 $self->{nc} == [
1799     undef,
1800     0x006F, # o
1801     0x0063, # c
1802     0x0074, # t
1803     0x0079, # y
1804     0x0070, # p
1805 wakaba 1.12 ]->[length $self->{kwd}]) {
1806 wakaba 1.1 !!!cp (131);
1807     ## Stay in the state.
1808 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1809 wakaba 1.1 !!!next-input-character;
1810     redo A;
1811 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1812 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1813     $self->{nc} == 0x0065)) { # e
1814 wakaba 1.12 if ($self->{is_xml} and
1815     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1816 wakaba 1.10 !!!cp (129);
1817     ## XML5: case-sensitive.
1818     !!!parse-error (type => 'lowercase keyword', ## TODO
1819     text => 'DOCTYPE',
1820     line => $self->{line_prev},
1821     column => $self->{column_prev} - 5);
1822     } else {
1823     !!!cp (129.1);
1824     }
1825 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1826     $self->{ct} = {type => DOCTYPE_TOKEN,
1827     quirks => 1,
1828     line => $self->{line_prev},
1829     column => $self->{column_prev} - 7,
1830     };
1831     !!!next-input-character;
1832     redo A;
1833     } else {
1834     !!!cp (132);
1835     !!!parse-error (type => 'bogus comment',
1836     line => $self->{line_prev},
1837 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1838 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1839     ## Reconsume.
1840     $self->{ct} = {type => COMMENT_TOKEN,
1841 wakaba 1.12 data => $self->{kwd},
1842 wakaba 1.1 line => $self->{line_prev},
1843 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1844 wakaba 1.1 };
1845     redo A;
1846     }
1847     } elsif ($self->{state} == MD_CDATA_STATE) {
1848     if ($self->{nc} == {
1849     '[' => 0x0043, # C
1850     '[C' => 0x0044, # D
1851     '[CD' => 0x0041, # A
1852     '[CDA' => 0x0054, # T
1853     '[CDAT' => 0x0041, # A
1854 wakaba 1.12 }->{$self->{kwd}}) {
1855 wakaba 1.1 !!!cp (135.1);
1856     ## Stay in the state.
1857 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1858 wakaba 1.1 !!!next-input-character;
1859     redo A;
1860 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1861 wakaba 1.1 $self->{nc} == 0x005B) { # [
1862 wakaba 1.6 if ($self->{is_xml} and
1863     not $self->{tainted} and
1864     @{$self->{open_elements} or []} == 0) {
1865 wakaba 1.8 !!!cp (135.2);
1866 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1867     line => $self->{line_prev},
1868     column => $self->{column_prev} - 7);
1869     $self->{tainted} = 1;
1870 wakaba 1.8 } else {
1871     !!!cp (135.21);
1872 wakaba 1.6 }
1873    
1874 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1875     data => '',
1876     line => $self->{line_prev},
1877     column => $self->{column_prev} - 7};
1878     $self->{state} = CDATA_SECTION_STATE;
1879     !!!next-input-character;
1880     redo A;
1881     } else {
1882     !!!cp (135.3);
1883     !!!parse-error (type => 'bogus comment',
1884     line => $self->{line_prev},
1885 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1886 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1887     ## Reconsume.
1888     $self->{ct} = {type => COMMENT_TOKEN,
1889 wakaba 1.12 data => $self->{kwd},
1890 wakaba 1.1 line => $self->{line_prev},
1891 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1892 wakaba 1.1 };
1893     redo A;
1894     }
1895     } elsif ($self->{state} == COMMENT_START_STATE) {
1896     if ($self->{nc} == 0x002D) { # -
1897     !!!cp (137);
1898     $self->{state} = COMMENT_START_DASH_STATE;
1899     !!!next-input-character;
1900     redo A;
1901     } elsif ($self->{nc} == 0x003E) { # >
1902     !!!parse-error (type => 'bogus comment');
1903 wakaba 1.13 if ($self->{in_subset}) {
1904     !!!cp (138.1);
1905     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1906     } else {
1907     !!!cp (138);
1908     $self->{state} = DATA_STATE;
1909     $self->{s_kwd} = '';
1910     }
1911 wakaba 1.1 !!!next-input-character;
1912    
1913     !!!emit ($self->{ct}); # comment
1914    
1915     redo A;
1916     } elsif ($self->{nc} == -1) {
1917     !!!parse-error (type => 'unclosed comment');
1918 wakaba 1.13 if ($self->{in_subset}) {
1919     !!!cp (139.1);
1920     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1921     } else {
1922     !!!cp (139);
1923     $self->{state} = DATA_STATE;
1924     $self->{s_kwd} = '';
1925     }
1926 wakaba 1.1 ## reconsume
1927    
1928     !!!emit ($self->{ct}); # comment
1929    
1930     redo A;
1931     } else {
1932     !!!cp (140);
1933     $self->{ct}->{data} # comment
1934     .= chr ($self->{nc});
1935     $self->{state} = COMMENT_STATE;
1936     !!!next-input-character;
1937     redo A;
1938     }
1939     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1940     if ($self->{nc} == 0x002D) { # -
1941     !!!cp (141);
1942     $self->{state} = COMMENT_END_STATE;
1943     !!!next-input-character;
1944     redo A;
1945     } elsif ($self->{nc} == 0x003E) { # >
1946     !!!parse-error (type => 'bogus comment');
1947 wakaba 1.13 if ($self->{in_subset}) {
1948     !!!cp (142.1);
1949     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1950     } else {
1951     !!!cp (142);
1952     $self->{state} = DATA_STATE;
1953     $self->{s_kwd} = '';
1954     }
1955 wakaba 1.1 !!!next-input-character;
1956    
1957     !!!emit ($self->{ct}); # comment
1958    
1959     redo A;
1960     } elsif ($self->{nc} == -1) {
1961     !!!parse-error (type => 'unclosed comment');
1962 wakaba 1.13 if ($self->{in_subset}) {
1963     !!!cp (143.1);
1964     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1965     } else {
1966     !!!cp (143);
1967     $self->{state} = DATA_STATE;
1968     $self->{s_kwd} = '';
1969     }
1970 wakaba 1.1 ## reconsume
1971    
1972     !!!emit ($self->{ct}); # comment
1973    
1974     redo A;
1975     } else {
1976     !!!cp (144);
1977     $self->{ct}->{data} # comment
1978     .= '-' . chr ($self->{nc});
1979     $self->{state} = COMMENT_STATE;
1980     !!!next-input-character;
1981     redo A;
1982     }
1983     } elsif ($self->{state} == COMMENT_STATE) {
1984 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
1985    
1986 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1987     !!!cp (145);
1988     $self->{state} = COMMENT_END_DASH_STATE;
1989     !!!next-input-character;
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (146.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (146);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (147);
2008     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2009     $self->{read_until}->($self->{ct}->{data},
2010     q[-],
2011     length $self->{ct}->{data});
2012    
2013     ## Stay in the state
2014     !!!next-input-character;
2015     redo A;
2016     }
2017     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2018 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2019 wakaba 1.10
2020 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2021     !!!cp (148);
2022     $self->{state} = COMMENT_END_STATE;
2023     !!!next-input-character;
2024     redo A;
2025     } elsif ($self->{nc} == -1) {
2026     !!!parse-error (type => 'unclosed comment');
2027 wakaba 1.13 if ($self->{in_subset}) {
2028     !!!cp (149.1);
2029     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2030     } else {
2031     !!!cp (149);
2032     $self->{state} = DATA_STATE;
2033     $self->{s_kwd} = '';
2034     }
2035 wakaba 1.1 ## reconsume
2036    
2037     !!!emit ($self->{ct}); # comment
2038    
2039     redo A;
2040     } else {
2041     !!!cp (150);
2042     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2043     $self->{state} = COMMENT_STATE;
2044     !!!next-input-character;
2045     redo A;
2046     }
2047     } elsif ($self->{state} == COMMENT_END_STATE) {
2048 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2049    
2050 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2051 wakaba 1.13 if ($self->{in_subset}) {
2052     !!!cp (151.1);
2053     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2054     } else {
2055     !!!cp (151);
2056     $self->{state} = DATA_STATE;
2057     $self->{s_kwd} = '';
2058     }
2059 wakaba 1.1 !!!next-input-character;
2060    
2061     !!!emit ($self->{ct}); # comment
2062    
2063     redo A;
2064     } elsif ($self->{nc} == 0x002D) { # -
2065     !!!cp (152);
2066 wakaba 1.10 ## XML5: Not a parse error.
2067 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2068     line => $self->{line_prev},
2069     column => $self->{column_prev});
2070     $self->{ct}->{data} .= '-'; # comment
2071     ## Stay in the state
2072     !!!next-input-character;
2073     redo A;
2074     } elsif ($self->{nc} == -1) {
2075     !!!parse-error (type => 'unclosed comment');
2076 wakaba 1.13 if ($self->{in_subset}) {
2077     !!!cp (153.1);
2078     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2079     } else {
2080     !!!cp (153);
2081     $self->{state} = DATA_STATE;
2082     $self->{s_kwd} = '';
2083     }
2084 wakaba 1.1 ## reconsume
2085    
2086     !!!emit ($self->{ct}); # comment
2087    
2088     redo A;
2089     } else {
2090     !!!cp (154);
2091 wakaba 1.10 ## XML5: Not a parse error.
2092 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2093     line => $self->{line_prev},
2094     column => $self->{column_prev});
2095     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2096     $self->{state} = COMMENT_STATE;
2097     !!!next-input-character;
2098     redo A;
2099     }
2100     } elsif ($self->{state} == DOCTYPE_STATE) {
2101     if ($is_space->{$self->{nc}}) {
2102     !!!cp (155);
2103     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2104     !!!next-input-character;
2105     redo A;
2106     } else {
2107     !!!cp (156);
2108 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2109 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2110     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2111     ## reconsume
2112     redo A;
2113     }
2114     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2115 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2116    
2117 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2118     !!!cp (157);
2119     ## Stay in the state
2120     !!!next-input-character;
2121     redo A;
2122     } elsif ($self->{nc} == 0x003E) { # >
2123     !!!cp (158);
2124 wakaba 1.12 ## XML5: No parse error.
2125 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2126     $self->{state} = DATA_STATE;
2127 wakaba 1.5 $self->{s_kwd} = '';
2128 wakaba 1.1 !!!next-input-character;
2129    
2130     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2131    
2132     redo A;
2133     } elsif ($self->{nc} == -1) {
2134     !!!cp (159);
2135     !!!parse-error (type => 'no DOCTYPE name');
2136     $self->{state} = DATA_STATE;
2137 wakaba 1.5 $self->{s_kwd} = '';
2138 wakaba 1.1 ## reconsume
2139    
2140     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2141    
2142     redo A;
2143 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2144     !!!cp (159.1);
2145     !!!parse-error (type => 'no DOCTYPE name');
2146     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2147 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2148     $self->{in_subset} = 1;
2149 wakaba 1.12 !!!next-input-character;
2150 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2151 wakaba 1.12 redo A;
2152 wakaba 1.1 } else {
2153     !!!cp (160);
2154     $self->{ct}->{name} = chr $self->{nc};
2155     delete $self->{ct}->{quirks};
2156     $self->{state} = DOCTYPE_NAME_STATE;
2157     !!!next-input-character;
2158     redo A;
2159     }
2160     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2161 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2162    
2163     ## ISSUE: Redundant "First," in the spec.
2164    
2165 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2166     !!!cp (161);
2167     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2168     !!!next-input-character;
2169     redo A;
2170     } elsif ($self->{nc} == 0x003E) { # >
2171     !!!cp (162);
2172     $self->{state} = DATA_STATE;
2173 wakaba 1.5 $self->{s_kwd} = '';
2174 wakaba 1.1 !!!next-input-character;
2175    
2176     !!!emit ($self->{ct}); # DOCTYPE
2177    
2178     redo A;
2179     } elsif ($self->{nc} == -1) {
2180     !!!cp (163);
2181     !!!parse-error (type => 'unclosed DOCTYPE');
2182     $self->{state} = DATA_STATE;
2183 wakaba 1.5 $self->{s_kwd} = '';
2184 wakaba 1.1 ## reconsume
2185    
2186     $self->{ct}->{quirks} = 1;
2187     !!!emit ($self->{ct}); # DOCTYPE
2188    
2189     redo A;
2190 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2191     !!!cp (163.1);
2192     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2193 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2194     $self->{in_subset} = 1;
2195 wakaba 1.12 !!!next-input-character;
2196 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2197 wakaba 1.12 redo A;
2198 wakaba 1.1 } else {
2199     !!!cp (164);
2200     $self->{ct}->{name}
2201     .= chr ($self->{nc}); # DOCTYPE
2202     ## Stay in the state
2203     !!!next-input-character;
2204     redo A;
2205     }
2206     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2207 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2208     ## state", but implemented differently.
2209    
2210 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2211     !!!cp (165);
2212     ## Stay in the state
2213     !!!next-input-character;
2214     redo A;
2215     } elsif ($self->{nc} == 0x003E) { # >
2216 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2217     !!!cp (166);
2218     $self->{state} = DATA_STATE;
2219     $self->{s_kwd} = '';
2220     } else {
2221     !!!cp (166.1);
2222     !!!parse-error (type => 'no md def'); ## TODO: type
2223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2224     }
2225    
2226 wakaba 1.1 !!!next-input-character;
2227 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2228 wakaba 1.1 redo A;
2229     } elsif ($self->{nc} == -1) {
2230 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2231     !!!cp (167);
2232     !!!parse-error (type => 'unclosed DOCTYPE');
2233     $self->{state} = DATA_STATE;
2234     $self->{s_kwd} = '';
2235     $self->{ct}->{quirks} = 1;
2236     } else {
2237     !!!cp (167.12);
2238     !!!parse-error (type => 'unclosed md'); ## TODO: type
2239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2240     }
2241    
2242     ## Reconsume.
2243     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2244 wakaba 1.1 redo A;
2245     } elsif ($self->{nc} == 0x0050 or # P
2246     $self->{nc} == 0x0070) { # p
2247 wakaba 1.12 !!!cp (167.1);
2248 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2249 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2250 wakaba 1.1 !!!next-input-character;
2251     redo A;
2252     } elsif ($self->{nc} == 0x0053 or # S
2253     $self->{nc} == 0x0073) { # s
2254 wakaba 1.12 !!!cp (167.2);
2255 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2256 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2257     !!!next-input-character;
2258     redo A;
2259 wakaba 1.16 ## TODO: " and ' for ENTITY
2260     } elsif ($self->{is_xml} and
2261     $self->{ct}->{type} == DOCTYPE_TOKEN and
2262     $self->{nc} == 0x005B) { # [
2263 wakaba 1.12 !!!cp (167.3);
2264     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2265     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2266 wakaba 1.13 $self->{in_subset} = 1;
2267 wakaba 1.1 !!!next-input-character;
2268 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2269 wakaba 1.1 redo A;
2270     } else {
2271 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2272    
2273     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2274     !!!cp (180);
2275     $self->{ct}->{quirks} = 1;
2276     $self->{state} = BOGUS_DOCTYPE_STATE;
2277     } else {
2278     !!!cp (180.1);
2279     $self->{state} = BOGUS_MD_STATE;
2280     }
2281 wakaba 1.1
2282     !!!next-input-character;
2283     redo A;
2284     }
2285     } elsif ($self->{state} == PUBLIC_STATE) {
2286     ## ASCII case-insensitive
2287     if ($self->{nc} == [
2288     undef,
2289     0x0055, # U
2290     0x0042, # B
2291     0x004C, # L
2292     0x0049, # I
2293 wakaba 1.12 ]->[length $self->{kwd}] or
2294 wakaba 1.1 $self->{nc} == [
2295     undef,
2296     0x0075, # u
2297     0x0062, # b
2298     0x006C, # l
2299     0x0069, # i
2300 wakaba 1.12 ]->[length $self->{kwd}]) {
2301 wakaba 1.1 !!!cp (175);
2302     ## Stay in the state.
2303 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2304 wakaba 1.1 !!!next-input-character;
2305     redo A;
2306 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2307 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2308     $self->{nc} == 0x0063)) { # c
2309 wakaba 1.12 if ($self->{is_xml} and
2310     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2311     !!!cp (168.1);
2312     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2313     text => 'PUBLIC',
2314     line => $self->{line_prev},
2315     column => $self->{column_prev} - 4);
2316     } else {
2317     !!!cp (168);
2318     }
2319 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2320     !!!next-input-character;
2321     redo A;
2322     } else {
2323 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2324 wakaba 1.1 line => $self->{line_prev},
2325 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2326 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2327     !!!cp (169);
2328     $self->{ct}->{quirks} = 1;
2329     $self->{state} = BOGUS_DOCTYPE_STATE;
2330     } else {
2331     !!!cp (169.1);
2332     $self->{state} = BOGUS_MD_STATE;
2333     }
2334 wakaba 1.1 ## Reconsume.
2335     redo A;
2336     }
2337     } elsif ($self->{state} == SYSTEM_STATE) {
2338     ## ASCII case-insensitive
2339     if ($self->{nc} == [
2340     undef,
2341     0x0059, # Y
2342     0x0053, # S
2343     0x0054, # T
2344     0x0045, # E
2345 wakaba 1.12 ]->[length $self->{kwd}] or
2346 wakaba 1.1 $self->{nc} == [
2347     undef,
2348     0x0079, # y
2349     0x0073, # s
2350     0x0074, # t
2351     0x0065, # e
2352 wakaba 1.12 ]->[length $self->{kwd}]) {
2353 wakaba 1.1 !!!cp (170);
2354     ## Stay in the state.
2355 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2356 wakaba 1.1 !!!next-input-character;
2357     redo A;
2358 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2359 wakaba 1.1 ($self->{nc} == 0x004D or # M
2360     $self->{nc} == 0x006D)) { # m
2361 wakaba 1.12 if ($self->{is_xml} and
2362     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2363     !!!cp (171.1);
2364     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2365     text => 'SYSTEM',
2366     line => $self->{line_prev},
2367     column => $self->{column_prev} - 4);
2368     } else {
2369     !!!cp (171);
2370     }
2371 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2372     !!!next-input-character;
2373     redo A;
2374     } else {
2375 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2376 wakaba 1.1 line => $self->{line_prev},
2377 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2378 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2379     !!!cp (172);
2380     $self->{ct}->{quirks} = 1;
2381     $self->{state} = BOGUS_DOCTYPE_STATE;
2382     } else {
2383     !!!cp (172.1);
2384     $self->{state} = BOGUS_MD_STATE;
2385     }
2386 wakaba 1.1 ## Reconsume.
2387     redo A;
2388     }
2389     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2390     if ($is_space->{$self->{nc}}) {
2391     !!!cp (181);
2392     ## Stay in the state
2393     !!!next-input-character;
2394     redo A;
2395     } elsif ($self->{nc} eq 0x0022) { # "
2396     !!!cp (182);
2397     $self->{ct}->{pubid} = ''; # DOCTYPE
2398     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2399     !!!next-input-character;
2400     redo A;
2401     } elsif ($self->{nc} eq 0x0027) { # '
2402     !!!cp (183);
2403     $self->{ct}->{pubid} = ''; # DOCTYPE
2404     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2405     !!!next-input-character;
2406     redo A;
2407     } elsif ($self->{nc} eq 0x003E) { # >
2408     !!!parse-error (type => 'no PUBLIC literal');
2409 wakaba 1.16
2410     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2411     !!!cp (184);
2412     $self->{state} = DATA_STATE;
2413     $self->{s_kwd} = '';
2414     $self->{ct}->{quirks} = 1;
2415     } else {
2416     !!!cp (184.1);
2417     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2418     }
2419    
2420 wakaba 1.1 !!!next-input-character;
2421 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2422 wakaba 1.1 redo A;
2423     } elsif ($self->{nc} == -1) {
2424 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2425     !!!cp (185);
2426     !!!parse-error (type => 'unclosed DOCTYPE');
2427     $self->{state} = DATA_STATE;
2428     $self->{s_kwd} = '';
2429     $self->{ct}->{quirks} = 1;
2430     } else {
2431     !!!cp (185.1);
2432     !!!parse-error (type => 'unclosed md'); ## TODO: type
2433     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2434     }
2435    
2436 wakaba 1.1 ## reconsume
2437     !!!emit ($self->{ct}); # DOCTYPE
2438     redo A;
2439 wakaba 1.16 } elsif ($self->{is_xml} and
2440     $self->{ct}->{type} == DOCTYPE_TOKEN and
2441     $self->{nc} == 0x005B) { # [
2442 wakaba 1.12 !!!cp (186.1);
2443     !!!parse-error (type => 'no PUBLIC literal');
2444     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2445     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2446 wakaba 1.13 $self->{in_subset} = 1;
2447 wakaba 1.12 !!!next-input-character;
2448 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2449 wakaba 1.12 redo A;
2450 wakaba 1.1 } else {
2451     !!!parse-error (type => 'string after PUBLIC');
2452    
2453 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2454     !!!cp (186);
2455     $self->{ct}->{quirks} = 1;
2456     $self->{state} = BOGUS_DOCTYPE_STATE;
2457     } else {
2458     !!!cp (186.2);
2459     $self->{state} = BOGUS_MD_STATE;
2460     }
2461    
2462 wakaba 1.1 !!!next-input-character;
2463     redo A;
2464     }
2465     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2466     if ($self->{nc} == 0x0022) { # "
2467     !!!cp (187);
2468     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2469     !!!next-input-character;
2470     redo A;
2471     } elsif ($self->{nc} == 0x003E) { # >
2472     !!!parse-error (type => 'unclosed PUBLIC literal');
2473    
2474 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2475     !!!cp (188);
2476     $self->{state} = DATA_STATE;
2477     $self->{s_kwd} = '';
2478     $self->{ct}->{quirks} = 1;
2479     } else {
2480     !!!cp (188.1);
2481     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2482     }
2483    
2484 wakaba 1.1 !!!next-input-character;
2485 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2486 wakaba 1.1 redo A;
2487     } elsif ($self->{nc} == -1) {
2488     !!!parse-error (type => 'unclosed PUBLIC literal');
2489    
2490 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2491     !!!cp (189);
2492     $self->{state} = DATA_STATE;
2493     $self->{s_kwd} = '';
2494     $self->{ct}->{quirks} = 1;
2495     } else {
2496     !!!cp (189.1);
2497     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2498     }
2499    
2500     ## Reconsume.
2501 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2502     redo A;
2503     } else {
2504     !!!cp (190);
2505 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2506 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2507     length $self->{ct}->{pubid});
2508    
2509     ## Stay in the state
2510     !!!next-input-character;
2511     redo A;
2512     }
2513     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2514     if ($self->{nc} == 0x0027) { # '
2515     !!!cp (191);
2516     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2517     !!!next-input-character;
2518     redo A;
2519     } elsif ($self->{nc} == 0x003E) { # >
2520     !!!parse-error (type => 'unclosed PUBLIC literal');
2521    
2522 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2523     !!!cp (192);
2524     $self->{state} = DATA_STATE;
2525     $self->{s_kwd} = '';
2526     $self->{ct}->{quirks} = 1;
2527     } else {
2528     !!!cp (192.1);
2529     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2530     }
2531    
2532 wakaba 1.1 !!!next-input-character;
2533 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2534 wakaba 1.1 redo A;
2535     } elsif ($self->{nc} == -1) {
2536     !!!parse-error (type => 'unclosed PUBLIC literal');
2537    
2538 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2539     !!!cp (193);
2540     $self->{state} = DATA_STATE;
2541     $self->{s_kwd} = '';
2542     $self->{ct}->{quirks} = 1;
2543     } else {
2544     !!!cp (193.1);
2545     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2546     }
2547    
2548 wakaba 1.1 ## reconsume
2549 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2550 wakaba 1.1 redo A;
2551     } else {
2552     !!!cp (194);
2553 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2554 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2555     length $self->{ct}->{pubid});
2556    
2557     ## Stay in the state
2558     !!!next-input-character;
2559     redo A;
2560     }
2561     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2562     if ($is_space->{$self->{nc}}) {
2563     !!!cp (195);
2564     ## Stay in the state
2565     !!!next-input-character;
2566     redo A;
2567     } elsif ($self->{nc} == 0x0022) { # "
2568     !!!cp (196);
2569 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2570 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2571     !!!next-input-character;
2572     redo A;
2573     } elsif ($self->{nc} == 0x0027) { # '
2574     !!!cp (197);
2575 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2576 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2577     !!!next-input-character;
2578     redo A;
2579     } elsif ($self->{nc} == 0x003E) { # >
2580 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2581     if ($self->{is_xml}) {
2582     !!!cp (198.1);
2583     !!!parse-error (type => 'no SYSTEM literal');
2584     } else {
2585     !!!cp (198);
2586     }
2587     $self->{state} = DATA_STATE;
2588     $self->{s_kwd} = '';
2589 wakaba 1.12 } else {
2590 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2591     !!!cp (198.2);
2592     } else {
2593     !!!cp (198.3);
2594     !!!parse-error (type => 'no SYSTEM literal');
2595     }
2596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597 wakaba 1.12 }
2598 wakaba 1.16
2599 wakaba 1.1 !!!next-input-character;
2600 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2601 wakaba 1.1 redo A;
2602     } elsif ($self->{nc} == -1) {
2603 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2604     !!!cp (199);
2605     !!!parse-error (type => 'unclosed DOCTYPE');
2606    
2607     $self->{state} = DATA_STATE;
2608     $self->{s_kwd} = '';
2609     $self->{ct}->{quirks} = 1;
2610     } else {
2611     !!!parse-error (type => 'unclosed md'); ## TODO: type
2612     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2613     }
2614    
2615 wakaba 1.1 ## reconsume
2616 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2617 wakaba 1.1 redo A;
2618 wakaba 1.16 } elsif ($self->{is_xml} and
2619     $self->{ct}->{type} == DOCTYPE_TOKEN and
2620     $self->{nc} == 0x005B) { # [
2621 wakaba 1.12 !!!cp (200.1);
2622     !!!parse-error (type => 'no SYSTEM literal');
2623     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2624     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2625 wakaba 1.13 $self->{in_subset} = 1;
2626 wakaba 1.12 !!!next-input-character;
2627 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2628 wakaba 1.12 redo A;
2629 wakaba 1.1 } else {
2630     !!!parse-error (type => 'string after PUBLIC literal');
2631    
2632 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2633     !!!cp (200);
2634     $self->{ct}->{quirks} = 1;
2635     $self->{state} = BOGUS_DOCTYPE_STATE;
2636     } else {
2637     !!!cp (200.2);
2638     $self->{state} = BOGUS_MD_STATE;
2639     }
2640    
2641 wakaba 1.1 !!!next-input-character;
2642     redo A;
2643     }
2644     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2645     if ($is_space->{$self->{nc}}) {
2646     !!!cp (201);
2647     ## Stay in the state
2648     !!!next-input-character;
2649     redo A;
2650     } elsif ($self->{nc} == 0x0022) { # "
2651     !!!cp (202);
2652     $self->{ct}->{sysid} = ''; # DOCTYPE
2653     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2654     !!!next-input-character;
2655     redo A;
2656     } elsif ($self->{nc} == 0x0027) { # '
2657     !!!cp (203);
2658     $self->{ct}->{sysid} = ''; # DOCTYPE
2659     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2660     !!!next-input-character;
2661     redo A;
2662     } elsif ($self->{nc} == 0x003E) { # >
2663     !!!parse-error (type => 'no SYSTEM literal');
2664     !!!next-input-character;
2665    
2666 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2667     !!!cp (204);
2668     $self->{state} = DATA_STATE;
2669     $self->{s_kwd} = '';
2670     $self->{ct}->{quirks} = 1;
2671     } else {
2672     !!!cp (204.1);
2673     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2674     }
2675 wakaba 1.1
2676 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2677 wakaba 1.1 redo A;
2678     } elsif ($self->{nc} == -1) {
2679 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2680     !!!cp (205);
2681     !!!parse-error (type => 'unclosed DOCTYPE');
2682     $self->{state} = DATA_STATE;
2683     $self->{s_kwd} = '';
2684     $self->{ct}->{quirks} = 1;
2685     } else {
2686     !!!cp (205.1);
2687     !!!parse-error (type => 'unclosed md'); ## TODO: type
2688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2689     }
2690    
2691 wakaba 1.1 ## reconsume
2692 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2693 wakaba 1.1 redo A;
2694 wakaba 1.16 } elsif ($self->{is_xml} and
2695     $self->{ct}->{type} == DOCTYPE_TOKEN and
2696     $self->{nc} == 0x005B) { # [
2697 wakaba 1.12 !!!cp (206.1);
2698     !!!parse-error (type => 'no SYSTEM literal');
2699    
2700     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2701     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2702 wakaba 1.13 $self->{in_subset} = 1;
2703 wakaba 1.12 !!!next-input-character;
2704 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2705 wakaba 1.12 redo A;
2706 wakaba 1.1 } else {
2707     !!!parse-error (type => 'string after SYSTEM');
2708    
2709 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2710     !!!cp (206);
2711     $self->{ct}->{quirks} = 1;
2712     $self->{state} = BOGUS_DOCTYPE_STATE;
2713     } else {
2714     !!!cp (206.2);
2715     $self->{state} = BOGUS_MD_STATE;
2716     }
2717    
2718 wakaba 1.1 !!!next-input-character;
2719     redo A;
2720     }
2721     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2722     if ($self->{nc} == 0x0022) { # "
2723     !!!cp (207);
2724     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2725     !!!next-input-character;
2726     redo A;
2727 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2728 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2729    
2730 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2731     !!!cp (208);
2732     $self->{state} = DATA_STATE;
2733     $self->{s_kwd} = '';
2734     $self->{ct}->{quirks} = 1;
2735     } else {
2736     !!!cp (208.1);
2737     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2738     }
2739    
2740 wakaba 1.1 !!!next-input-character;
2741 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2742 wakaba 1.1 redo A;
2743     } elsif ($self->{nc} == -1) {
2744     !!!parse-error (type => 'unclosed SYSTEM literal');
2745    
2746 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2747     !!!cp (209);
2748     $self->{state} = DATA_STATE;
2749     $self->{s_kwd} = '';
2750     $self->{ct}->{quirks} = 1;
2751     } else {
2752     !!!cp (209.1);
2753     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2754     }
2755    
2756 wakaba 1.1 ## reconsume
2757 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2758 wakaba 1.1 redo A;
2759     } else {
2760     !!!cp (210);
2761 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2762 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2763     length $self->{ct}->{sysid});
2764    
2765     ## Stay in the state
2766     !!!next-input-character;
2767     redo A;
2768     }
2769     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2770     if ($self->{nc} == 0x0027) { # '
2771     !!!cp (211);
2772     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2773     !!!next-input-character;
2774     redo A;
2775 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2776 wakaba 1.1 !!!cp (212);
2777     !!!parse-error (type => 'unclosed SYSTEM literal');
2778    
2779     $self->{state} = DATA_STATE;
2780 wakaba 1.5 $self->{s_kwd} = '';
2781 wakaba 1.1 !!!next-input-character;
2782    
2783     $self->{ct}->{quirks} = 1;
2784     !!!emit ($self->{ct}); # DOCTYPE
2785    
2786     redo A;
2787     } elsif ($self->{nc} == -1) {
2788     !!!parse-error (type => 'unclosed SYSTEM literal');
2789    
2790 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2791     !!!cp (213);
2792     $self->{state} = DATA_STATE;
2793     $self->{s_kwd} = '';
2794     $self->{ct}->{quirks} = 1;
2795     } else {
2796     !!!cp (213.1);
2797     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2798     }
2799    
2800 wakaba 1.1 ## reconsume
2801 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2802 wakaba 1.1 redo A;
2803     } else {
2804     !!!cp (214);
2805 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2806 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2807     length $self->{ct}->{sysid});
2808    
2809     ## Stay in the state
2810     !!!next-input-character;
2811     redo A;
2812     }
2813     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2814     if ($is_space->{$self->{nc}}) {
2815     !!!cp (215);
2816     ## Stay in the state
2817     !!!next-input-character;
2818     redo A;
2819     } elsif ($self->{nc} == 0x003E) { # >
2820 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2821     !!!cp (216);
2822     $self->{state} = DATA_STATE;
2823     $self->{s_kwd} = '';
2824     } else {
2825     !!!cp (216.1);
2826     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2827     }
2828    
2829 wakaba 1.1 !!!next-input-character;
2830 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2831 wakaba 1.1 redo A;
2832 wakaba 1.16 ## TODO: "NDATA"
2833 wakaba 1.1 } elsif ($self->{nc} == -1) {
2834 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2835     !!!cp (217);
2836     !!!parse-error (type => 'unclosed DOCTYPE');
2837     $self->{state} = DATA_STATE;
2838     $self->{s_kwd} = '';
2839     $self->{ct}->{quirks} = 1;
2840     } else {
2841     !!!cp (217.1);
2842     !!!parse-error (type => 'unclosed md'); ## TODO: type
2843     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2844     }
2845    
2846 wakaba 1.1 ## reconsume
2847 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2848 wakaba 1.1 redo A;
2849 wakaba 1.16 } elsif ($self->{is_xml} and
2850     $self->{ct}->{type} == DOCTYPE_TOKEN and
2851     $self->{nc} == 0x005B) { # [
2852 wakaba 1.12 !!!cp (218.1);
2853     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2854     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2855 wakaba 1.13 $self->{in_subset} = 1;
2856 wakaba 1.12 !!!next-input-character;
2857 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2858 wakaba 1.12 redo A;
2859 wakaba 1.1 } else {
2860     !!!parse-error (type => 'string after SYSTEM literal');
2861    
2862 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2863     !!!cp (218);
2864     #$self->{ct}->{quirks} = 1;
2865     $self->{state} = BOGUS_DOCTYPE_STATE;
2866     } else {
2867     !!!cp (218.2);
2868     $self->{state} = BOGUS_MD_STATE;
2869     }
2870    
2871 wakaba 1.1 !!!next-input-character;
2872     redo A;
2873     }
2874     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2875     if ($self->{nc} == 0x003E) { # >
2876     !!!cp (219);
2877     $self->{state} = DATA_STATE;
2878 wakaba 1.5 $self->{s_kwd} = '';
2879 wakaba 1.1 !!!next-input-character;
2880    
2881     !!!emit ($self->{ct}); # DOCTYPE
2882    
2883     redo A;
2884 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2885 wakaba 1.13 !!!cp (220.1);
2886     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2887     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2888     $self->{in_subset} = 1;
2889     !!!next-input-character;
2890     !!!emit ($self->{ct}); # DOCTYPE
2891     redo A;
2892 wakaba 1.1 } elsif ($self->{nc} == -1) {
2893     !!!cp (220);
2894     $self->{state} = DATA_STATE;
2895 wakaba 1.5 $self->{s_kwd} = '';
2896 wakaba 1.1 ## reconsume
2897    
2898     !!!emit ($self->{ct}); # DOCTYPE
2899    
2900     redo A;
2901     } else {
2902     !!!cp (221);
2903     my $s = '';
2904 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2905 wakaba 1.1
2906     ## Stay in the state
2907     !!!next-input-character;
2908     redo A;
2909     }
2910     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2911     ## NOTE: "CDATA section state" in the spec is jointly implemented
2912     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2913     ## and |CDATA_SECTION_MSE2_STATE|.
2914 wakaba 1.10
2915     ## XML5: "CDATA state".
2916 wakaba 1.1
2917     if ($self->{nc} == 0x005D) { # ]
2918     !!!cp (221.1);
2919     $self->{state} = CDATA_SECTION_MSE1_STATE;
2920     !!!next-input-character;
2921     redo A;
2922     } elsif ($self->{nc} == -1) {
2923 wakaba 1.6 if ($self->{is_xml}) {
2924 wakaba 1.8 !!!cp (221.11);
2925 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2926 wakaba 1.8 } else {
2927     !!!cp (221.12);
2928 wakaba 1.6 }
2929    
2930 wakaba 1.1 $self->{state} = DATA_STATE;
2931 wakaba 1.5 $self->{s_kwd} = '';
2932 wakaba 1.10 ## Reconsume.
2933 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2934     !!!cp (221.2);
2935     !!!emit ($self->{ct}); # character
2936     } else {
2937     !!!cp (221.3);
2938     ## No token to emit. $self->{ct} is discarded.
2939     }
2940     redo A;
2941     } else {
2942     !!!cp (221.4);
2943     $self->{ct}->{data} .= chr $self->{nc};
2944     $self->{read_until}->($self->{ct}->{data},
2945     q<]>,
2946     length $self->{ct}->{data});
2947    
2948     ## Stay in the state.
2949     !!!next-input-character;
2950     redo A;
2951     }
2952    
2953     ## ISSUE: "text tokens" in spec.
2954     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2955 wakaba 1.10 ## XML5: "CDATA bracket state".
2956    
2957 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2958     !!!cp (221.5);
2959     $self->{state} = CDATA_SECTION_MSE2_STATE;
2960     !!!next-input-character;
2961     redo A;
2962     } else {
2963     !!!cp (221.6);
2964 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
2965 wakaba 1.1 $self->{ct}->{data} .= ']';
2966 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2967 wakaba 1.1 ## Reconsume.
2968     redo A;
2969     }
2970     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2971 wakaba 1.10 ## XML5: "CDATA end state".
2972    
2973 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2974     $self->{state} = DATA_STATE;
2975 wakaba 1.5 $self->{s_kwd} = '';
2976 wakaba 1.1 !!!next-input-character;
2977     if (length $self->{ct}->{data}) { # character
2978     !!!cp (221.7);
2979     !!!emit ($self->{ct}); # character
2980     } else {
2981     !!!cp (221.8);
2982     ## No token to emit. $self->{ct} is discarded.
2983     }
2984     redo A;
2985     } elsif ($self->{nc} == 0x005D) { # ]
2986     !!!cp (221.9); # character
2987     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2988     ## Stay in the state.
2989     !!!next-input-character;
2990     redo A;
2991     } else {
2992     !!!cp (221.11);
2993     $self->{ct}->{data} .= ']]'; # character
2994     $self->{state} = CDATA_SECTION_STATE;
2995 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2996 wakaba 1.1 redo A;
2997     }
2998     } elsif ($self->{state} == ENTITY_STATE) {
2999     if ($is_space->{$self->{nc}} or
3000     {
3001     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3002     $self->{entity_add} => 1,
3003     }->{$self->{nc}}) {
3004     !!!cp (1001);
3005     ## Don't consume
3006     ## No error
3007     ## Return nothing.
3008     #
3009     } elsif ($self->{nc} == 0x0023) { # #
3010     !!!cp (999);
3011     $self->{state} = ENTITY_HASH_STATE;
3012 wakaba 1.12 $self->{kwd} = '#';
3013 wakaba 1.1 !!!next-input-character;
3014     redo A;
3015     } elsif ((0x0041 <= $self->{nc} and
3016     $self->{nc} <= 0x005A) or # A..Z
3017     (0x0061 <= $self->{nc} and
3018     $self->{nc} <= 0x007A)) { # a..z
3019     !!!cp (998);
3020     require Whatpm::_NamedEntityList;
3021     $self->{state} = ENTITY_NAME_STATE;
3022 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3023     $self->{entity__value} = $self->{kwd};
3024 wakaba 1.1 $self->{entity__match} = 0;
3025     !!!next-input-character;
3026     redo A;
3027     } else {
3028     !!!cp (1027);
3029     !!!parse-error (type => 'bare ero');
3030     ## Return nothing.
3031     #
3032     }
3033    
3034     ## NOTE: No character is consumed by the "consume a character
3035     ## reference" algorithm. In other words, there is an "&" character
3036     ## that does not introduce a character reference, which will be
3037     ## appended to the parent element or the attribute value by later
3038     ## processing in the tokenizer.
3039    
3040     if ($self->{prev_state} == DATA_STATE) {
3041     !!!cp (997);
3042     $self->{state} = $self->{prev_state};
3043 wakaba 1.5 $self->{s_kwd} = '';
3044 wakaba 1.1 ## Reconsume.
3045     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3046     line => $self->{line_prev},
3047     column => $self->{column_prev},
3048     });
3049     redo A;
3050     } else {
3051     !!!cp (996);
3052     $self->{ca}->{value} .= '&';
3053     $self->{state} = $self->{prev_state};
3054 wakaba 1.5 $self->{s_kwd} = '';
3055 wakaba 1.1 ## Reconsume.
3056     redo A;
3057     }
3058     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3059     if ($self->{nc} == 0x0078 or # x
3060     $self->{nc} == 0x0058) { # X
3061     !!!cp (995);
3062     $self->{state} = HEXREF_X_STATE;
3063 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3064 wakaba 1.1 !!!next-input-character;
3065     redo A;
3066     } elsif (0x0030 <= $self->{nc} and
3067     $self->{nc} <= 0x0039) { # 0..9
3068     !!!cp (994);
3069     $self->{state} = NCR_NUM_STATE;
3070 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3071 wakaba 1.1 !!!next-input-character;
3072     redo A;
3073     } else {
3074     !!!parse-error (type => 'bare nero',
3075     line => $self->{line_prev},
3076     column => $self->{column_prev} - 1);
3077    
3078     ## NOTE: According to the spec algorithm, nothing is returned,
3079     ## and then "&#" is appended to the parent element or the attribute
3080     ## value in the later processing.
3081    
3082     if ($self->{prev_state} == DATA_STATE) {
3083     !!!cp (1019);
3084     $self->{state} = $self->{prev_state};
3085 wakaba 1.5 $self->{s_kwd} = '';
3086 wakaba 1.1 ## Reconsume.
3087     !!!emit ({type => CHARACTER_TOKEN,
3088     data => '&#',
3089     line => $self->{line_prev},
3090     column => $self->{column_prev} - 1,
3091     });
3092     redo A;
3093     } else {
3094     !!!cp (993);
3095     $self->{ca}->{value} .= '&#';
3096     $self->{state} = $self->{prev_state};
3097 wakaba 1.5 $self->{s_kwd} = '';
3098 wakaba 1.1 ## Reconsume.
3099     redo A;
3100     }
3101     }
3102     } elsif ($self->{state} == NCR_NUM_STATE) {
3103     if (0x0030 <= $self->{nc} and
3104     $self->{nc} <= 0x0039) { # 0..9
3105     !!!cp (1012);
3106 wakaba 1.12 $self->{kwd} *= 10;
3107     $self->{kwd} += $self->{nc} - 0x0030;
3108 wakaba 1.1
3109     ## Stay in the state.
3110     !!!next-input-character;
3111     redo A;
3112     } elsif ($self->{nc} == 0x003B) { # ;
3113     !!!cp (1013);
3114     !!!next-input-character;
3115     #
3116     } else {
3117     !!!cp (1014);
3118     !!!parse-error (type => 'no refc');
3119     ## Reconsume.
3120     #
3121     }
3122    
3123 wakaba 1.12 my $code = $self->{kwd};
3124 wakaba 1.1 my $l = $self->{line_prev};
3125     my $c = $self->{column_prev};
3126     if ($charref_map->{$code}) {
3127     !!!cp (1015);
3128     !!!parse-error (type => 'invalid character reference',
3129     text => (sprintf 'U+%04X', $code),
3130     line => $l, column => $c);
3131     $code = $charref_map->{$code};
3132     } elsif ($code > 0x10FFFF) {
3133     !!!cp (1016);
3134     !!!parse-error (type => 'invalid character reference',
3135     text => (sprintf 'U-%08X', $code),
3136     line => $l, column => $c);
3137     $code = 0xFFFD;
3138     }
3139    
3140     if ($self->{prev_state} == DATA_STATE) {
3141     !!!cp (992);
3142     $self->{state} = $self->{prev_state};
3143 wakaba 1.5 $self->{s_kwd} = '';
3144 wakaba 1.1 ## Reconsume.
3145     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3146 wakaba 1.7 has_reference => 1,
3147 wakaba 1.1 line => $l, column => $c,
3148     });
3149     redo A;
3150     } else {
3151     !!!cp (991);
3152     $self->{ca}->{value} .= chr $code;
3153     $self->{ca}->{has_reference} = 1;
3154     $self->{state} = $self->{prev_state};
3155 wakaba 1.5 $self->{s_kwd} = '';
3156 wakaba 1.1 ## Reconsume.
3157     redo A;
3158     }
3159     } elsif ($self->{state} == HEXREF_X_STATE) {
3160     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3161     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3162     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3163     # 0..9, A..F, a..f
3164     !!!cp (990);
3165     $self->{state} = HEXREF_HEX_STATE;
3166 wakaba 1.12 $self->{kwd} = 0;
3167 wakaba 1.1 ## Reconsume.
3168     redo A;
3169     } else {
3170     !!!parse-error (type => 'bare hcro',
3171     line => $self->{line_prev},
3172     column => $self->{column_prev} - 2);
3173    
3174     ## NOTE: According to the spec algorithm, nothing is returned,
3175     ## and then "&#" followed by "X" or "x" is appended to the parent
3176     ## element or the attribute value in the later processing.
3177    
3178     if ($self->{prev_state} == DATA_STATE) {
3179     !!!cp (1005);
3180     $self->{state} = $self->{prev_state};
3181 wakaba 1.5 $self->{s_kwd} = '';
3182 wakaba 1.1 ## Reconsume.
3183     !!!emit ({type => CHARACTER_TOKEN,
3184 wakaba 1.12 data => '&' . $self->{kwd},
3185 wakaba 1.1 line => $self->{line_prev},
3186 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3187 wakaba 1.1 });
3188     redo A;
3189     } else {
3190     !!!cp (989);
3191 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3192 wakaba 1.1 $self->{state} = $self->{prev_state};
3193 wakaba 1.5 $self->{s_kwd} = '';
3194 wakaba 1.1 ## Reconsume.
3195     redo A;
3196     }
3197     }
3198     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3199     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3200     # 0..9
3201     !!!cp (1002);
3202 wakaba 1.12 $self->{kwd} *= 0x10;
3203     $self->{kwd} += $self->{nc} - 0x0030;
3204 wakaba 1.1 ## Stay in the state.
3205     !!!next-input-character;
3206     redo A;
3207     } elsif (0x0061 <= $self->{nc} and
3208     $self->{nc} <= 0x0066) { # a..f
3209     !!!cp (1003);
3210 wakaba 1.12 $self->{kwd} *= 0x10;
3211     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3212 wakaba 1.1 ## Stay in the state.
3213     !!!next-input-character;
3214     redo A;
3215     } elsif (0x0041 <= $self->{nc} and
3216     $self->{nc} <= 0x0046) { # A..F
3217     !!!cp (1004);
3218 wakaba 1.12 $self->{kwd} *= 0x10;
3219     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3220 wakaba 1.1 ## Stay in the state.
3221     !!!next-input-character;
3222     redo A;
3223     } elsif ($self->{nc} == 0x003B) { # ;
3224     !!!cp (1006);
3225     !!!next-input-character;
3226     #
3227     } else {
3228     !!!cp (1007);
3229     !!!parse-error (type => 'no refc',
3230     line => $self->{line},
3231     column => $self->{column});
3232     ## Reconsume.
3233     #
3234     }
3235    
3236 wakaba 1.12 my $code = $self->{kwd};
3237 wakaba 1.1 my $l = $self->{line_prev};
3238     my $c = $self->{column_prev};
3239     if ($charref_map->{$code}) {
3240     !!!cp (1008);
3241     !!!parse-error (type => 'invalid character reference',
3242     text => (sprintf 'U+%04X', $code),
3243     line => $l, column => $c);
3244     $code = $charref_map->{$code};
3245     } elsif ($code > 0x10FFFF) {
3246     !!!cp (1009);
3247     !!!parse-error (type => 'invalid character reference',
3248     text => (sprintf 'U-%08X', $code),
3249     line => $l, column => $c);
3250     $code = 0xFFFD;
3251     }
3252    
3253     if ($self->{prev_state} == DATA_STATE) {
3254     !!!cp (988);
3255     $self->{state} = $self->{prev_state};
3256 wakaba 1.5 $self->{s_kwd} = '';
3257 wakaba 1.1 ## Reconsume.
3258     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3259 wakaba 1.7 has_reference => 1,
3260 wakaba 1.1 line => $l, column => $c,
3261     });
3262     redo A;
3263     } else {
3264     !!!cp (987);
3265     $self->{ca}->{value} .= chr $code;
3266     $self->{ca}->{has_reference} = 1;
3267     $self->{state} = $self->{prev_state};
3268 wakaba 1.5 $self->{s_kwd} = '';
3269 wakaba 1.1 ## Reconsume.
3270     redo A;
3271     }
3272     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3273 wakaba 1.12 if (length $self->{kwd} < 30 and
3274 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
3275     ((0x0041 <= $self->{nc} and # A
3276     $self->{nc} <= 0x005A) or # Z
3277     (0x0061 <= $self->{nc} and # a
3278     $self->{nc} <= 0x007A) or # z
3279     (0x0030 <= $self->{nc} and # 0
3280     $self->{nc} <= 0x0039) or # 9
3281     $self->{nc} == 0x003B)) { # ;
3282     our $EntityChar;
3283 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3284     if (defined $EntityChar->{$self->{kwd}}) {
3285 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3286     !!!cp (1020);
3287 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3288 wakaba 1.1 $self->{entity__match} = 1;
3289     !!!next-input-character;
3290     #
3291     } else {
3292     !!!cp (1021);
3293 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3294 wakaba 1.1 $self->{entity__match} = -1;
3295     ## Stay in the state.
3296     !!!next-input-character;
3297     redo A;
3298     }
3299     } else {
3300     !!!cp (1022);
3301     $self->{entity__value} .= chr $self->{nc};
3302     $self->{entity__match} *= 2;
3303     ## Stay in the state.
3304     !!!next-input-character;
3305     redo A;
3306     }
3307     }
3308    
3309     my $data;
3310     my $has_ref;
3311     if ($self->{entity__match} > 0) {
3312     !!!cp (1023);
3313     $data = $self->{entity__value};
3314     $has_ref = 1;
3315     #
3316     } elsif ($self->{entity__match} < 0) {
3317     !!!parse-error (type => 'no refc');
3318     if ($self->{prev_state} != DATA_STATE and # in attribute
3319     $self->{entity__match} < -1) {
3320     !!!cp (1024);
3321 wakaba 1.12 $data = '&' . $self->{kwd};
3322 wakaba 1.1 #
3323     } else {
3324     !!!cp (1025);
3325     $data = $self->{entity__value};
3326     $has_ref = 1;
3327     #
3328     }
3329     } else {
3330     !!!cp (1026);
3331     !!!parse-error (type => 'bare ero',
3332     line => $self->{line_prev},
3333 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3334     $data = '&' . $self->{kwd};
3335 wakaba 1.1 #
3336     }
3337    
3338     ## NOTE: In these cases, when a character reference is found,
3339     ## it is consumed and a character token is returned, or, otherwise,
3340     ## nothing is consumed and returned, according to the spec algorithm.
3341     ## In this implementation, anything that has been examined by the
3342     ## tokenizer is appended to the parent element or the attribute value
3343     ## as string, either literal string when no character reference or
3344     ## entity-replaced string otherwise, in this stage, since any characters
3345     ## that would not be consumed are appended in the data state or in an
3346     ## appropriate attribute value state anyway.
3347    
3348     if ($self->{prev_state} == DATA_STATE) {
3349     !!!cp (986);
3350     $self->{state} = $self->{prev_state};
3351 wakaba 1.5 $self->{s_kwd} = '';
3352 wakaba 1.1 ## Reconsume.
3353     !!!emit ({type => CHARACTER_TOKEN,
3354     data => $data,
3355 wakaba 1.7 has_reference => $has_ref,
3356 wakaba 1.1 line => $self->{line_prev},
3357 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3358 wakaba 1.1 });
3359     redo A;
3360     } else {
3361     !!!cp (985);
3362     $self->{ca}->{value} .= $data;
3363     $self->{ca}->{has_reference} = 1 if $has_ref;
3364     $self->{state} = $self->{prev_state};
3365 wakaba 1.5 $self->{s_kwd} = '';
3366 wakaba 1.1 ## Reconsume.
3367     redo A;
3368     }
3369 wakaba 1.8
3370     ## XML-only states
3371    
3372     } elsif ($self->{state} == PI_STATE) {
3373 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3374    
3375 wakaba 1.8 if ($is_space->{$self->{nc}} or
3376 wakaba 1.14 $self->{nc} == 0x003F or # ?
3377 wakaba 1.8 $self->{nc} == -1) {
3378 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3379     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3380     ## "DOCTYPE pi state": Parse error, switch to the "data
3381     ## state".
3382 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3383     line => $self->{line_prev},
3384     column => $self->{column_prev}
3385     - 1 * ($self->{nc} != -1));
3386     $self->{state} = BOGUS_COMMENT_STATE;
3387     ## Reconsume.
3388     $self->{ct} = {type => COMMENT_TOKEN,
3389     data => '?',
3390     line => $self->{line_prev},
3391     column => $self->{column_prev}
3392     - 1 * ($self->{nc} != -1),
3393     };
3394     redo A;
3395     } else {
3396 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3397 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3398     target => chr $self->{nc},
3399     data => '',
3400     line => $self->{line_prev},
3401     column => $self->{column_prev} - 1,
3402     };
3403     $self->{state} = PI_TARGET_STATE;
3404     !!!next-input-character;
3405     redo A;
3406     }
3407     } elsif ($self->{state} == PI_TARGET_STATE) {
3408     if ($is_space->{$self->{nc}}) {
3409     $self->{state} = PI_TARGET_AFTER_STATE;
3410     !!!next-input-character;
3411     redo A;
3412     } elsif ($self->{nc} == -1) {
3413     !!!parse-error (type => 'no pic'); ## TODO: type
3414 wakaba 1.13 if ($self->{in_subset}) {
3415     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3416     } else {
3417     $self->{state} = DATA_STATE;
3418     $self->{s_kwd} = '';
3419     }
3420 wakaba 1.8 ## Reconsume.
3421     !!!emit ($self->{ct}); # pi
3422     redo A;
3423     } elsif ($self->{nc} == 0x003F) { # ?
3424     $self->{state} = PI_AFTER_STATE;
3425     !!!next-input-character;
3426     redo A;
3427     } else {
3428     ## XML5: typo ("tag name" -> "target")
3429     $self->{ct}->{target} .= chr $self->{nc}; # pi
3430     !!!next-input-character;
3431     redo A;
3432     }
3433     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3434     if ($is_space->{$self->{nc}}) {
3435     ## Stay in the state.
3436     !!!next-input-character;
3437     redo A;
3438     } else {
3439     $self->{state} = PI_DATA_STATE;
3440     ## Reprocess.
3441     redo A;
3442     }
3443     } elsif ($self->{state} == PI_DATA_STATE) {
3444     if ($self->{nc} == 0x003F) { # ?
3445     $self->{state} = PI_DATA_AFTER_STATE;
3446     !!!next-input-character;
3447     redo A;
3448     } elsif ($self->{nc} == -1) {
3449     !!!parse-error (type => 'no pic'); ## TODO: type
3450 wakaba 1.13 if ($self->{in_subset}) {
3451 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3452 wakaba 1.13 } else {
3453     $self->{state} = DATA_STATE;
3454     $self->{s_kwd} = '';
3455     }
3456 wakaba 1.8 ## Reprocess.
3457     !!!emit ($self->{ct}); # pi
3458     redo A;
3459     } else {
3460     $self->{ct}->{data} .= chr $self->{nc}; # pi
3461     $self->{read_until}->($self->{ct}->{data}, q[?],
3462     length $self->{ct}->{data});
3463     ## Stay in the state.
3464     !!!next-input-character;
3465     ## Reprocess.
3466     redo A;
3467     }
3468     } elsif ($self->{state} == PI_AFTER_STATE) {
3469 wakaba 1.14 ## XML5: Part of "Pi after state".
3470    
3471 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3472 wakaba 1.13 if ($self->{in_subset}) {
3473     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3474     } else {
3475     $self->{state} = DATA_STATE;
3476     $self->{s_kwd} = '';
3477     }
3478 wakaba 1.8 !!!next-input-character;
3479     !!!emit ($self->{ct}); # pi
3480     redo A;
3481     } elsif ($self->{nc} == 0x003F) { # ?
3482     !!!parse-error (type => 'no s after target', ## TODO: type
3483     line => $self->{line_prev},
3484     column => $self->{column_prev}); ## XML5: no error
3485     $self->{ct}->{data} .= '?';
3486     $self->{state} = PI_DATA_AFTER_STATE;
3487     !!!next-input-character;
3488     redo A;
3489     } else {
3490     !!!parse-error (type => 'no s after target', ## TODO: type
3491     line => $self->{line_prev},
3492     column => $self->{column_prev}
3493     + 1 * ($self->{nc} == -1)); ## XML5: no error
3494     $self->{ct}->{data} .= '?'; ## XML5: not appended
3495     $self->{state} = PI_DATA_STATE;
3496     ## Reprocess.
3497     redo A;
3498     }
3499     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3500 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3501    
3502 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3503 wakaba 1.13 if ($self->{in_subset}) {
3504     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3505     } else {
3506     $self->{state} = DATA_STATE;
3507     $self->{s_kwd} = '';
3508     }
3509 wakaba 1.8 !!!next-input-character;
3510     !!!emit ($self->{ct}); # pi
3511     redo A;
3512     } elsif ($self->{nc} == 0x003F) { # ?
3513     $self->{ct}->{data} .= '?';
3514     ## Stay in the state.
3515     !!!next-input-character;
3516     redo A;
3517     } else {
3518     $self->{ct}->{data} .= '?'; ## XML5: not appended
3519     $self->{state} = PI_DATA_STATE;
3520     ## Reprocess.
3521     redo A;
3522     }
3523 wakaba 1.12
3524     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3525     if ($self->{nc} == 0x003C) { # <
3526 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3527 wakaba 1.12 !!!next-input-character;
3528     redo A;
3529     } elsif ($self->{nc} == 0x0025) { # %
3530     ## XML5: Not defined yet.
3531    
3532     ## TODO:
3533     !!!next-input-character;
3534     redo A;
3535     } elsif ($self->{nc} == 0x005D) { # ]
3536 wakaba 1.13 delete $self->{in_subset};
3537 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3538     !!!next-input-character;
3539     redo A;
3540     } elsif ($is_space->{$self->{nc}}) {
3541     ## Stay in the state.
3542     !!!next-input-character;
3543     redo A;
3544     } elsif ($self->{nc} == -1) {
3545     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3546 wakaba 1.13 delete $self->{in_subset};
3547 wakaba 1.12 $self->{state} = DATA_STATE;
3548     $self->{s_kwd} = '';
3549     ## Reconsume.
3550 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3551 wakaba 1.12 redo A;
3552     } else {
3553     unless ($self->{internal_subset_tainted}) {
3554     ## XML5: No parse error.
3555     !!!parse-error (type => 'string in internal subset');
3556     $self->{internal_subset_tainted} = 1;
3557     }
3558     ## Stay in the state.
3559     !!!next-input-character;
3560     redo A;
3561     }
3562     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3563     if ($self->{nc} == 0x003E) { # >
3564     $self->{state} = DATA_STATE;
3565     $self->{s_kwd} = '';
3566     !!!next-input-character;
3567 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3568 wakaba 1.12 redo A;
3569     } elsif ($self->{nc} == -1) {
3570     !!!parse-error (type => 'unclosed DOCTYPE');
3571     $self->{state} = DATA_STATE;
3572     $self->{s_kwd} = '';
3573     ## Reconsume.
3574 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3575 wakaba 1.12 redo A;
3576     } else {
3577     ## XML5: No parse error and stay in the state.
3578     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3579    
3580 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3581     !!!next-input-character;
3582     redo A;
3583     }
3584     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3585     if ($self->{nc} == 0x003E) { # >
3586     $self->{state} = DATA_STATE;
3587     $self->{s_kwd} = '';
3588     !!!next-input-character;
3589     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3590     redo A;
3591     } elsif ($self->{nc} == -1) {
3592     $self->{state} = DATA_STATE;
3593     $self->{s_kwd} = '';
3594     ## Reconsume.
3595     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3596     redo A;
3597     } else {
3598     ## Stay in the state.
3599     !!!next-input-character;
3600     redo A;
3601     }
3602     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3603     if ($self->{nc} == 0x0021) { # !
3604 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3605 wakaba 1.13 !!!next-input-character;
3606     redo A;
3607     } elsif ($self->{nc} == 0x003F) { # ?
3608     $self->{state} = PI_STATE;
3609     !!!next-input-character;
3610     redo A;
3611     } elsif ($self->{nc} == -1) {
3612     !!!parse-error (type => 'bare stago');
3613     $self->{state} = DATA_STATE;
3614     $self->{s_kwd} = '';
3615     ## Reconsume.
3616     redo A;
3617     } else {
3618     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3619     line => $self->{line_prev},
3620     column => $self->{column_prev});
3621     $self->{state} = BOGUS_COMMENT_STATE;
3622     $self->{ct} = {type => COMMENT_TOKEN,
3623     data => '',
3624     }; ## NOTE: Will be discarded.
3625 wakaba 1.12 !!!next-input-character;
3626     redo A;
3627     }
3628 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3629     ## XML5: "DOCTYPE markup declaration state".
3630    
3631     if ($self->{nc} == 0x002D) { # -
3632     $self->{state} = MD_HYPHEN_STATE;
3633     !!!next-input-character;
3634     redo A;
3635     } elsif ($self->{nc} == 0x0045) { # E
3636     $self->{state} = MD_E_STATE;
3637     $self->{kwd} = chr $self->{nc};
3638     !!!next-input-character;
3639     redo A;
3640     } elsif ($self->{nc} == 0x0041) { # A
3641     $self->{state} = MD_ATTLIST_STATE;
3642     $self->{kwd} = chr $self->{nc};
3643     !!!next-input-character;
3644     redo A;
3645     } elsif ($self->{nc} == 0x004E) { # N
3646     $self->{state} = MD_NOTATION_STATE;
3647     $self->{kwd} = chr $self->{nc};
3648     !!!next-input-character;
3649     redo A;
3650     } else {
3651     #
3652     }
3653    
3654     ## XML5: No parse error.
3655     !!!parse-error (type => 'bogus comment',
3656     line => $self->{line_prev},
3657     column => $self->{column_prev} - 1);
3658     ## Reconsume.
3659     $self->{state} = BOGUS_COMMENT_STATE;
3660     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3661     redo A;
3662     } elsif ($self->{state} == MD_E_STATE) {
3663     if ($self->{nc} == 0x004E) { # N
3664     $self->{state} = MD_ENTITY_STATE;
3665     $self->{kwd} .= chr $self->{nc};
3666     !!!next-input-character;
3667     redo A;
3668     } elsif ($self->{nc} == 0x004C) { # L
3669     ## XML5: <!ELEMENT> not supported.
3670     $self->{state} = MD_ELEMENT_STATE;
3671     $self->{kwd} .= chr $self->{nc};
3672     !!!next-input-character;
3673     redo A;
3674     } else {
3675     ## XML5: No parse error.
3676     !!!parse-error (type => 'bogus comment',
3677     line => $self->{line_prev},
3678     column => $self->{column_prev} - 2
3679     + 1 * ($self->{nc} == -1));
3680     ## Reconsume.
3681     $self->{state} = BOGUS_COMMENT_STATE;
3682     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3683     redo A;
3684     }
3685     } elsif ($self->{state} == MD_ENTITY_STATE) {
3686     if ($self->{nc} == {
3687     'EN' => 0x0054, # T
3688     'ENT' => 0x0049, # I
3689     'ENTI' => 0x0054, # T
3690     }->{$self->{kwd}}) {
3691     ## Stay in the state.
3692     $self->{kwd} .= chr $self->{nc};
3693     !!!next-input-character;
3694     redo A;
3695     } elsif ($self->{kwd} eq 'ENTIT' and
3696     $self->{nc} == 0x0059) { # Y
3697     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3698     line => $self->{line_prev},
3699     column => $self->{column_prev} - 6};
3700     $self->{state} = DOCTYPE_MD_STATE;
3701     !!!next-input-character;
3702     redo A;
3703     } else {
3704     !!!parse-error (type => 'bogus comment',
3705     line => $self->{line_prev},
3706     column => $self->{column_prev} - 1
3707     - (length $self->{kwd})
3708     + 1 * ($self->{nc} == -1));
3709     $self->{state} = BOGUS_COMMENT_STATE;
3710     ## Reconsume.
3711     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3712     redo A;
3713     }
3714     } elsif ($self->{state} == MD_ELEMENT_STATE) { ## Matches the rest of the "ELEMENT" keyword; |kwd| holds the letters matched so far.
3715     if ($self->{nc} == {
3716     'EL' => 0x0045, # E
3717     'ELE' => 0x004D, # M
3718     'ELEM' => 0x0045, # E
3719     'ELEME' => 0x004E, # N
3720     }->{$self->{kwd}}) {
3721     ## Stay in the state.
3722     $self->{kwd} .= chr $self->{nc};
3723     !!!next-input-character;
3724     redo A;
3725     } elsif ($self->{kwd} eq 'ELEMEN' and
3726     $self->{nc} == 0x0054) { # T
3727     $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3728     line => $self->{line_prev},
3729     column => $self->{column_prev} - 1 - (length $self->{kwd})}; ## Column of the "<" of "<!ELEMENT" (same formula as the error branch below); the former fixed "- 6" pointed at "!".
3730     $self->{state} = DOCTYPE_MD_STATE;
3731     !!!next-input-character;
3732     redo A;
3733     } else {
3734     !!!parse-error (type => 'bogus comment',
3735     line => $self->{line_prev},
3736     column => $self->{column_prev} - 1
3737     - (length $self->{kwd})
3738     + 1 * ($self->{nc} == -1));
3739     $self->{state} = BOGUS_COMMENT_STATE;
3740     ## Reconsume.
3741     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3742     redo A;
3743     }
3744     } elsif ($self->{state} == MD_ATTLIST_STATE) { ## Matches the rest of the "ATTLIST" keyword; |kwd| holds the letters matched so far.
3745     if ($self->{nc} == {
3746     'A' => 0x0054, # T
3747     'AT' => 0x0054, # T
3748     'ATT' => 0x004C, # L
3749     'ATTL' => 0x0049, # I
3750     'ATTLI' => 0x0053, # S
3751     }->{$self->{kwd}}) {
3752     ## Stay in the state.
3753     $self->{kwd} .= chr $self->{nc};
3754     !!!next-input-character;
3755     redo A;
3756     } elsif ($self->{kwd} eq 'ATTLIS' and
3757     $self->{nc} == 0x0054) { # T
3758     $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3759 wakaba 1.15 attrdefs => [],
3760 wakaba 1.14 line => $self->{line_prev},
3761     column => $self->{column_prev} - 1 - (length $self->{kwd})}; ## Column of the "<" of "<!ATTLIST" (same formula as the error branch below); the former fixed "- 6" pointed at "!".
3762     $self->{state} = DOCTYPE_MD_STATE;
3763     !!!next-input-character;
3764     redo A;
3765     } else {
3766     !!!parse-error (type => 'bogus comment',
3767     line => $self->{line_prev},
3768     column => $self->{column_prev} - 1
3769     - (length $self->{kwd})
3770     + 1 * ($self->{nc} == -1));
3771     $self->{state} = BOGUS_COMMENT_STATE;
3772     ## Reconsume.
3773     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3774     redo A;
3775     }
3776     } elsif ($self->{state} == MD_NOTATION_STATE) { ## Matches the rest of the "NOTATION" keyword; |kwd| holds the letters matched so far.
3777     if ($self->{nc} == {
3778     'N' => 0x004F, # O
3779     'NO' => 0x0054, # T
3780     'NOT' => 0x0041, # A
3781     'NOTA' => 0x0054, # T
3782     'NOTAT' => 0x0049, # I
3783     'NOTATI' => 0x004F, # O
3784     }->{$self->{kwd}}) {
3785     ## Stay in the state.
3786     $self->{kwd} .= chr $self->{nc};
3787     !!!next-input-character;
3788     redo A;
3789     } elsif ($self->{kwd} eq 'NOTATIO' and
3790     $self->{nc} == 0x004E) { # N
3791     $self->{ct} = {type => NOTATION_TOKEN, name => '',
3792     line => $self->{line_prev},
3793     column => $self->{column_prev} - 1 - (length $self->{kwd})}; ## Column of the "<" of "<!NOTATION" (same formula as the error branch below); the former fixed "- 6" pointed at "N".
3794     $self->{state} = DOCTYPE_MD_STATE;
3795     !!!next-input-character;
3796     redo A;
3797     } else {
3798     !!!parse-error (type => 'bogus comment',
3799     line => $self->{line_prev},
3800     column => $self->{column_prev} - 1
3801     - (length $self->{kwd})
3802     + 1 * ($self->{nc} == -1));
3803     $self->{state} = BOGUS_COMMENT_STATE;
3804     ## Reconsume.
3805     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3806     redo A;
3807     }
3808     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3809     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3810     ## "DOCTYPE NOTATION state".
3811    
3812     if ($is_space->{$self->{nc}}) {
3813     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3814     $self->{state} = BEFORE_MD_NAME_STATE;
3815     !!!next-input-character;
3816     redo A;
3817     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3818     $self->{nc} == 0x0025) { # %
3819     ## XML5: Switch to the "DOCTYPE bogus comment state".
3820     !!!parse-error (type => 'no space before md name'); ## TODO: type
3821     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3822     !!!next-input-character;
3823     redo A;
3824     } elsif ($self->{nc} == -1) {
3825     !!!parse-error (type => 'unclosed md'); ## TODO: type
3826     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3827     ## Reconsume.
3828     redo A;
3829     } elsif ($self->{nc} == 0x003E) { # >
3830     ## XML5: Switch to the "DOCTYPE bogus comment state".
3831     !!!parse-error (type => 'no md name'); ## TODO: type
3832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3833     !!!next-input-character;
3834     redo A;
3835     } else {
3836     ## XML5: Switch to the "DOCTYPE bogus comment state".
3837     !!!parse-error (type => 'no space before md name'); ## TODO: type
3838     $self->{state} = BEFORE_MD_NAME_STATE;
3839     redo A;
3840     }
3841     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3842     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3843     ## before state", "DOCTYPE ATTLIST name before state".
3844    
3845     if ($is_space->{$self->{nc}}) {
3846     ## Stay in the state.
3847     !!!next-input-character;
3848     redo A;
3849     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3850     $self->{nc} == 0x0025) { # %
3851     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3852     !!!next-input-character;
3853     redo A;
3854     } elsif ($self->{nc} == 0x003E) { # >
3855     ## XML5: Same as "Anything else".
3856     !!!parse-error (type => 'no md name'); ## TODO: type
3857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858     !!!next-input-character;
3859     redo A;
3860     } elsif ($self->{nc} == -1) {
3861     !!!parse-error (type => 'unclosed md'); ## TODO: type
3862     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3863     ## Reconsume.
3864     redo A;
3865     } else {
3866     ## XML5: [ATTLIST] Not defined yet.
3867     $self->{ct}->{name} .= chr $self->{nc};
3868     $self->{state} = MD_NAME_STATE;
3869     !!!next-input-character;
3870     redo A;
3871     }
3872     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3873     if ($is_space->{$self->{nc}}) {
3874     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3875     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3876     $self->{state} = BEFORE_MD_NAME_STATE;
3877     !!!next-input-character;
3878     redo A;
3879     } elsif ($self->{nc} == 0x003E) { # >
3880     ## XML5: Same as "Anything else".
3881     !!!parse-error (type => 'no md name'); ## TODO: type
3882     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3883     !!!next-input-character;
3884     redo A;
3885     } elsif ($self->{nc} == -1) {
3886     !!!parse-error (type => 'unclosed md');
3887     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3888     ## Reconsume.
3889     redo A;
3890     } else {
3891     ## XML5: No parse error.
3892     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3893     $self->{state} = BOGUS_COMMENT_STATE;
3894     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3895     ## Reconsume.
3896     redo A;
3897     }
3898     } elsif ($self->{state} == MD_NAME_STATE) {
3899     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3900    
3901     if ($is_space->{$self->{nc}}) {
3902 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3903     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3904     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
3905     ## TODO: ...
3906     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3907     } else { # ENTITY/NOTATION
3908     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3909     }
3910 wakaba 1.14 !!!next-input-character;
3911     redo A;
3912     } elsif ($self->{nc} == 0x003E) { # >
3913     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3914     #
3915     } else {
3916 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
3917 wakaba 1.14 }
3918     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3919     !!!next-input-character;
3920     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3921     redo A;
3922     } elsif ($self->{nc} == -1) {
3923     ## XML5: [ATTLIST] No parse error.
3924     !!!parse-error (type => 'unclosed md');
3925     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3926     ## Reconsume.
3927     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3928     redo A;
3929     } else {
3930     ## XML5: [ATTLIST] Not defined yet.
3931     $self->{ct}->{name} .= chr $self->{nc};
3932     ## Stay in the state.
3933     !!!next-input-character;
3934     redo A;
3935     }
3936     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3937     if ($is_space->{$self->{nc}}) {
3938     ## Stay in the state.
3939     !!!next-input-character;
3940     redo A;
3941     } elsif ($self->{nc} == 0x003E) { # >
3942     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3943     !!!next-input-character;
3944     !!!emit ($self->{ct}); # ATTLIST
3945     redo A;
3946     } elsif ($self->{nc} == -1) {
3947     ## XML5: No parse error.
3948     !!!parse-error (type => 'unclosed md'); ## TODO: type
3949     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3950 wakaba 1.15 !!!emit ($self->{ct});
3951     redo A;
3952     } else {
3953     ## XML5: Not defined yet.
3954     $self->{ca} = {name => chr ($self->{nc}), # attrdef
3955     tokens => [],
3956     line => $self->{line}, column => $self->{column}};
3957     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
3958     !!!next-input-character;
3959     redo A;
3960     }
3961     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
3962     if ($is_space->{$self->{nc}}) {
3963     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
3964     !!!next-input-character;
3965     redo A;
3966     } elsif ($self->{nc} == 0x003E) { # >
3967     ## XML5: Same as "anything else".
3968     !!!parse-error (type => 'no attr type'); ## TODO: type
3969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3970     !!!next-input-character;
3971     !!!emit ($self->{ct}); # ATTLIST
3972     redo A;
3973     } elsif ($self->{nc} == 0x0028) { # (
3974     ## XML5: Same as "anything else".
3975     !!!parse-error (type => 'no space before paren'); ## TODO: type
3976     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
3977     !!!next-input-character;
3978     redo A;
3979     } elsif ($self->{nc} == -1) {
3980     ## XML5: No parse error.
3981     !!!parse-error (type => 'unclosed md'); ## TODO: type
3982     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3983     !!!next-input-character;
3984     !!!emit ($self->{ct}); # ATTLIST
3985     redo A;
3986     } else {
3987     ## XML5: Not defined yet.
3988     $self->{ca}->{name} .= chr $self->{nc};
3989     ## Stay in the state.
3990     !!!next-input-character;
3991     redo A;
3992     }
3993     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
3994     if ($is_space->{$self->{nc}}) {
3995     ## Stay in the state.
3996     !!!next-input-character;
3997     redo A;
3998     } elsif ($self->{nc} == 0x003E) { # >
3999     ## XML5: Same as "anything else".
4000     !!!parse-error (type => 'no attr type'); ## TODO: type
4001     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4002     !!!next-input-character;
4003     !!!emit ($self->{ct}); # ATTLIST
4004     redo A;
4005     } elsif ($self->{nc} == 0x0028) { # (
4006     ## XML5: Same as "anything else".
4007     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4008     !!!next-input-character;
4009     redo A;
4010     } elsif ($self->{nc} == -1) {
4011     ## XML5: No parse error.
4012     !!!parse-error (type => 'unclosed md'); ## TODO: type
4013     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4014     !!!next-input-character;
4015     !!!emit ($self->{ct});
4016 wakaba 1.14 redo A;
4017     } else {
4018     ## XML5: Not defined yet.
4019 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4020     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4021     !!!next-input-character;
4022     redo A;
4023     }
4024     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4025     if ($is_space->{$self->{nc}}) {
4026     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4027     !!!next-input-character;
4028     redo A;
4029     } elsif ($self->{nc} == 0x0023) { # #
4030     ## XML5: Same as "anything else".
4031     !!!parse-error (type => 'no space before default value'); ## TODO: type
4032     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4033     !!!next-input-character;
4034     redo A;
4035     } elsif ($self->{nc} == 0x0022) { # "
4036     ## XML5: Same as "anything else".
4037     !!!parse-error (type => 'no space before default value'); ## TODO: type
4038     $self->{ca}->{value} = '';
4039     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4040     !!!next-input-character;
4041     redo A;
4042     } elsif ($self->{nc} == 0x0027) { # '
4043     ## XML5: Same as "anything else".
4044     !!!parse-error (type => 'no space before default value'); ## TODO: type
4045     $self->{ca}->{value} = '';
4046     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4047     !!!next-input-character;
4048     redo A;
4049     } elsif ($self->{nc} == 0x003E) { # >
4050     ## XML5: Same as "anything else".
4051     !!!parse-error (type => 'no attr default'); ## TODO: type
4052     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4053     !!!next-input-character;
4054     !!!emit ($self->{ct}); # ATTLIST
4055     redo A;
4056     } elsif ($self->{nc} == 0x0028) { # (
4057     ## XML5: Same as "anything else".
4058     !!!parse-error (type => 'no space before paren'); ## TODO: type
4059     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4060     !!!next-input-character;
4061     redo A;
4062     } elsif ($self->{nc} == -1) {
4063     ## XML5: No parse error.
4064     !!!parse-error (type => 'unclosed md'); ## TODO: type
4065     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4066     !!!next-input-character;
4067     !!!emit ($self->{ct});
4068     redo A;
4069     } else {
4070     ## XML5: Not defined yet.
4071     $self->{ca}->{type} .= chr $self->{nc};
4072     ## Stay in the state.
4073     !!!next-input-character;
4074     redo A;
4075     }
4076     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4077     if ($is_space->{$self->{nc}}) {
4078     ## Stay in the state.
4079     !!!next-input-character;
4080     redo A;
4081     } elsif ($self->{nc} == 0x0028) { # (
4082     ## XML5: Same as "anything else".
4083     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4084     !!!next-input-character;
4085     redo A;
4086     } elsif ($self->{nc} == 0x0023) { # #
4087     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4088     !!!next-input-character;
4089     redo A;
4090     } elsif ($self->{nc} == 0x0022) { # "
4091     ## XML5: Same as "anything else".
4092     $self->{ca}->{value} = '';
4093     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4094     !!!next-input-character;
4095     redo A;
4096     } elsif ($self->{nc} == 0x0027) { # '
4097     ## XML5: Same as "anything else".
4098     $self->{ca}->{value} = '';
4099     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4100     !!!next-input-character;
4101     redo A;
4102     } elsif ($self->{nc} == 0x003E) { # >
4103     ## XML5: Same as "anything else".
4104     !!!parse-error (type => 'no attr default'); ## TODO: type
4105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4106     !!!next-input-character;
4107     !!!emit ($self->{ct}); # ATTLIST
4108     redo A;
4109     } elsif ($self->{nc} == -1) {
4110     ## XML5: No parse error.
4111     !!!parse-error (type => 'unclosed md'); ## TODO: type
4112     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4113     !!!next-input-character;
4114     !!!emit ($self->{ct});
4115     redo A;
4116     } else {
4117     ## XML5: Switch to the "DOCTYPE bogus comment state".
4118     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4119     $self->{ca}->{value} = '';
4120     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4121     ## Reconsume.
4122     redo A;
4123     }
4124     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4125     if ($is_space->{$self->{nc}}) {
4126     ## Stay in the state.
4127     !!!next-input-character;
4128     redo A;
4129     } elsif ($self->{nc} == 0x007C) { # |
4130     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4131     ## Stay in the state.
4132     !!!next-input-character;
4133     redo A;
4134     } elsif ($self->{nc} == 0x0029) { # )
4135     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4136     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4137     !!!next-input-character;
4138     redo A;
4139     } elsif ($self->{nc} == 0x003E) { # >
4140     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4142     !!!next-input-character;
4143     !!!emit ($self->{ct}); # ATTLIST
4144     redo A;
4145     } elsif ($self->{nc} == -1) {
4146     ## XML5: No parse error.
4147     !!!parse-error (type => 'unclosed md'); ## TODO: type
4148     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4149     !!!next-input-character;
4150     !!!emit ($self->{ct});
4151     redo A;
4152     } else {
4153     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4154     $self->{state} = ALLOWED_TOKEN_STATE;
4155     !!!next-input-character;
4156     redo A;
4157     }
4158     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4159     if ($is_space->{$self->{nc}}) {
4160     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4161     !!!next-input-character;
4162     redo A;
4163     } elsif ($self->{nc} == 0x007C) { # |
4164     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4165     !!!next-input-character;
4166     redo A;
4167     } elsif ($self->{nc} == 0x0029) { # )
4168     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4169     !!!next-input-character;
4170     redo A;
4171     } elsif ($self->{nc} == 0x003E) { # >
4172     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4173     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4174     !!!next-input-character;
4175     !!!emit ($self->{ct}); # ATTLIST
4176     redo A;
4177     } elsif ($self->{nc} == -1) {
4178     ## XML5: No parse error.
4179     !!!parse-error (type => 'unclosed md'); ## TODO: type
4180     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4181     !!!next-input-character;
4182     !!!emit ($self->{ct});
4183     redo A;
4184     } else {
4185     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4186     ## Stay in the state.
4187     !!!next-input-character;
4188     redo A;
4189     }
4190     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4191     if ($is_space->{$self->{nc}}) {
4192     ## Stay in the state.
4193     !!!next-input-character;
4194     redo A;
4195     } elsif ($self->{nc} == 0x007C) { # |
4196     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4197     !!!next-input-character;
4198     redo A;
4199     } elsif ($self->{nc} == 0x0029) { # )
4200     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4201     !!!next-input-character;
4202     redo A;
4203     } elsif ($self->{nc} == 0x003E) { # >
4204     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4205     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4206     !!!next-input-character;
4207     !!!emit ($self->{ct}); # ATTLIST
4208     redo A;
4209     } elsif ($self->{nc} == -1) {
4210     ## XML5: No parse error.
4211     !!!parse-error (type => 'unclosed md'); ## TODO: type
4212     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4213     !!!next-input-character;
4214     !!!emit ($self->{ct});
4215     redo A;
4216     } else {
4217     !!!parse-error (type => 'space in allowed token', ## TODO: type
4218     line => $self->{line_prev},
4219     column => $self->{column_prev});
4220     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4221     $self->{state} = ALLOWED_TOKEN_STATE;
4222     !!!next-input-character;
4223     redo A;
4224     }
4225     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4226     if ($is_space->{$self->{nc}}) {
4227     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4228     !!!next-input-character;
4229     redo A;
4230     } elsif ($self->{nc} == 0x0023) { # #
4231     !!!parse-error (type => 'no space before default value'); ## TODO: type
4232     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4233     !!!next-input-character;
4234     redo A;
4235     } elsif ($self->{nc} == 0x0022) { # "
4236     !!!parse-error (type => 'no space before default value'); ## TODO: type
4237     $self->{ca}->{value} = '';
4238     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4239     !!!next-input-character;
4240     redo A;
4241     } elsif ($self->{nc} == 0x0027) { # '
4242     !!!parse-error (type => 'no space before default value'); ## TODO: type
4243     $self->{ca}->{value} = '';
4244     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4245     !!!next-input-character;
4246     redo A;
4247     } elsif ($self->{nc} == 0x003E) { # >
4248     !!!parse-error (type => 'no attr default'); ## TODO: type
4249     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4250     !!!next-input-character;
4251     !!!emit ($self->{ct}); # ATTLIST
4252     redo A;
4253     } elsif ($self->{nc} == -1) {
4254     !!!parse-error (type => 'unclosed md'); ## TODO: type
4255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4256     !!!next-input-character;
4257     !!!emit ($self->{ct});
4258     redo A;
4259     } else {
4260     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4261     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4262     ## Reconsume.
4263     redo A;
4264     }
4265     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4266     if ($is_space->{$self->{nc}}) {
4267     ## Stay in the state.
4268     !!!next-input-character;
4269     redo A;
4270     } elsif ($self->{nc} == 0x0023) { # #
4271     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4272     !!!next-input-character;
4273     redo A;
4274     } elsif ($self->{nc} == 0x0022) { # "
4275     $self->{ca}->{value} = '';
4276     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4277     !!!next-input-character;
4278     redo A;
4279     } elsif ($self->{nc} == 0x0027) { # '
4280     $self->{ca}->{value} = '';
4281     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4282     !!!next-input-character;
4283     redo A;
4284     } elsif ($self->{nc} == 0x003E) { # >
4285     !!!parse-error (type => 'no attr default'); ## TODO: type
4286     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4287     !!!next-input-character;
4288     !!!emit ($self->{ct}); # ATTLIST
4289     redo A;
4290     } elsif ($self->{nc} == -1) {
4291     !!!parse-error (type => 'unclosed md'); ## TODO: type
4292     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4293     !!!next-input-character;
4294     !!!emit ($self->{ct});
4295     redo A;
4296     } else {
4297     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4298     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4299     ## Reconsume.
4300     redo A;
4301     }
4302     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4303     if ($is_space->{$self->{nc}}) {
4304     ## XML5: No parse error.
4305     !!!parse-error (type => 'no default type'); ## TODO: type
4306 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4307 wakaba 1.14 ## Reconsume.
4308     redo A;
4309 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4310     ## XML5: Same as "anything else".
4311     $self->{ca}->{value} = '';
4312     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4313     !!!next-input-character;
4314     redo A;
4315     } elsif ($self->{nc} == 0x0027) { # '
4316     ## XML5: Same as "anything else".
4317     $self->{ca}->{value} = '';
4318     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4319     !!!next-input-character;
4320     redo A;
4321     } elsif ($self->{nc} == 0x003E) { # >
4322     ## XML5: Same as "anything else".
4323     !!!parse-error (type => 'no attr default'); ## TODO: type
4324     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4325     !!!next-input-character;
4326     !!!emit ($self->{ct}); # ATTLIST
4327     redo A;
4328     } elsif ($self->{nc} == -1) {
4329     ## XML5: No parse error.
4330     !!!parse-error (type => 'unclosed md'); ## TODO: type
4331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4332     !!!next-input-character;
4333     !!!emit ($self->{ct});
4334     redo A;
4335     } else {
4336     $self->{ca}->{default} = chr $self->{nc};
4337     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4338     !!!next-input-character;
4339     redo A;
4340 wakaba 1.14 }
4341 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4342     if ($is_space->{$self->{nc}}) {
4343     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4344     !!!next-input-character;
4345     redo A;
4346     } elsif ($self->{nc} == 0x0022) { # "
4347     ## XML5: Same as "anything else".
4348     !!!parse-error (type => 'no space before default value'); ## TODO: type
4349     $self->{ca}->{value} = '';
4350     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4351     !!!next-input-character;
4352     redo A;
4353     } elsif ($self->{nc} == 0x0027) { # '
4354     ## XML5: Same as "anything else".
4355     !!!parse-error (type => 'no space before default value'); ## TODO: type
4356     $self->{ca}->{value} = '';
4357     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4358     !!!next-input-character;
4359     redo A;
4360     } elsif ($self->{nc} == 0x003E) { # >
4361     ## XML5: Same as "anything else".
4362     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364     !!!next-input-character;
4365     !!!emit ($self->{ct}); # ATTLIST
4366     redo A;
4367     } elsif ($self->{nc} == -1) {
4368     ## XML5: No parse error.
4369     !!!parse-error (type => 'unclosed md'); ## TODO: type
4370     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4371     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4372     !!!next-input-character;
4373     !!!emit ($self->{ct});
4374     redo A;
4375     } else {
4376     $self->{ca}->{default} .= chr $self->{nc};
4377     ## Stay in the state.
4378     !!!next-input-character;
4379     redo A;
4380     }
4381     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4382     if ($is_space->{$self->{nc}}) {
4383     ## Stay in the state.
4384     !!!next-input-character;
4385     redo A;
4386     } elsif ($self->{nc} == 0x0022) { # "
4387     $self->{ca}->{value} = '';
4388     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4389     !!!next-input-character;
4390     redo A;
4391     } elsif ($self->{nc} == 0x0027) { # '
4392     $self->{ca}->{value} = '';
4393     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4394     !!!next-input-character;
4395     redo A;
4396     } elsif ($self->{nc} == 0x003E) { # >
4397     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4399     !!!next-input-character;
4400     !!!emit ($self->{ct}); # ATTLIST
4401     redo A;
4402     } elsif ($self->{nc} == -1) {
4403     ## XML5: No parse error.
4404     !!!parse-error (type => 'unclosed md'); ## TODO: type
4405     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4406     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4407     !!!next-input-character;
4408     !!!emit ($self->{ct});
4409     redo A;
4410     } else {
4411     ## XML5: Not defined yet.
4412     if ($self->{ca}->{default} eq 'FIXED') {
4413     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4414     } else {
4415     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4416     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4417     }
4418     ## Reconsume.
4419     redo A;
4420     }
4421     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4422     if ($is_space->{$self->{nc}} or
4423     $self->{nc} == -1 or
4424     $self->{nc} == 0x003E) { # >
4425     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4426     ## Reconsume.
4427     redo A;
4428     } else {
4429     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4430     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4431     ## Reconsume.
4432     redo A;
4433 wakaba 1.16 }
4434    
4435     } elsif ($self->{state} == BOGUS_MD_STATE) {
4436     if ($self->{nc} == 0x003E) { # >
4437     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4438     !!!next-input-character;
4439     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4440     redo A;
4441     } elsif ($self->{nc} == -1) {
4442     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4443     ## Reconsume.
4444     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4445     redo A;
4446     } else {
4447     ## Stay in the state.
4448     !!!next-input-character;
4449     redo A;
4450     }
4451 wakaba 1.1 } else {
4452     die "$0: $self->{state}: Unknown state";
4453     }
4454     } # A
4455    
4456     die "$0: _get_next_token: unexpected case";
4457     } # _get_next_token
4458    
4459     1;
4460 wakaba 1.16 ## $Date: 2008/10/18 08:05:29 $
4461 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24