/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (hide annotations) (download) (as text)
Sun Oct 19 07:19:00 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.18: +101 -6 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	19 Oct 2008 07:18:24 -0000
	* XML-Parser.t: Typo fixed.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 07:18:52 -0000
	* entities-1.dat, entities-2.dat: EntityValue tests added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 07:17:36 -0000
	* NanoDOM.pm (Entity->new): Initialize ->child_nodes as an empty
	array.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 07:18:01 -0000
	* Tokenizer.pm.src: Support for EntityValue.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.19 our $VERSION=do{my @r=(q$Revision: 1.18 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
      ## Exporter setup, done at compile time so that the |BEGIN| block in
      ## package |Whatpm::HTML| further down this file can call
      ## |Whatpm::HTML::Tokenizer->import (':token')| before the rest of the
      ## file is compiled.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
      ## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
      ## The |:token| tag lists the same fourteen names as |@EXPORT_OK|; the
      ## two lists have to be kept in sync by hand when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
      ## Numeric identifiers stored in the |->{type}| field of a token (see
      ## the "A token has:" comment further down). Each one is a sub with an
      ## empty prototype that returns a distinct integer from 1 to 14; these
      ## are the names exported through the |:token| tag.
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has an "empty tag token". In this implementation, it
66     ## is represented as a start tag token with the |$self->{self_closing}|
67     ## flag set to true.
68    
69     ## XML5: XML5 has a "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose |$token->{tag_name}| is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
      ## The three |CM_*| subs are single-bit masks; the four
      ## |*_CONTENT_MODEL| subs are the combinations stored in
      ## |$self->{content_model}|. The tokenizer tests individual bits with
      ## |&| (e.g. |$self->{content_model} & CM_ENTITY|) and compares the
      ## whole value with |==| where an exact model is required.
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no bit set: neither & nor < is markup
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
      ## Values held in |$self->{state}|. |_get_next_token| dispatches by
      ## comparing that field against these constants with |==|.
      ## The numbers 1 and 12 are unassigned: the subs that carried them
      ## (|ENTITY_DATA_STATE| and |ENTITY_IN_ATTRIBUTE_VALUE_STATE|) are
      ## commented out below, and the remaining numbers were not renumbered.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185     sub AFTER_NOTATION_NAME_STATE () { 90 }
186 wakaba 1.19 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188     sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189     sub BOGUS_MD_STATE () { 94 }
190 wakaba 1.8
191 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
192     ## list and descriptions)
      ## NOTE(review): the two literals below are written differently but
      ## both evaluate to the same integer (a 1 followed by eleven 0 bits,
      ## i.e. 2048). Confirm against the definitions in Whatpm::HTML that
      ## sharing a bit between the two constants is intended.
193    
194     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
195     sub FOREIGN_EL () { 0b1_00000000000 }
196    
197     ## Character reference mappings
      ## |$charref_map| maps a code point number to the code point that is
      ## to be used in its place. It is not referenced in the part of the
      ## file shown before |_get_next_token|; presumably it is consulted by
      ## the numeric character reference states -- TODO confirm.
      ##
      ## The literal part maps CR (0x0D) to LF and gives a replacement for
      ## every code point from 0x80 to 0x9F (the replacements look like the
      ## Windows-1252 assignments, with U+FFFD where that encoding has no
      ## character -- verify against the spec table).
198    
199     my $charref_map = {
200     0x0D => 0x000A,
201     0x80 => 0x20AC,
202     0x81 => 0xFFFD,
203     0x82 => 0x201A,
204     0x83 => 0x0192,
205     0x84 => 0x201E,
206     0x85 => 0x2026,
207     0x86 => 0x2020,
208     0x87 => 0x2021,
209     0x88 => 0x02C6,
210     0x89 => 0x2030,
211     0x8A => 0x0160,
212     0x8B => 0x2039,
213     0x8C => 0x0152,
214     0x8D => 0xFFFD,
215     0x8E => 0x017D,
216     0x8F => 0xFFFD,
217     0x90 => 0xFFFD,
218     0x91 => 0x2018,
219     0x92 => 0x2019,
220     0x93 => 0x201C,
221     0x94 => 0x201D,
222     0x95 => 0x2022,
223     0x96 => 0x2013,
224     0x97 => 0x2014,
225     0x98 => 0x02DC,
226     0x99 => 0x2122,
227     0x9A => 0x0161,
228     0x9B => 0x203A,
229     0x9C => 0x0153,
230     0x9D => 0xFFFD,
231     0x9E => 0x017E,
232     0x9F => 0x0178,
233     }; # $charref_map
      ## Every code point in the list below is additionally mapped to U+FFFD:
      ## C0 controls other than HT, LF, FF and CR, then DEL, the surrogate
      ## range, and the listed noncharacters.
      ## The "ISSUE" marker refers to the end of the 0xFDD0.. range: Unicode
      ## defines the noncharacter block as U+FDD0..U+FDEF, but the range
      ## here stops at 0xFDDF, so 0xFDE0..0xFDEF are left unmapped.
234     $charref_map->{$_} = 0xFFFD
235     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
236     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
237     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
238     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
239     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
240     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
241     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
242    
243     ## Implementations MUST act as if they used the state machine in the spec.
244    
245     sub _initialize_tokenizer ($) {
      ## Puts the tokenizer fields of the object given as the only argument
      ## into their start-of-input values: data state, PCDATA content model,
      ## no current token or attribute, no remembered start tag name, empty
      ## character buffer and empty queue of pending tokens.
246     my $self = shift;
247    
248     ## NOTE: Fields set by |new| constructor:
249     #$self->{level}
250     #$self->{set_nc}
251     #$self->{parse_error}
252 wakaba 1.3 #$self->{is_xml} (if XML)
253 wakaba 1.1
254     $self->{state} = DATA_STATE; # MUST
255 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
256     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
257 wakaba 1.1 #$self->{entity__value}; # initialized when used
258     #$self->{entity__match}; # initialized when used
259     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
260     undef $self->{ct}; # current token
261     undef $self->{ca}; # current attribute
262     undef $self->{last_stag_name}; # last emitted start tag name
263     #$self->{prev_state}; # initialized when used
264     delete $self->{self_closing};
265     $self->{char_buffer} = '';
266     $self->{char_buffer_pos} = 0;
      ## -1 is the value that |_get_next_token| treats as end of input.
267     $self->{nc} = -1; # next input character
268     #$self->{next_nc}
      ## The |!!!| line is not Perl; this file is a |.pm.src|, so it is
      ## presumably a macro expanded when the |.pm| is generated. From its
      ## name it loads the first input character into |->{nc}| -- TODO
      ## confirm against the macro definition.
269     !!!next-input-character;
      ## Queue of already-built tokens; |_get_next_token| shifts from it
      ## before running the state machine.
270     $self->{token} = [];
      ## NOTE(review): |->{escape}| is deliberately left untouched here (the
      ## line below is only a reminder). Confirm that a tokenizer object is
      ## never re-initialized while that flag is still set.
271     # $self->{escape}
272     } # _initialize_tokenizer
273    
274     ## A token has:
275     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
276 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
277 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
278     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
279 wakaba 1.11 ## ->{target} (PI_TOKEN)
280 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
281     ## ->{sysid} (DOCTYPE_TOKEN)
282     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
283     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
284     ## ->{name}
285     ## ->{value}
286     ## ->{has_reference} == 1 or 0
287 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
288     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
289 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
290 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
291 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
292    
293 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
294     ## A token's own |->{self_closing}| is used to save the value of
295     ## |$self->{self_closing}| while the token is pushed back to the stack.
296    
297     ## An emitted token MUST immediately be handled by the tree construction stage.
298    
299     ## Before each step, UA MAY check to see if either one of the scripts in
300     ## "list of scripts that will execute as soon as possible" or the first
301     ## script in the "list of scripts that will execute asynchronously",
302     ## has completed loading. If one has, then it MUST be executed
303     ## and removed from the list.
304    
305     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
306     ## (This requirement was dropped from HTML5 spec, unfortunately.)
307    
308     my $is_space = {
      ## Lookup table keyed by code point number; a true value means the
      ## tokenizer treats the character as a space character. It is used as
      ## |$is_space->{$self->{nc}}|, so any code point without an entry
      ## (including the end-of-input value -1) yields undef, i.e. false.
      ## The VT and CR entries are commented out, so neither is a space
      ## here; CR is presumably normalized away before the tokenizer sees
      ## input -- TODO confirm.
309     0x0009 => 1, # CHARACTER TABULATION (HT)
310     0x000A => 1, # LINE FEED (LF)
311     #0x000B => 0, # LINE TABULATION (VT)
312 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
313 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
314     0x0020 => 1, # SPACE (SP)
315     };
316    
317     sub _get_next_token ($) {
318     my $self = shift;
319    
320     if ($self->{self_closing}) {
321     !!!parse-error (type => 'nestc', token => $self->{ct});
322     ## NOTE: The |self_closing| flag is only set by start tag token.
323     ## In addition, when a start tag token is emitted, it is always set to
324     ## |ct|.
325     delete $self->{self_closing};
326     }
327    
328     if (@{$self->{token}}) {
329     $self->{self_closing} = $self->{token}->[0]->{self_closing};
330     return shift @{$self->{token}};
331     }
332    
333     A: {
334     if ($self->{state} == PCDATA_STATE) {
335     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
336    
337     if ($self->{nc} == 0x0026) { # &
338     !!!cp (0.1);
339     ## NOTE: In the spec, the tokenizer is switched to the
340     ## "entity data state". In this implementation, the tokenizer
341     ## is switched to the |ENTITY_STATE|, which is an implementation
342     ## of the "consume a character reference" algorithm.
343     $self->{entity_add} = -1;
344     $self->{prev_state} = DATA_STATE;
345     $self->{state} = ENTITY_STATE;
346     !!!next-input-character;
347     redo A;
348     } elsif ($self->{nc} == 0x003C) { # <
349     !!!cp (0.2);
350     $self->{state} = TAG_OPEN_STATE;
351     !!!next-input-character;
352     redo A;
353     } elsif ($self->{nc} == -1) {
354     !!!cp (0.3);
355     !!!emit ({type => END_OF_FILE_TOKEN,
356     line => $self->{line}, column => $self->{column}});
357     last A; ## TODO: ok?
358     } else {
359     !!!cp (0.4);
360     #
361     }
362    
363     # Anything else
364     my $token = {type => CHARACTER_TOKEN,
365     data => chr $self->{nc},
366     line => $self->{line}, column => $self->{column},
367     };
368     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
369    
370     ## Stay in the state.
371     !!!next-input-character;
372     !!!emit ($token);
373     redo A;
374     } elsif ($self->{state} == DATA_STATE) {
375     $self->{s_kwd} = '' unless defined $self->{s_kwd};
376     if ($self->{nc} == 0x0026) { # &
377     $self->{s_kwd} = '';
378     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
379     not $self->{escape}) {
380     !!!cp (1);
381     ## NOTE: In the spec, the tokenizer is switched to the
382     ## "entity data state". In this implementation, the tokenizer
383     ## is switched to the |ENTITY_STATE|, which is an implementation
384     ## of the "consume a character reference" algorithm.
385     $self->{entity_add} = -1;
386     $self->{prev_state} = DATA_STATE;
387     $self->{state} = ENTITY_STATE;
388     !!!next-input-character;
389     redo A;
390     } else {
391     !!!cp (2);
392     #
393     }
394     } elsif ($self->{nc} == 0x002D) { # -
395     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
396 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
397 wakaba 1.1 !!!cp (3);
398     $self->{escape} = 1; # unless $self->{escape};
399     $self->{s_kwd} = '--';
400     #
401 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
402 wakaba 1.1 !!!cp (4);
403     $self->{s_kwd} = '--';
404     #
405 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
406     !!!cp (4.1);
407     $self->{s_kwd} .= '-';
408     #
409 wakaba 1.1 } else {
410     !!!cp (5);
411 wakaba 1.5 $self->{s_kwd} = '-';
412 wakaba 1.1 #
413     }
414     }
415    
416     #
417     } elsif ($self->{nc} == 0x0021) { # !
418     if (length $self->{s_kwd}) {
419     !!!cp (5.1);
420     $self->{s_kwd} .= '!';
421     #
422     } else {
423     !!!cp (5.2);
424     #$self->{s_kwd} = '';
425     #
426     }
427     #
428     } elsif ($self->{nc} == 0x003C) { # <
429     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
430     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
431     not $self->{escape})) {
432     !!!cp (6);
433     $self->{state} = TAG_OPEN_STATE;
434     !!!next-input-character;
435     redo A;
436     } else {
437     !!!cp (7);
438     $self->{s_kwd} = '';
439     #
440     }
441     } elsif ($self->{nc} == 0x003E) { # >
442     if ($self->{escape} and
443     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
444     if ($self->{s_kwd} eq '--') {
445     !!!cp (8);
446     delete $self->{escape};
447 wakaba 1.5 #
448 wakaba 1.1 } else {
449     !!!cp (9);
450 wakaba 1.5 #
451 wakaba 1.1 }
452 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
453     !!!cp (9.1);
454     !!!parse-error (type => 'unmatched mse', ## TODO: type
455     line => $self->{line_prev},
456     column => $self->{column_prev} - 1);
457     #
458 wakaba 1.1 } else {
459     !!!cp (10);
460 wakaba 1.5 #
461 wakaba 1.1 }
462    
463     $self->{s_kwd} = '';
464     #
465 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
466     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
467     !!!cp (10.1);
468     $self->{s_kwd} .= ']';
469     } elsif ($self->{s_kwd} eq ']]') {
470     !!!cp (10.2);
471     #
472     } else {
473     !!!cp (10.3);
474     $self->{s_kwd} = '';
475     }
476     #
477 wakaba 1.1 } elsif ($self->{nc} == -1) {
478     !!!cp (11);
479     $self->{s_kwd} = '';
480     !!!emit ({type => END_OF_FILE_TOKEN,
481     line => $self->{line}, column => $self->{column}});
482     last A; ## TODO: ok?
483     } else {
484     !!!cp (12);
485     $self->{s_kwd} = '';
486     #
487     }
488    
489     # Anything else
490     my $token = {type => CHARACTER_TOKEN,
491     data => chr $self->{nc},
492     line => $self->{line}, column => $self->{column},
493     };
494 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
495 wakaba 1.1 length $token->{data})) {
496     $self->{s_kwd} = '';
497     }
498    
499     ## Stay in the data state.
500 wakaba 1.5 if (not $self->{is_xml} and
501     $self->{content_model} == PCDATA_CONTENT_MODEL) {
502 wakaba 1.1 !!!cp (13);
503     $self->{state} = PCDATA_STATE;
504     } else {
505     !!!cp (14);
506     ## Stay in the state.
507     }
508     !!!next-input-character;
509     !!!emit ($token);
510     redo A;
511     } elsif ($self->{state} == TAG_OPEN_STATE) {
512 wakaba 1.10 ## XML5: "tag state".
513    
514 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
515     if ($self->{nc} == 0x002F) { # /
516     !!!cp (15);
517     !!!next-input-character;
518     $self->{state} = CLOSE_TAG_OPEN_STATE;
519     redo A;
520     } elsif ($self->{nc} == 0x0021) { # !
521     !!!cp (15.1);
522 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
523 wakaba 1.1 #
524     } else {
525     !!!cp (16);
526 wakaba 1.12 $self->{s_kwd} = '';
527 wakaba 1.1 #
528     }
529    
530     ## reconsume
531     $self->{state} = DATA_STATE;
532     !!!emit ({type => CHARACTER_TOKEN, data => '<',
533     line => $self->{line_prev},
534     column => $self->{column_prev},
535     });
536     redo A;
537     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
538     if ($self->{nc} == 0x0021) { # !
539     !!!cp (17);
540     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
541     !!!next-input-character;
542     redo A;
543     } elsif ($self->{nc} == 0x002F) { # /
544     !!!cp (18);
545     $self->{state} = CLOSE_TAG_OPEN_STATE;
546     !!!next-input-character;
547     redo A;
548     } elsif (0x0041 <= $self->{nc} and
549     $self->{nc} <= 0x005A) { # A..Z
550     !!!cp (19);
551     $self->{ct}
552     = {type => START_TAG_TOKEN,
553 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
554 wakaba 1.1 line => $self->{line_prev},
555     column => $self->{column_prev}};
556     $self->{state} = TAG_NAME_STATE;
557     !!!next-input-character;
558     redo A;
559     } elsif (0x0061 <= $self->{nc} and
560     $self->{nc} <= 0x007A) { # a..z
561     !!!cp (20);
562     $self->{ct} = {type => START_TAG_TOKEN,
563     tag_name => chr ($self->{nc}),
564     line => $self->{line_prev},
565     column => $self->{column_prev}};
566     $self->{state} = TAG_NAME_STATE;
567     !!!next-input-character;
568     redo A;
569     } elsif ($self->{nc} == 0x003E) { # >
570     !!!cp (21);
571     !!!parse-error (type => 'empty start tag',
572     line => $self->{line_prev},
573     column => $self->{column_prev});
574     $self->{state} = DATA_STATE;
575 wakaba 1.5 $self->{s_kwd} = '';
576 wakaba 1.1 !!!next-input-character;
577    
578     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
579     line => $self->{line_prev},
580     column => $self->{column_prev},
581     });
582    
583     redo A;
584     } elsif ($self->{nc} == 0x003F) { # ?
585 wakaba 1.8 if ($self->{is_xml}) {
586     !!!cp (22.1);
587     $self->{state} = PI_STATE;
588     !!!next-input-character;
589     redo A;
590     } else {
591     !!!cp (22);
592     !!!parse-error (type => 'pio',
593     line => $self->{line_prev},
594     column => $self->{column_prev});
595     $self->{state} = BOGUS_COMMENT_STATE;
596     $self->{ct} = {type => COMMENT_TOKEN, data => '',
597     line => $self->{line_prev},
598     column => $self->{column_prev},
599     };
600     ## $self->{nc} is intentionally left as is
601     redo A;
602     }
603 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
604 wakaba 1.1 !!!cp (23);
605     !!!parse-error (type => 'bare stago',
606     line => $self->{line_prev},
607     column => $self->{column_prev});
608     $self->{state} = DATA_STATE;
609 wakaba 1.5 $self->{s_kwd} = '';
610 wakaba 1.1 ## reconsume
611    
612     !!!emit ({type => CHARACTER_TOKEN, data => '<',
613     line => $self->{line_prev},
614     column => $self->{column_prev},
615     });
616    
617     redo A;
618 wakaba 1.9 } else {
619     ## XML5: "<:" is a parse error.
620     !!!cp (23.1);
621     $self->{ct} = {type => START_TAG_TOKEN,
622     tag_name => chr ($self->{nc}),
623     line => $self->{line_prev},
624     column => $self->{column_prev}};
625     $self->{state} = TAG_NAME_STATE;
626     !!!next-input-character;
627     redo A;
628 wakaba 1.1 }
629     } else {
630     die "$0: $self->{content_model} in tag open";
631     }
632     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
633     ## NOTE: The "close tag open state" in the spec is implemented as
634     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
635    
636 wakaba 1.10 ## XML5: "end tag state".
637    
638 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
639     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640     if (defined $self->{last_stag_name}) {
641     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642 wakaba 1.12 $self->{kwd} = '';
643 wakaba 1.1 ## Reconsume.
644     redo A;
645     } else {
646     ## No start tag token has ever been emitted
647     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
648     !!!cp (28);
649     $self->{state} = DATA_STATE;
650 wakaba 1.5 $self->{s_kwd} = '';
651 wakaba 1.1 ## Reconsume.
652     !!!emit ({type => CHARACTER_TOKEN, data => '</',
653     line => $l, column => $c,
654     });
655     redo A;
656     }
657     }
658    
659     if (0x0041 <= $self->{nc} and
660     $self->{nc} <= 0x005A) { # A..Z
661     !!!cp (29);
662     $self->{ct}
663     = {type => END_TAG_TOKEN,
664 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
665 wakaba 1.1 line => $l, column => $c};
666     $self->{state} = TAG_NAME_STATE;
667     !!!next-input-character;
668     redo A;
669     } elsif (0x0061 <= $self->{nc} and
670     $self->{nc} <= 0x007A) { # a..z
671     !!!cp (30);
672     $self->{ct} = {type => END_TAG_TOKEN,
673     tag_name => chr ($self->{nc}),
674     line => $l, column => $c};
675     $self->{state} = TAG_NAME_STATE;
676     !!!next-input-character;
677     redo A;
678     } elsif ($self->{nc} == 0x003E) { # >
679     !!!parse-error (type => 'empty end tag',
680     line => $self->{line_prev}, ## "<" in "</>"
681     column => $self->{column_prev} - 1);
682     $self->{state} = DATA_STATE;
683 wakaba 1.5 $self->{s_kwd} = '';
684 wakaba 1.10 if ($self->{is_xml}) {
685     !!!cp (31);
686     ## XML5: No parse error.
687    
688     ## NOTE: This parser raises a parse error, since it supports
689     ## XML1, not XML5.
690    
691     ## NOTE: A short end tag token.
692     my $ct = {type => END_TAG_TOKEN,
693     tag_name => '',
694     line => $self->{line_prev},
695     column => $self->{column_prev} - 1,
696     };
697     !!!next-input-character;
698     !!!emit ($ct);
699     } else {
700     !!!cp (31.1);
701     !!!next-input-character;
702     }
703 wakaba 1.1 redo A;
704     } elsif ($self->{nc} == -1) {
705     !!!cp (32);
706     !!!parse-error (type => 'bare etago');
707 wakaba 1.5 $self->{s_kwd} = '';
708 wakaba 1.1 $self->{state} = DATA_STATE;
709     # reconsume
710    
711     !!!emit ({type => CHARACTER_TOKEN, data => '</',
712     line => $l, column => $c,
713     });
714    
715     redo A;
716 wakaba 1.10 } elsif (not $self->{is_xml} or
717     $is_space->{$self->{nc}}) {
718 wakaba 1.1 !!!cp (33);
719 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
720     line => $self->{line_prev}, # "<" of "</"
721     column => $self->{column_prev} - 1);
722 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
723     $self->{ct} = {type => COMMENT_TOKEN, data => '',
724     line => $self->{line_prev}, # "<" of "</"
725     column => $self->{column_prev} - 1,
726     };
727     ## NOTE: $self->{nc} is intentionally left as is.
728     ## Although the "anything else" case of the spec not explicitly
729     ## states that the next input character is to be reconsumed,
730     ## it will be included to the |data| of the comment token
731     ## generated from the bogus end tag, as defined in the
732     ## "bogus comment state" entry.
733     redo A;
734 wakaba 1.10 } else {
735     ## XML5: "</:" is a parse error.
736     !!!cp (30.1);
737     $self->{ct} = {type => END_TAG_TOKEN,
738     tag_name => chr ($self->{nc}),
739     line => $l, column => $c};
740     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
741     !!!next-input-character;
742     redo A;
743 wakaba 1.1 }
744     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
745 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
746 wakaba 1.1 if (length $ch) {
747     my $CH = $ch;
748     $ch =~ tr/a-z/A-Z/;
749     my $nch = chr $self->{nc};
750     if ($nch eq $ch or $nch eq $CH) {
751     !!!cp (24);
752     ## Stay in the state.
753 wakaba 1.12 $self->{kwd} .= $nch;
754 wakaba 1.1 !!!next-input-character;
755     redo A;
756     } else {
757     !!!cp (25);
758     $self->{state} = DATA_STATE;
759 wakaba 1.5 $self->{s_kwd} = '';
760 wakaba 1.1 ## Reconsume.
761     !!!emit ({type => CHARACTER_TOKEN,
762 wakaba 1.12 data => '</' . $self->{kwd},
763 wakaba 1.1 line => $self->{line_prev},
764 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
765 wakaba 1.1 });
766     redo A;
767     }
768     } else { # after "<{tag-name}"
769     unless ($is_space->{$self->{nc}} or
770     {
771     0x003E => 1, # >
772     0x002F => 1, # /
773     -1 => 1, # EOF
774     }->{$self->{nc}}) {
775     !!!cp (26);
776     ## Reconsume.
777     $self->{state} = DATA_STATE;
778 wakaba 1.5 $self->{s_kwd} = '';
779 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
780 wakaba 1.12 data => '</' . $self->{kwd},
781 wakaba 1.1 line => $self->{line_prev},
782 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
783 wakaba 1.1 });
784     redo A;
785     } else {
786     !!!cp (27);
787     $self->{ct}
788     = {type => END_TAG_TOKEN,
789     tag_name => $self->{last_stag_name},
790     line => $self->{line_prev},
791 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
792 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
793     ## Reconsume.
794     redo A;
795     }
796     }
797     } elsif ($self->{state} == TAG_NAME_STATE) {
798     if ($is_space->{$self->{nc}}) {
799     !!!cp (34);
800     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
801     !!!next-input-character;
802     redo A;
803     } elsif ($self->{nc} == 0x003E) { # >
804     if ($self->{ct}->{type} == START_TAG_TOKEN) {
805     !!!cp (35);
806     $self->{last_stag_name} = $self->{ct}->{tag_name};
807     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
808     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
809     #if ($self->{ct}->{attributes}) {
810     # ## NOTE: This should never be reached.
811     # !!! cp (36);
812     # !!! parse-error (type => 'end tag attribute');
813     #} else {
814     !!!cp (37);
815     #}
816     } else {
817     die "$0: $self->{ct}->{type}: Unknown token type";
818     }
819     $self->{state} = DATA_STATE;
820 wakaba 1.5 $self->{s_kwd} = '';
821 wakaba 1.1 !!!next-input-character;
822    
823     !!!emit ($self->{ct}); # start tag or end tag
824    
825     redo A;
826     } elsif (0x0041 <= $self->{nc} and
827     $self->{nc} <= 0x005A) { # A..Z
828     !!!cp (38);
829 wakaba 1.4 $self->{ct}->{tag_name}
830     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
831 wakaba 1.1 # start tag or end tag
832     ## Stay in this state
833     !!!next-input-character;
834     redo A;
835     } elsif ($self->{nc} == -1) {
836     !!!parse-error (type => 'unclosed tag');
837     if ($self->{ct}->{type} == START_TAG_TOKEN) {
838     !!!cp (39);
839     $self->{last_stag_name} = $self->{ct}->{tag_name};
840     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
841     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
842     #if ($self->{ct}->{attributes}) {
843     # ## NOTE: This state should never be reached.
844     # !!! cp (40);
845     # !!! parse-error (type => 'end tag attribute');
846     #} else {
847     !!!cp (41);
848     #}
849     } else {
850     die "$0: $self->{ct}->{type}: Unknown token type";
851     }
852     $self->{state} = DATA_STATE;
853 wakaba 1.5 $self->{s_kwd} = '';
854 wakaba 1.1 # reconsume
855    
856     !!!emit ($self->{ct}); # start tag or end tag
857    
858     redo A;
859     } elsif ($self->{nc} == 0x002F) { # /
860     !!!cp (42);
861     $self->{state} = SELF_CLOSING_START_TAG_STATE;
862     !!!next-input-character;
863     redo A;
864     } else {
865     !!!cp (44);
866     $self->{ct}->{tag_name} .= chr $self->{nc};
867     # start tag or end tag
868     ## Stay in the state
869     !!!next-input-character;
870     redo A;
871     }
872     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
873 wakaba 1.11 ## XML5: "Tag attribute name before state".
874    
875 wakaba 1.1 if ($is_space->{$self->{nc}}) {
876     !!!cp (45);
877     ## Stay in the state
878     !!!next-input-character;
879     redo A;
880     } elsif ($self->{nc} == 0x003E) { # >
881     if ($self->{ct}->{type} == START_TAG_TOKEN) {
882     !!!cp (46);
883     $self->{last_stag_name} = $self->{ct}->{tag_name};
884     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
885     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
886     if ($self->{ct}->{attributes}) {
887     !!!cp (47);
888     !!!parse-error (type => 'end tag attribute');
889     } else {
890     !!!cp (48);
891     }
892     } else {
893     die "$0: $self->{ct}->{type}: Unknown token type";
894     }
895     $self->{state} = DATA_STATE;
896 wakaba 1.5 $self->{s_kwd} = '';
897 wakaba 1.1 !!!next-input-character;
898    
899     !!!emit ($self->{ct}); # start tag or end tag
900    
901     redo A;
902     } elsif (0x0041 <= $self->{nc} and
903     $self->{nc} <= 0x005A) { # A..Z
904     !!!cp (49);
905     $self->{ca}
906 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
907 wakaba 1.1 value => '',
908     line => $self->{line}, column => $self->{column}};
909     $self->{state} = ATTRIBUTE_NAME_STATE;
910     !!!next-input-character;
911     redo A;
912     } elsif ($self->{nc} == 0x002F) { # /
913     !!!cp (50);
914     $self->{state} = SELF_CLOSING_START_TAG_STATE;
915     !!!next-input-character;
916     redo A;
917     } elsif ($self->{nc} == -1) {
918     !!!parse-error (type => 'unclosed tag');
919     if ($self->{ct}->{type} == START_TAG_TOKEN) {
920     !!!cp (52);
921     $self->{last_stag_name} = $self->{ct}->{tag_name};
922     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
923     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
924     if ($self->{ct}->{attributes}) {
925     !!!cp (53);
926     !!!parse-error (type => 'end tag attribute');
927     } else {
928     !!!cp (54);
929     }
930     } else {
931     die "$0: $self->{ct}->{type}: Unknown token type";
932     }
933     $self->{state} = DATA_STATE;
934 wakaba 1.5 $self->{s_kwd} = '';
935 wakaba 1.1 # reconsume
936    
937     !!!emit ($self->{ct}); # start tag or end tag
938    
939     redo A;
940     } else {
941     if ({
942     0x0022 => 1, # "
943     0x0027 => 1, # '
944     0x003D => 1, # =
945     }->{$self->{nc}}) {
946     !!!cp (55);
947 wakaba 1.11 ## XML5: Not a parse error.
948 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
949     } else {
950     !!!cp (56);
951 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
952 wakaba 1.1 }
953     $self->{ca}
954     = {name => chr ($self->{nc}),
955     value => '',
956     line => $self->{line}, column => $self->{column}};
957     $self->{state} = ATTRIBUTE_NAME_STATE;
958     !!!next-input-character;
959     redo A;
960     }
961     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
962 wakaba 1.11 ## XML5: "Tag attribute name state".
963    
964 wakaba 1.1 my $before_leave = sub {
965     if (exists $self->{ct}->{attributes} # start tag or end tag
966     ->{$self->{ca}->{name}}) { # MUST
967     !!!cp (57);
968     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
969     ## Discard $self->{ca} # MUST
970     } else {
971     !!!cp (58);
972     $self->{ct}->{attributes}->{$self->{ca}->{name}}
973     = $self->{ca};
974 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
975 wakaba 1.1 }
976     }; # $before_leave
977    
978     if ($is_space->{$self->{nc}}) {
979     !!!cp (59);
980     $before_leave->();
981     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
982     !!!next-input-character;
983     redo A;
984     } elsif ($self->{nc} == 0x003D) { # =
985     !!!cp (60);
986     $before_leave->();
987     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
988     !!!next-input-character;
989     redo A;
990     } elsif ($self->{nc} == 0x003E) { # >
991 wakaba 1.11 if ($self->{is_xml}) {
992     !!!cp (60.1);
993     ## XML5: Not a parse error.
994     !!!parse-error (type => 'no attr value'); ## TODO: type
995     } else {
996     !!!cp (60.2);
997     }
998    
999 wakaba 1.1 $before_leave->();
1000     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1001     !!!cp (61);
1002     $self->{last_stag_name} = $self->{ct}->{tag_name};
1003     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1004     !!!cp (62);
1005     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1006     if ($self->{ct}->{attributes}) {
1007     !!!parse-error (type => 'end tag attribute');
1008     }
1009     } else {
1010     die "$0: $self->{ct}->{type}: Unknown token type";
1011     }
1012     $self->{state} = DATA_STATE;
1013 wakaba 1.5 $self->{s_kwd} = '';
1014 wakaba 1.1 !!!next-input-character;
1015    
1016     !!!emit ($self->{ct}); # start tag or end tag
1017    
1018     redo A;
1019     } elsif (0x0041 <= $self->{nc} and
1020     $self->{nc} <= 0x005A) { # A..Z
1021     !!!cp (63);
1022 wakaba 1.4 $self->{ca}->{name}
1023     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1024 wakaba 1.1 ## Stay in the state
1025     !!!next-input-character;
1026     redo A;
1027     } elsif ($self->{nc} == 0x002F) { # /
1028 wakaba 1.11 if ($self->{is_xml}) {
1029     !!!cp (64);
1030     ## XML5: Not a parse error.
1031     !!!parse-error (type => 'no attr value'); ## TODO: type
1032     } else {
1033     !!!cp (64.1);
1034     }
1035    
1036 wakaba 1.1 $before_leave->();
1037     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1038     !!!next-input-character;
1039     redo A;
1040     } elsif ($self->{nc} == -1) {
1041     !!!parse-error (type => 'unclosed tag');
1042     $before_leave->();
1043     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1044     !!!cp (66);
1045     $self->{last_stag_name} = $self->{ct}->{tag_name};
1046     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1047     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1048     if ($self->{ct}->{attributes}) {
1049     !!!cp (67);
1050     !!!parse-error (type => 'end tag attribute');
1051     } else {
1052     ## NOTE: This state should never be reached.
1053     !!!cp (68);
1054     }
1055     } else {
1056     die "$0: $self->{ct}->{type}: Unknown token type";
1057     }
1058     $self->{state} = DATA_STATE;
1059 wakaba 1.5 $self->{s_kwd} = '';
1060 wakaba 1.1 # reconsume
1061    
1062     !!!emit ($self->{ct}); # start tag or end tag
1063    
1064     redo A;
1065     } else {
1066     if ($self->{nc} == 0x0022 or # "
1067     $self->{nc} == 0x0027) { # '
1068     !!!cp (69);
1069 wakaba 1.11 ## XML5: Not a parse error.
1070 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1071     } else {
1072     !!!cp (70);
1073     }
1074     $self->{ca}->{name} .= chr ($self->{nc});
1075     ## Stay in the state
1076     !!!next-input-character;
1077     redo A;
1078     }
1079     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1080 wakaba 1.11 ## XML5: "Tag attribute name after state".
1081    
1082 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1083     !!!cp (71);
1084     ## Stay in the state
1085     !!!next-input-character;
1086     redo A;
1087     } elsif ($self->{nc} == 0x003D) { # =
1088     !!!cp (72);
1089     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1090     !!!next-input-character;
1091     redo A;
1092     } elsif ($self->{nc} == 0x003E) { # >
1093 wakaba 1.11 if ($self->{is_xml}) {
1094     !!!cp (72.1);
1095     ## XML5: Not a parse error.
1096     !!!parse-error (type => 'no attr value'); ## TODO: type
1097     } else {
1098     !!!cp (72.2);
1099     }
1100    
1101 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1102     !!!cp (73);
1103     $self->{last_stag_name} = $self->{ct}->{tag_name};
1104     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1105     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1106     if ($self->{ct}->{attributes}) {
1107     !!!cp (74);
1108     !!!parse-error (type => 'end tag attribute');
1109     } else {
1110     ## NOTE: This state should never be reached.
1111     !!!cp (75);
1112     }
1113     } else {
1114     die "$0: $self->{ct}->{type}: Unknown token type";
1115     }
1116     $self->{state} = DATA_STATE;
1117 wakaba 1.5 $self->{s_kwd} = '';
1118 wakaba 1.1 !!!next-input-character;
1119    
1120     !!!emit ($self->{ct}); # start tag or end tag
1121    
1122     redo A;
1123     } elsif (0x0041 <= $self->{nc} and
1124     $self->{nc} <= 0x005A) { # A..Z
1125     !!!cp (76);
1126     $self->{ca}
1127 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1128 wakaba 1.1 value => '',
1129     line => $self->{line}, column => $self->{column}};
1130     $self->{state} = ATTRIBUTE_NAME_STATE;
1131     !!!next-input-character;
1132     redo A;
1133     } elsif ($self->{nc} == 0x002F) { # /
1134 wakaba 1.11 if ($self->{is_xml}) {
1135     !!!cp (77);
1136     ## XML5: Not a parse error.
1137     !!!parse-error (type => 'no attr value'); ## TODO: type
1138     } else {
1139     !!!cp (77.1);
1140     }
1141    
1142 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1143     !!!next-input-character;
1144     redo A;
1145     } elsif ($self->{nc} == -1) {
1146     !!!parse-error (type => 'unclosed tag');
1147     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1148     !!!cp (79);
1149     $self->{last_stag_name} = $self->{ct}->{tag_name};
1150     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1151     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1152     if ($self->{ct}->{attributes}) {
1153     !!!cp (80);
1154     !!!parse-error (type => 'end tag attribute');
1155     } else {
1156     ## NOTE: This state should never be reached.
1157     !!!cp (81);
1158     }
1159     } else {
1160     die "$0: $self->{ct}->{type}: Unknown token type";
1161     }
1162 wakaba 1.5 $self->{s_kwd} = '';
1163 wakaba 1.1 $self->{state} = DATA_STATE;
1164     # reconsume
1165    
1166     !!!emit ($self->{ct}); # start tag or end tag
1167    
1168     redo A;
1169     } else {
1170 wakaba 1.11 if ($self->{is_xml}) {
1171     !!!cp (78.1);
1172     ## XML5: Not a parse error.
1173     !!!parse-error (type => 'no attr value'); ## TODO: type
1174     } else {
1175     !!!cp (78.2);
1176     }
1177    
1178 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1179     $self->{nc} == 0x0027) { # '
1180     !!!cp (78);
1181 wakaba 1.11 ## XML5: Not a parse error.
1182 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1183     } else {
1184     !!!cp (82);
1185     }
1186     $self->{ca}
1187     = {name => chr ($self->{nc}),
1188     value => '',
1189     line => $self->{line}, column => $self->{column}};
1190     $self->{state} = ATTRIBUTE_NAME_STATE;
1191     !!!next-input-character;
1192     redo A;
1193     }
1194     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1195 wakaba 1.11 ## XML5: "Tag attribute value before state".
1196    
1197 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1198     !!!cp (83);
1199     ## Stay in the state
1200     !!!next-input-character;
1201     redo A;
1202     } elsif ($self->{nc} == 0x0022) { # "
1203     !!!cp (84);
1204     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1205     !!!next-input-character;
1206     redo A;
1207     } elsif ($self->{nc} == 0x0026) { # &
1208     !!!cp (85);
1209     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1210     ## reconsume
1211     redo A;
1212     } elsif ($self->{nc} == 0x0027) { # '
1213     !!!cp (86);
1214     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1215     !!!next-input-character;
1216     redo A;
1217     } elsif ($self->{nc} == 0x003E) { # >
1218     !!!parse-error (type => 'empty unquoted attribute value');
1219     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1220     !!!cp (87);
1221     $self->{last_stag_name} = $self->{ct}->{tag_name};
1222     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1223     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1224     if ($self->{ct}->{attributes}) {
1225     !!!cp (88);
1226     !!!parse-error (type => 'end tag attribute');
1227     } else {
1228     ## NOTE: This state should never be reached.
1229     !!!cp (89);
1230     }
1231     } else {
1232     die "$0: $self->{ct}->{type}: Unknown token type";
1233     }
1234     $self->{state} = DATA_STATE;
1235 wakaba 1.5 $self->{s_kwd} = '';
1236 wakaba 1.1 !!!next-input-character;
1237    
1238     !!!emit ($self->{ct}); # start tag or end tag
1239    
1240     redo A;
1241     } elsif ($self->{nc} == -1) {
1242     !!!parse-error (type => 'unclosed tag');
1243     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244     !!!cp (90);
1245     $self->{last_stag_name} = $self->{ct}->{tag_name};
1246     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1248     if ($self->{ct}->{attributes}) {
1249     !!!cp (91);
1250     !!!parse-error (type => 'end tag attribute');
1251     } else {
1252     ## NOTE: This state should never be reached.
1253     !!!cp (92);
1254     }
1255     } else {
1256     die "$0: $self->{ct}->{type}: Unknown token type";
1257     }
1258     $self->{state} = DATA_STATE;
1259 wakaba 1.5 $self->{s_kwd} = '';
1260 wakaba 1.1 ## reconsume
1261    
1262     !!!emit ($self->{ct}); # start tag or end tag
1263    
1264     redo A;
1265     } else {
1266     if ($self->{nc} == 0x003D) { # =
1267     !!!cp (93);
1268 wakaba 1.11 ## XML5: Not a parse error.
1269 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1270 wakaba 1.11 } elsif ($self->{is_xml}) {
1271     !!!cp (93.1);
1272     ## XML5: No parse error.
1273     !!!parse-error (type => 'unquoted attr value'); ## TODO
1274 wakaba 1.1 } else {
1275     !!!cp (94);
1276     }
1277     $self->{ca}->{value} .= chr ($self->{nc});
1278     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1279     !!!next-input-character;
1280     redo A;
1281     }
1282     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1283 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1284     ## ATTLIST attribute value double quoted state".
1285 wakaba 1.11
1286 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1287 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1288     !!!cp (95.1);
1289     ## XML5: "DOCTYPE ATTLIST name after state".
1290     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1291     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1292     } else {
1293     !!!cp (95);
1294     ## XML5: "Tag attribute name before state".
1295     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1296     }
1297 wakaba 1.1 !!!next-input-character;
1298     redo A;
1299     } elsif ($self->{nc} == 0x0026) { # &
1300     !!!cp (96);
1301 wakaba 1.11 ## XML5: Not defined yet.
1302    
1303 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1304     ## "entity in attribute value state". In this implementation, the
1305     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1306     ## implementation of the "consume a character reference" algorithm.
1307     $self->{prev_state} = $self->{state};
1308     $self->{entity_add} = 0x0022; # "
1309     $self->{state} = ENTITY_STATE;
1310     !!!next-input-character;
1311     redo A;
1312     } elsif ($self->{nc} == -1) {
1313     !!!parse-error (type => 'unclosed attribute value');
1314     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315     !!!cp (97);
1316     $self->{last_stag_name} = $self->{ct}->{tag_name};
1317 wakaba 1.15
1318     $self->{state} = DATA_STATE;
1319     $self->{s_kwd} = '';
1320     ## reconsume
1321     !!!emit ($self->{ct}); # start tag
1322     redo A;
1323 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325     if ($self->{ct}->{attributes}) {
1326     !!!cp (98);
1327     !!!parse-error (type => 'end tag attribute');
1328     } else {
1329     ## NOTE: This state should never be reached.
1330     !!!cp (99);
1331     }
1332 wakaba 1.15
1333     $self->{state} = DATA_STATE;
1334     $self->{s_kwd} = '';
1335     ## reconsume
1336     !!!emit ($self->{ct}); # end tag
1337     redo A;
1338     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1339     ## XML5: No parse error above; not defined yet.
1340     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1341     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1342     ## Reconsume.
1343     !!!emit ($self->{ct}); # ATTLIST
1344     redo A;
1345 wakaba 1.1 } else {
1346     die "$0: $self->{ct}->{type}: Unknown token type";
1347     }
1348     } else {
1349 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1350 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1351     !!!cp (100);
1352     ## XML5: Not a parse error.
1353     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1354     } else {
1355     !!!cp (100.1);
1356     }
1357 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1358     $self->{read_until}->($self->{ca}->{value},
1359 wakaba 1.11 q["&<],
1360 wakaba 1.1 length $self->{ca}->{value});
1361    
1362     ## Stay in the state
1363     !!!next-input-character;
1364     redo A;
1365     }
1366     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1367 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1368     ## ATTLIST attribute value single quoted state".
1369 wakaba 1.11
1370 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1371 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1372     !!!cp (101.1);
1373     ## XML5: "DOCTYPE ATTLIST name after state".
1374     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1375     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1376     } else {
1377     !!!cp (101);
1378     ## XML5: "Before attribute name state" (sic).
1379     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1380     }
1381 wakaba 1.1 !!!next-input-character;
1382     redo A;
1383     } elsif ($self->{nc} == 0x0026) { # &
1384     !!!cp (102);
1385 wakaba 1.11 ## XML5: Not defined yet.
1386    
1387 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1388     ## "entity in attribute value state". In this implementation, the
1389     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1390     ## implementation of the "consume a character reference" algorithm.
1391     $self->{entity_add} = 0x0027; # '
1392     $self->{prev_state} = $self->{state};
1393     $self->{state} = ENTITY_STATE;
1394     !!!next-input-character;
1395     redo A;
1396     } elsif ($self->{nc} == -1) {
1397     !!!parse-error (type => 'unclosed attribute value');
1398     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1399     !!!cp (103);
1400     $self->{last_stag_name} = $self->{ct}->{tag_name};
1401 wakaba 1.15
1402     $self->{state} = DATA_STATE;
1403     $self->{s_kwd} = '';
1404     ## reconsume
1405     !!!emit ($self->{ct}); # start tag
1406     redo A;
1407 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1408     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1409     if ($self->{ct}->{attributes}) {
1410     !!!cp (104);
1411     !!!parse-error (type => 'end tag attribute');
1412     } else {
1413     ## NOTE: This state should never be reached.
1414     !!!cp (105);
1415     }
1416 wakaba 1.15
1417     $self->{state} = DATA_STATE;
1418     $self->{s_kwd} = '';
1419     ## reconsume
1420     !!!emit ($self->{ct}); # end tag
1421     redo A;
1422     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1423     ## XML5: No parse error above; not defined yet.
1424     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1425     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1426     ## Reconsume.
1427     !!!emit ($self->{ct}); # ATTLIST
1428     redo A;
1429 wakaba 1.1 } else {
1430     die "$0: $self->{ct}->{type}: Unknown token type";
1431     }
1432     } else {
1433 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1434 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1435     !!!cp (106);
1436     ## XML5: Not a parse error.
1437     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1438     } else {
1439     !!!cp (106.1);
1440     }
1441 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1442     $self->{read_until}->($self->{ca}->{value},
1443 wakaba 1.11 q['&<],
1444 wakaba 1.1 length $self->{ca}->{value});
1445    
1446     ## Stay in the state
1447     !!!next-input-character;
1448     redo A;
1449     }
1450     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1451 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1452    
1453 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1454 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1455     !!!cp (107.1);
1456     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1457     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1458     } else {
1459     !!!cp (107);
1460     ## XML5: "Tag attribute name before state".
1461     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1462     }
1463 wakaba 1.1 !!!next-input-character;
1464     redo A;
1465     } elsif ($self->{nc} == 0x0026) { # &
1466     !!!cp (108);
1467 wakaba 1.11
1468     ## XML5: Not defined yet.
1469    
1470 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1471     ## "entity in attribute value state". In this implementation, the
1472     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1473     ## implementation of the "consume a character reference" algorithm.
1474     $self->{entity_add} = -1;
1475     $self->{prev_state} = $self->{state};
1476     $self->{state} = ENTITY_STATE;
1477     !!!next-input-character;
1478     redo A;
1479     } elsif ($self->{nc} == 0x003E) { # >
1480     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1481     !!!cp (109);
1482     $self->{last_stag_name} = $self->{ct}->{tag_name};
1483 wakaba 1.15
1484     $self->{state} = DATA_STATE;
1485     $self->{s_kwd} = '';
1486     !!!next-input-character;
1487     !!!emit ($self->{ct}); # start tag
1488     redo A;
1489 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1491     if ($self->{ct}->{attributes}) {
1492     !!!cp (110);
1493     !!!parse-error (type => 'end tag attribute');
1494     } else {
1495     ## NOTE: This state should never be reached.
1496     !!!cp (111);
1497     }
1498 wakaba 1.15
1499     $self->{state} = DATA_STATE;
1500     $self->{s_kwd} = '';
1501     !!!next-input-character;
1502     !!!emit ($self->{ct}); # end tag
1503     redo A;
1504     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1505     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1506     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1507     !!!next-input-character;
1508     !!!emit ($self->{ct}); # ATTLIST
1509     redo A;
1510 wakaba 1.1 } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     } elsif ($self->{nc} == -1) {
1514     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1515     !!!cp (112);
1516 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1517 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1518 wakaba 1.15
1519     $self->{state} = DATA_STATE;
1520     $self->{s_kwd} = '';
1521     ## reconsume
1522     !!!emit ($self->{ct}); # start tag
1523     redo A;
1524 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1526 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1527     if ($self->{ct}->{attributes}) {
1528     !!!cp (113);
1529     !!!parse-error (type => 'end tag attribute');
1530     } else {
1531     ## NOTE: This state should never be reached.
1532     !!!cp (114);
1533     }
1534 wakaba 1.15
1535     $self->{state} = DATA_STATE;
1536     $self->{s_kwd} = '';
1537     ## reconsume
1538     !!!emit ($self->{ct}); # end tag
1539     redo A;
1540     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1541     !!!parse-error (type => 'unclosed md'); ## TODO: type
1542     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1543     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1544     ## Reconsume.
1545     !!!emit ($self->{ct}); # ATTLIST
1546     redo A;
1547 wakaba 1.1 } else {
1548     die "$0: $self->{ct}->{type}: Unknown token type";
1549     }
1550     } else {
1551     if ({
1552     0x0022 => 1, # "
1553     0x0027 => 1, # '
1554     0x003D => 1, # =
1555     }->{$self->{nc}}) {
1556     !!!cp (115);
1557 wakaba 1.11 ## XML5: Not a parse error.
1558 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1559     } else {
1560     !!!cp (116);
1561     }
1562     $self->{ca}->{value} .= chr ($self->{nc});
1563     $self->{read_until}->($self->{ca}->{value},
1564     q["'=& >],
1565     length $self->{ca}->{value});
1566    
1567     ## Stay in the state
1568     !!!next-input-character;
1569     redo A;
1570     }
1571     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1572     if ($is_space->{$self->{nc}}) {
1573     !!!cp (118);
1574     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1575     !!!next-input-character;
1576     redo A;
1577     } elsif ($self->{nc} == 0x003E) { # >
1578     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1579     !!!cp (119);
1580     $self->{last_stag_name} = $self->{ct}->{tag_name};
1581     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1582     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1583     if ($self->{ct}->{attributes}) {
1584     !!!cp (120);
1585     !!!parse-error (type => 'end tag attribute');
1586     } else {
1587     ## NOTE: This state should never be reached.
1588     !!!cp (121);
1589     }
1590     } else {
1591     die "$0: $self->{ct}->{type}: Unknown token type";
1592     }
1593     $self->{state} = DATA_STATE;
1594 wakaba 1.5 $self->{s_kwd} = '';
1595 wakaba 1.1 !!!next-input-character;
1596    
1597     !!!emit ($self->{ct}); # start tag or end tag
1598    
1599     redo A;
1600     } elsif ($self->{nc} == 0x002F) { # /
1601     !!!cp (122);
1602     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1603     !!!next-input-character;
1604     redo A;
1605     } elsif ($self->{nc} == -1) {
1606     !!!parse-error (type => 'unclosed tag');
1607     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1608     !!!cp (122.3);
1609     $self->{last_stag_name} = $self->{ct}->{tag_name};
1610     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1611     if ($self->{ct}->{attributes}) {
1612     !!!cp (122.1);
1613     !!!parse-error (type => 'end tag attribute');
1614     } else {
1615     ## NOTE: This state should never be reached.
1616     !!!cp (122.2);
1617     }
1618     } else {
1619     die "$0: $self->{ct}->{type}: Unknown token type";
1620     }
1621     $self->{state} = DATA_STATE;
1622 wakaba 1.5 $self->{s_kwd} = '';
1623 wakaba 1.1 ## Reconsume.
1624     !!!emit ($self->{ct}); # start tag or end tag
1625     redo A;
1626     } else {
1627     !!!cp ('124.1');
1628     !!!parse-error (type => 'no space between attributes');
1629     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1630     ## reconsume
1631     redo A;
1632     }
1633     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1634 wakaba 1.11 ## XML5: "Empty tag state".
1635    
1636 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1637     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1638     !!!cp ('124.2');
1639     !!!parse-error (type => 'nestc', token => $self->{ct});
1640     ## TODO: Different type than slash in start tag
1641     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1642     if ($self->{ct}->{attributes}) {
1643     !!!cp ('124.4');
1644     !!!parse-error (type => 'end tag attribute');
1645     } else {
1646     !!!cp ('124.5');
1647     }
1648     ## TODO: Test |<title></title/>|
1649     } else {
1650     !!!cp ('124.3');
1651     $self->{self_closing} = 1;
1652     }
1653    
1654     $self->{state} = DATA_STATE;
1655 wakaba 1.5 $self->{s_kwd} = '';
1656 wakaba 1.1 !!!next-input-character;
1657    
1658     !!!emit ($self->{ct}); # start tag or end tag
1659    
1660     redo A;
1661     } elsif ($self->{nc} == -1) {
1662     !!!parse-error (type => 'unclosed tag');
1663     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1664     !!!cp (124.7);
1665     $self->{last_stag_name} = $self->{ct}->{tag_name};
1666     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1667     if ($self->{ct}->{attributes}) {
1668     !!!cp (124.5);
1669     !!!parse-error (type => 'end tag attribute');
1670     } else {
1671     ## NOTE: This state should never be reached.
1672     !!!cp (124.6);
1673     }
1674     } else {
1675     die "$0: $self->{ct}->{type}: Unknown token type";
1676     }
1677 wakaba 1.11 ## XML5: "Tag attribute name before state".
1678 wakaba 1.1 $self->{state} = DATA_STATE;
1679 wakaba 1.5 $self->{s_kwd} = '';
1680 wakaba 1.1 ## Reconsume.
1681     !!!emit ($self->{ct}); # start tag or end tag
1682     redo A;
1683     } else {
1684     !!!cp ('124.4');
1685     !!!parse-error (type => 'nestc');
1686     ## TODO: This error type is wrong.
1687     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1688     ## Reconsume.
1689     redo A;
1690     }
1691     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1692 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1693    
1694 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1695     ## consumes characters on a one-by-one basis.
1696    
1697     if ($self->{nc} == 0x003E) { # >
1698 wakaba 1.13 if ($self->{in_subset}) {
1699     !!!cp (123);
1700     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1701     } else {
1702     !!!cp (124);
1703     $self->{state} = DATA_STATE;
1704     $self->{s_kwd} = '';
1705     }
1706 wakaba 1.1 !!!next-input-character;
1707    
1708     !!!emit ($self->{ct}); # comment
1709     redo A;
1710     } elsif ($self->{nc} == -1) {
1711 wakaba 1.13 if ($self->{in_subset}) {
1712     !!!cp (125.1);
1713     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1714     } else {
1715     !!!cp (125);
1716     $self->{state} = DATA_STATE;
1717     $self->{s_kwd} = '';
1718     }
1719 wakaba 1.1 ## reconsume
1720    
1721     !!!emit ($self->{ct}); # comment
1722     redo A;
1723     } else {
1724     !!!cp (126);
1725     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1726     $self->{read_until}->($self->{ct}->{data},
1727     q[>],
1728     length $self->{ct}->{data});
1729    
1730     ## Stay in the state.
1731     !!!next-input-character;
1732     redo A;
1733     }
1734     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1735 wakaba 1.14 ## XML5: "Markup declaration state".
1736 wakaba 1.1
1737     if ($self->{nc} == 0x002D) { # -
1738     !!!cp (133);
1739     $self->{state} = MD_HYPHEN_STATE;
1740     !!!next-input-character;
1741     redo A;
1742     } elsif ($self->{nc} == 0x0044 or # D
1743     $self->{nc} == 0x0064) { # d
1744     ## ASCII case-insensitive.
1745     !!!cp (130);
1746     $self->{state} = MD_DOCTYPE_STATE;
1747 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1748 wakaba 1.1 !!!next-input-character;
1749     redo A;
1750 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1751     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1752     $self->{is_xml}) and
1753 wakaba 1.1 $self->{nc} == 0x005B) { # [
1754     !!!cp (135.4);
1755     $self->{state} = MD_CDATA_STATE;
1756 wakaba 1.12 $self->{kwd} = '[';
1757 wakaba 1.1 !!!next-input-character;
1758     redo A;
1759     } else {
1760     !!!cp (136);
1761     }
1762    
1763     !!!parse-error (type => 'bogus comment',
1764     line => $self->{line_prev},
1765     column => $self->{column_prev} - 1);
1766     ## Reconsume.
1767     $self->{state} = BOGUS_COMMENT_STATE;
1768     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1769     line => $self->{line_prev},
1770     column => $self->{column_prev} - 1,
1771     };
1772     redo A;
1773     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1774     if ($self->{nc} == 0x002D) { # -
1775     !!!cp (127);
1776     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1777     line => $self->{line_prev},
1778     column => $self->{column_prev} - 2,
1779     };
1780 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1781 wakaba 1.1 !!!next-input-character;
1782     redo A;
1783     } else {
1784     !!!cp (128);
1785     !!!parse-error (type => 'bogus comment',
1786     line => $self->{line_prev},
1787     column => $self->{column_prev} - 2);
1788     $self->{state} = BOGUS_COMMENT_STATE;
1789     ## Reconsume.
1790     $self->{ct} = {type => COMMENT_TOKEN,
1791     data => '-',
1792     line => $self->{line_prev},
1793     column => $self->{column_prev} - 2,
1794     };
1795     redo A;
1796     }
1797     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1798     ## ASCII case-insensitive.
1799     if ($self->{nc} == [
1800     undef,
1801     0x004F, # O
1802     0x0043, # C
1803     0x0054, # T
1804     0x0059, # Y
1805     0x0050, # P
1806 wakaba 1.12 ]->[length $self->{kwd}] or
1807 wakaba 1.1 $self->{nc} == [
1808     undef,
1809     0x006F, # o
1810     0x0063, # c
1811     0x0074, # t
1812     0x0079, # y
1813     0x0070, # p
1814 wakaba 1.12 ]->[length $self->{kwd}]) {
1815 wakaba 1.1 !!!cp (131);
1816     ## Stay in the state.
1817 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1818 wakaba 1.1 !!!next-input-character;
1819     redo A;
1820 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1821 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1822     $self->{nc} == 0x0065)) { # e
1823 wakaba 1.12 if ($self->{is_xml} and
1824     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1825 wakaba 1.10 !!!cp (129);
1826     ## XML5: case-sensitive.
1827     !!!parse-error (type => 'lowercase keyword', ## TODO
1828     text => 'DOCTYPE',
1829     line => $self->{line_prev},
1830     column => $self->{column_prev} - 5);
1831     } else {
1832     !!!cp (129.1);
1833     }
1834 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1835     $self->{ct} = {type => DOCTYPE_TOKEN,
1836     quirks => 1,
1837     line => $self->{line_prev},
1838     column => $self->{column_prev} - 7,
1839     };
1840     !!!next-input-character;
1841     redo A;
1842     } else {
1843     !!!cp (132);
1844     !!!parse-error (type => 'bogus comment',
1845     line => $self->{line_prev},
1846 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1847 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1848     ## Reconsume.
1849     $self->{ct} = {type => COMMENT_TOKEN,
1850 wakaba 1.12 data => $self->{kwd},
1851 wakaba 1.1 line => $self->{line_prev},
1852 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1853 wakaba 1.1 };
1854     redo A;
1855     }
1856     } elsif ($self->{state} == MD_CDATA_STATE) {
1857     if ($self->{nc} == {
1858     '[' => 0x0043, # C
1859     '[C' => 0x0044, # D
1860     '[CD' => 0x0041, # A
1861     '[CDA' => 0x0054, # T
1862     '[CDAT' => 0x0041, # A
1863 wakaba 1.12 }->{$self->{kwd}}) {
1864 wakaba 1.1 !!!cp (135.1);
1865     ## Stay in the state.
1866 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1867 wakaba 1.1 !!!next-input-character;
1868     redo A;
1869 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1870 wakaba 1.1 $self->{nc} == 0x005B) { # [
1871 wakaba 1.6 if ($self->{is_xml} and
1872     not $self->{tainted} and
1873     @{$self->{open_elements} or []} == 0) {
1874 wakaba 1.8 !!!cp (135.2);
1875 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1876     line => $self->{line_prev},
1877     column => $self->{column_prev} - 7);
1878     $self->{tainted} = 1;
1879 wakaba 1.8 } else {
1880     !!!cp (135.21);
1881 wakaba 1.6 }
1882    
1883 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1884     data => '',
1885     line => $self->{line_prev},
1886     column => $self->{column_prev} - 7};
1887     $self->{state} = CDATA_SECTION_STATE;
1888     !!!next-input-character;
1889     redo A;
1890     } else {
1891     !!!cp (135.3);
1892     !!!parse-error (type => 'bogus comment',
1893     line => $self->{line_prev},
1894 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1895 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1896     ## Reconsume.
1897     $self->{ct} = {type => COMMENT_TOKEN,
1898 wakaba 1.12 data => $self->{kwd},
1899 wakaba 1.1 line => $self->{line_prev},
1900 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1901 wakaba 1.1 };
1902     redo A;
1903     }
1904     } elsif ($self->{state} == COMMENT_START_STATE) {
1905     if ($self->{nc} == 0x002D) { # -
1906     !!!cp (137);
1907     $self->{state} = COMMENT_START_DASH_STATE;
1908     !!!next-input-character;
1909     redo A;
1910     } elsif ($self->{nc} == 0x003E) { # >
1911     !!!parse-error (type => 'bogus comment');
1912 wakaba 1.13 if ($self->{in_subset}) {
1913     !!!cp (138.1);
1914     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915     } else {
1916     !!!cp (138);
1917     $self->{state} = DATA_STATE;
1918     $self->{s_kwd} = '';
1919     }
1920 wakaba 1.1 !!!next-input-character;
1921    
1922     !!!emit ($self->{ct}); # comment
1923    
1924     redo A;
1925     } elsif ($self->{nc} == -1) {
1926     !!!parse-error (type => 'unclosed comment');
1927 wakaba 1.13 if ($self->{in_subset}) {
1928     !!!cp (139.1);
1929     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1930     } else {
1931     !!!cp (139);
1932     $self->{state} = DATA_STATE;
1933     $self->{s_kwd} = '';
1934     }
1935 wakaba 1.1 ## reconsume
1936    
1937     !!!emit ($self->{ct}); # comment
1938    
1939     redo A;
1940     } else {
1941     !!!cp (140);
1942     $self->{ct}->{data} # comment
1943     .= chr ($self->{nc});
1944     $self->{state} = COMMENT_STATE;
1945     !!!next-input-character;
1946     redo A;
1947     }
1948     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1949     if ($self->{nc} == 0x002D) { # -
1950     !!!cp (141);
1951     $self->{state} = COMMENT_END_STATE;
1952     !!!next-input-character;
1953     redo A;
1954     } elsif ($self->{nc} == 0x003E) { # >
1955     !!!parse-error (type => 'bogus comment');
1956 wakaba 1.13 if ($self->{in_subset}) {
1957     !!!cp (142.1);
1958     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1959     } else {
1960     !!!cp (142);
1961     $self->{state} = DATA_STATE;
1962     $self->{s_kwd} = '';
1963     }
1964 wakaba 1.1 !!!next-input-character;
1965    
1966     !!!emit ($self->{ct}); # comment
1967    
1968     redo A;
1969     } elsif ($self->{nc} == -1) {
1970     !!!parse-error (type => 'unclosed comment');
1971 wakaba 1.13 if ($self->{in_subset}) {
1972     !!!cp (143.1);
1973     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1974     } else {
1975     !!!cp (143);
1976     $self->{state} = DATA_STATE;
1977     $self->{s_kwd} = '';
1978     }
1979 wakaba 1.1 ## reconsume
1980    
1981     !!!emit ($self->{ct}); # comment
1982    
1983     redo A;
1984     } else {
1985     !!!cp (144);
1986     $self->{ct}->{data} # comment
1987     .= '-' . chr ($self->{nc});
1988     $self->{state} = COMMENT_STATE;
1989     !!!next-input-character;
1990     redo A;
1991     }
1992     } elsif ($self->{state} == COMMENT_STATE) {
1993 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
1994    
1995 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1996     !!!cp (145);
1997     $self->{state} = COMMENT_END_DASH_STATE;
1998     !!!next-input-character;
1999     redo A;
2000     } elsif ($self->{nc} == -1) {
2001     !!!parse-error (type => 'unclosed comment');
2002 wakaba 1.13 if ($self->{in_subset}) {
2003     !!!cp (146.1);
2004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005     } else {
2006     !!!cp (146);
2007     $self->{state} = DATA_STATE;
2008     $self->{s_kwd} = '';
2009     }
2010 wakaba 1.1 ## reconsume
2011    
2012     !!!emit ($self->{ct}); # comment
2013    
2014     redo A;
2015     } else {
2016     !!!cp (147);
2017     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2018     $self->{read_until}->($self->{ct}->{data},
2019     q[-],
2020     length $self->{ct}->{data});
2021    
2022     ## Stay in the state
2023     !!!next-input-character;
2024     redo A;
2025     }
2026     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2027 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2028 wakaba 1.10
2029 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2030     !!!cp (148);
2031     $self->{state} = COMMENT_END_STATE;
2032     !!!next-input-character;
2033     redo A;
2034     } elsif ($self->{nc} == -1) {
2035     !!!parse-error (type => 'unclosed comment');
2036 wakaba 1.13 if ($self->{in_subset}) {
2037     !!!cp (149.1);
2038     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2039     } else {
2040     !!!cp (149);
2041     $self->{state} = DATA_STATE;
2042     $self->{s_kwd} = '';
2043     }
2044 wakaba 1.1 ## reconsume
2045    
2046     !!!emit ($self->{ct}); # comment
2047    
2048     redo A;
2049     } else {
2050     !!!cp (150);
2051     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2052     $self->{state} = COMMENT_STATE;
2053     !!!next-input-character;
2054     redo A;
2055     }
2056     } elsif ($self->{state} == COMMENT_END_STATE) {
2057 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2058    
2059 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2060 wakaba 1.13 if ($self->{in_subset}) {
2061     !!!cp (151.1);
2062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2063     } else {
2064     !!!cp (151);
2065     $self->{state} = DATA_STATE;
2066     $self->{s_kwd} = '';
2067     }
2068 wakaba 1.1 !!!next-input-character;
2069    
2070     !!!emit ($self->{ct}); # comment
2071    
2072     redo A;
2073     } elsif ($self->{nc} == 0x002D) { # -
2074     !!!cp (152);
2075 wakaba 1.10 ## XML5: Not a parse error.
2076 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2077     line => $self->{line_prev},
2078     column => $self->{column_prev});
2079     $self->{ct}->{data} .= '-'; # comment
2080     ## Stay in the state
2081     !!!next-input-character;
2082     redo A;
2083     } elsif ($self->{nc} == -1) {
2084     !!!parse-error (type => 'unclosed comment');
2085 wakaba 1.13 if ($self->{in_subset}) {
2086     !!!cp (153.1);
2087     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2088     } else {
2089     !!!cp (153);
2090     $self->{state} = DATA_STATE;
2091     $self->{s_kwd} = '';
2092     }
2093 wakaba 1.1 ## reconsume
2094    
2095     !!!emit ($self->{ct}); # comment
2096    
2097     redo A;
2098     } else {
2099     !!!cp (154);
2100 wakaba 1.10 ## XML5: Not a parse error.
2101 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2102     line => $self->{line_prev},
2103     column => $self->{column_prev});
2104     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2105     $self->{state} = COMMENT_STATE;
2106     !!!next-input-character;
2107     redo A;
2108     }
2109     } elsif ($self->{state} == DOCTYPE_STATE) {
2110     if ($is_space->{$self->{nc}}) {
2111     !!!cp (155);
2112     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2113     !!!next-input-character;
2114     redo A;
2115     } else {
2116     !!!cp (156);
2117 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2118 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2119     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2120     ## reconsume
2121     redo A;
2122     }
2123     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2124 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2125    
2126 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2127     !!!cp (157);
2128     ## Stay in the state
2129     !!!next-input-character;
2130     redo A;
2131     } elsif ($self->{nc} == 0x003E) { # >
2132     !!!cp (158);
2133 wakaba 1.12 ## XML5: No parse error.
2134 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2135     $self->{state} = DATA_STATE;
2136 wakaba 1.5 $self->{s_kwd} = '';
2137 wakaba 1.1 !!!next-input-character;
2138    
2139     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2140    
2141     redo A;
2142     } elsif ($self->{nc} == -1) {
2143     !!!cp (159);
2144     !!!parse-error (type => 'no DOCTYPE name');
2145     $self->{state} = DATA_STATE;
2146 wakaba 1.5 $self->{s_kwd} = '';
2147 wakaba 1.1 ## reconsume
2148    
2149     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2150    
2151     redo A;
2152 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2153     !!!cp (159.1);
2154     !!!parse-error (type => 'no DOCTYPE name');
2155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2156 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2157     $self->{in_subset} = 1;
2158 wakaba 1.12 !!!next-input-character;
2159 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2160 wakaba 1.12 redo A;
2161 wakaba 1.1 } else {
2162     !!!cp (160);
2163     $self->{ct}->{name} = chr $self->{nc};
2164     delete $self->{ct}->{quirks};
2165     $self->{state} = DOCTYPE_NAME_STATE;
2166     !!!next-input-character;
2167     redo A;
2168     }
2169     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2170 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2171    
2172     ## ISSUE: Redundant "First," in the spec.
2173    
2174 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2175     !!!cp (161);
2176     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2177     !!!next-input-character;
2178     redo A;
2179     } elsif ($self->{nc} == 0x003E) { # >
2180     !!!cp (162);
2181     $self->{state} = DATA_STATE;
2182 wakaba 1.5 $self->{s_kwd} = '';
2183 wakaba 1.1 !!!next-input-character;
2184    
2185     !!!emit ($self->{ct}); # DOCTYPE
2186    
2187     redo A;
2188     } elsif ($self->{nc} == -1) {
2189     !!!cp (163);
2190     !!!parse-error (type => 'unclosed DOCTYPE');
2191     $self->{state} = DATA_STATE;
2192 wakaba 1.5 $self->{s_kwd} = '';
2193 wakaba 1.1 ## reconsume
2194    
2195     $self->{ct}->{quirks} = 1;
2196     !!!emit ($self->{ct}); # DOCTYPE
2197    
2198     redo A;
2199 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2200     !!!cp (163.1);
2201     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2202 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2203     $self->{in_subset} = 1;
2204 wakaba 1.12 !!!next-input-character;
2205 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2206 wakaba 1.12 redo A;
2207 wakaba 1.1 } else {
2208     !!!cp (164);
2209     $self->{ct}->{name}
2210     .= chr ($self->{nc}); # DOCTYPE
2211     ## Stay in the state
2212     !!!next-input-character;
2213     redo A;
2214     }
2215     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2216 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2217     ## state", but implemented differently.
2218    
2219 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2220     !!!cp (165);
2221     ## Stay in the state
2222     !!!next-input-character;
2223     redo A;
2224     } elsif ($self->{nc} == 0x003E) { # >
2225 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2226     !!!cp (166);
2227     $self->{state} = DATA_STATE;
2228     $self->{s_kwd} = '';
2229     } else {
2230     !!!cp (166.1);
2231     !!!parse-error (type => 'no md def'); ## TODO: type
2232     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2233     }
2234    
2235 wakaba 1.1 !!!next-input-character;
2236 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2237 wakaba 1.1 redo A;
2238     } elsif ($self->{nc} == -1) {
2239 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2240     !!!cp (167);
2241     !!!parse-error (type => 'unclosed DOCTYPE');
2242     $self->{state} = DATA_STATE;
2243     $self->{s_kwd} = '';
2244     $self->{ct}->{quirks} = 1;
2245     } else {
2246     !!!cp (167.12);
2247     !!!parse-error (type => 'unclosed md'); ## TODO: type
2248     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2249     }
2250    
2251     ## Reconsume.
2252     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2253 wakaba 1.1 redo A;
2254     } elsif ($self->{nc} == 0x0050 or # P
2255     $self->{nc} == 0x0070) { # p
2256 wakaba 1.12 !!!cp (167.1);
2257 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2258 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2259 wakaba 1.1 !!!next-input-character;
2260     redo A;
2261     } elsif ($self->{nc} == 0x0053 or # S
2262     $self->{nc} == 0x0073) { # s
2263 wakaba 1.12 !!!cp (167.2);
2264 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2265 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2266     !!!next-input-character;
2267     redo A;
2268 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2269     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271     !!!cp (167.21);
2272     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273     $self->{ct}->{value} = ''; # ENTITY
2274     !!!next-input-character;
2275     redo A;
2276     } elsif ($self->{nc} == 0x0027 and # '
2277     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279     !!!cp (167.22);
2280     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281     $self->{ct}->{value} = ''; # ENTITY
2282     !!!next-input-character;
2283     redo A;
2284 wakaba 1.16 } elsif ($self->{is_xml} and
2285     $self->{ct}->{type} == DOCTYPE_TOKEN and
2286     $self->{nc} == 0x005B) { # [
2287 wakaba 1.12 !!!cp (167.3);
2288     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2289     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2290 wakaba 1.13 $self->{in_subset} = 1;
2291 wakaba 1.1 !!!next-input-character;
2292 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2293 wakaba 1.1 redo A;
2294     } else {
2295 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2296    
2297     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2298     !!!cp (180);
2299     $self->{ct}->{quirks} = 1;
2300     $self->{state} = BOGUS_DOCTYPE_STATE;
2301     } else {
2302     !!!cp (180.1);
2303     $self->{state} = BOGUS_MD_STATE;
2304     }
2305 wakaba 1.1
2306     !!!next-input-character;
2307     redo A;
2308     }
2309     } elsif ($self->{state} == PUBLIC_STATE) {
2310     ## ASCII case-insensitive
2311     if ($self->{nc} == [
2312     undef,
2313     0x0055, # U
2314     0x0042, # B
2315     0x004C, # L
2316     0x0049, # I
2317 wakaba 1.12 ]->[length $self->{kwd}] or
2318 wakaba 1.1 $self->{nc} == [
2319     undef,
2320     0x0075, # u
2321     0x0062, # b
2322     0x006C, # l
2323     0x0069, # i
2324 wakaba 1.12 ]->[length $self->{kwd}]) {
2325 wakaba 1.1 !!!cp (175);
2326     ## Stay in the state.
2327 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2328 wakaba 1.1 !!!next-input-character;
2329     redo A;
2330 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2331 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2332     $self->{nc} == 0x0063)) { # c
2333 wakaba 1.12 if ($self->{is_xml} and
2334     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2335     !!!cp (168.1);
2336     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2337     text => 'PUBLIC',
2338     line => $self->{line_prev},
2339     column => $self->{column_prev} - 4);
2340     } else {
2341     !!!cp (168);
2342     }
2343 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2344     !!!next-input-character;
2345     redo A;
2346     } else {
2347 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2348 wakaba 1.1 line => $self->{line_prev},
2349 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2350 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2351     !!!cp (169);
2352     $self->{ct}->{quirks} = 1;
2353     $self->{state} = BOGUS_DOCTYPE_STATE;
2354     } else {
2355     !!!cp (169.1);
2356     $self->{state} = BOGUS_MD_STATE;
2357     }
2358 wakaba 1.1 ## Reconsume.
2359     redo A;
2360     }
2361     } elsif ($self->{state} == SYSTEM_STATE) {
2362     ## ASCII case-insensitive
2363     if ($self->{nc} == [
2364     undef,
2365     0x0059, # Y
2366     0x0053, # S
2367     0x0054, # T
2368     0x0045, # E
2369 wakaba 1.12 ]->[length $self->{kwd}] or
2370 wakaba 1.1 $self->{nc} == [
2371     undef,
2372     0x0079, # y
2373     0x0073, # s
2374     0x0074, # t
2375     0x0065, # e
2376 wakaba 1.12 ]->[length $self->{kwd}]) {
2377 wakaba 1.1 !!!cp (170);
2378     ## Stay in the state.
2379 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2380 wakaba 1.1 !!!next-input-character;
2381     redo A;
2382 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2383 wakaba 1.1 ($self->{nc} == 0x004D or # M
2384     $self->{nc} == 0x006D)) { # m
2385 wakaba 1.12 if ($self->{is_xml} and
2386     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2387     !!!cp (171.1);
2388     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2389     text => 'SYSTEM',
2390     line => $self->{line_prev},
2391     column => $self->{column_prev} - 4);
2392     } else {
2393     !!!cp (171);
2394     }
2395 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2396     !!!next-input-character;
2397     redo A;
2398     } else {
2399 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2400 wakaba 1.1 line => $self->{line_prev},
2401 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2402 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2403     !!!cp (172);
2404     $self->{ct}->{quirks} = 1;
2405     $self->{state} = BOGUS_DOCTYPE_STATE;
2406     } else {
2407     !!!cp (172.1);
2408     $self->{state} = BOGUS_MD_STATE;
2409     }
2410 wakaba 1.1 ## Reconsume.
2411     redo A;
2412     }
2413     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2414     if ($is_space->{$self->{nc}}) {
2415     !!!cp (181);
2416     ## Stay in the state
2417     !!!next-input-character;
2418     redo A;
2419     } elsif ($self->{nc} eq 0x0022) { # "
2420     !!!cp (182);
2421     $self->{ct}->{pubid} = ''; # DOCTYPE
2422     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2423     !!!next-input-character;
2424     redo A;
2425     } elsif ($self->{nc} eq 0x0027) { # '
2426     !!!cp (183);
2427     $self->{ct}->{pubid} = ''; # DOCTYPE
2428     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2429     !!!next-input-character;
2430     redo A;
2431     } elsif ($self->{nc} eq 0x003E) { # >
2432     !!!parse-error (type => 'no PUBLIC literal');
2433 wakaba 1.16
2434     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2435     !!!cp (184);
2436     $self->{state} = DATA_STATE;
2437     $self->{s_kwd} = '';
2438     $self->{ct}->{quirks} = 1;
2439     } else {
2440     !!!cp (184.1);
2441     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2442     }
2443    
2444 wakaba 1.1 !!!next-input-character;
2445 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2446 wakaba 1.1 redo A;
2447     } elsif ($self->{nc} == -1) {
2448 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2449     !!!cp (185);
2450     !!!parse-error (type => 'unclosed DOCTYPE');
2451     $self->{state} = DATA_STATE;
2452     $self->{s_kwd} = '';
2453     $self->{ct}->{quirks} = 1;
2454     } else {
2455     !!!cp (185.1);
2456     !!!parse-error (type => 'unclosed md'); ## TODO: type
2457     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2458     }
2459    
2460 wakaba 1.1 ## reconsume
2461     !!!emit ($self->{ct}); # DOCTYPE
2462     redo A;
2463 wakaba 1.16 } elsif ($self->{is_xml} and
2464     $self->{ct}->{type} == DOCTYPE_TOKEN and
2465     $self->{nc} == 0x005B) { # [
2466 wakaba 1.12 !!!cp (186.1);
2467     !!!parse-error (type => 'no PUBLIC literal');
2468     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2469     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2470 wakaba 1.13 $self->{in_subset} = 1;
2471 wakaba 1.12 !!!next-input-character;
2472 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2473 wakaba 1.12 redo A;
2474 wakaba 1.1 } else {
2475     !!!parse-error (type => 'string after PUBLIC');
2476    
2477 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2478     !!!cp (186);
2479     $self->{ct}->{quirks} = 1;
2480     $self->{state} = BOGUS_DOCTYPE_STATE;
2481     } else {
2482     !!!cp (186.2);
2483     $self->{state} = BOGUS_MD_STATE;
2484     }
2485    
2486 wakaba 1.1 !!!next-input-character;
2487     redo A;
2488     }
2489     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2490     if ($self->{nc} == 0x0022) { # "
2491     !!!cp (187);
2492     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2493     !!!next-input-character;
2494     redo A;
2495     } elsif ($self->{nc} == 0x003E) { # >
2496     !!!parse-error (type => 'unclosed PUBLIC literal');
2497    
2498 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499     !!!cp (188);
2500     $self->{state} = DATA_STATE;
2501     $self->{s_kwd} = '';
2502     $self->{ct}->{quirks} = 1;
2503     } else {
2504     !!!cp (188.1);
2505     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2506     }
2507    
2508 wakaba 1.1 !!!next-input-character;
2509 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2510 wakaba 1.1 redo A;
2511     } elsif ($self->{nc} == -1) {
2512     !!!parse-error (type => 'unclosed PUBLIC literal');
2513    
2514 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2515     !!!cp (189);
2516     $self->{state} = DATA_STATE;
2517     $self->{s_kwd} = '';
2518     $self->{ct}->{quirks} = 1;
2519     } else {
2520     !!!cp (189.1);
2521     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2522     }
2523    
2524     ## Reconsume.
2525 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2526     redo A;
2527     } else {
2528     !!!cp (190);
2529 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2530 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2531     length $self->{ct}->{pubid});
2532    
2533     ## Stay in the state
2534     !!!next-input-character;
2535     redo A;
2536     }
2537     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2538     if ($self->{nc} == 0x0027) { # '
2539     !!!cp (191);
2540     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2541     !!!next-input-character;
2542     redo A;
2543     } elsif ($self->{nc} == 0x003E) { # >
2544     !!!parse-error (type => 'unclosed PUBLIC literal');
2545    
2546 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547     !!!cp (192);
2548     $self->{state} = DATA_STATE;
2549     $self->{s_kwd} = '';
2550     $self->{ct}->{quirks} = 1;
2551     } else {
2552     !!!cp (192.1);
2553     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2554     }
2555    
2556 wakaba 1.1 !!!next-input-character;
2557 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2558 wakaba 1.1 redo A;
2559     } elsif ($self->{nc} == -1) {
2560     !!!parse-error (type => 'unclosed PUBLIC literal');
2561    
2562 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563     !!!cp (193);
2564     $self->{state} = DATA_STATE;
2565     $self->{s_kwd} = '';
2566     $self->{ct}->{quirks} = 1;
2567     } else {
2568     !!!cp (193.1);
2569     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2570     }
2571    
2572 wakaba 1.1 ## reconsume
2573 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2574 wakaba 1.1 redo A;
2575     } else {
2576     !!!cp (194);
2577 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2578 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2579     length $self->{ct}->{pubid});
2580    
2581     ## Stay in the state
2582     !!!next-input-character;
2583     redo A;
2584     }
2585     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2586     if ($is_space->{$self->{nc}}) {
2587     !!!cp (195);
2588     ## Stay in the state
2589     !!!next-input-character;
2590     redo A;
2591     } elsif ($self->{nc} == 0x0022) { # "
2592     !!!cp (196);
2593 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2594 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595     !!!next-input-character;
2596     redo A;
2597     } elsif ($self->{nc} == 0x0027) { # '
2598     !!!cp (197);
2599 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2600 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601     !!!next-input-character;
2602     redo A;
2603     } elsif ($self->{nc} == 0x003E) { # >
2604 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2605     if ($self->{is_xml}) {
2606     !!!cp (198.1);
2607     !!!parse-error (type => 'no SYSTEM literal');
2608     } else {
2609     !!!cp (198);
2610     }
2611     $self->{state} = DATA_STATE;
2612     $self->{s_kwd} = '';
2613 wakaba 1.12 } else {
2614 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2615     !!!cp (198.2);
2616     } else {
2617     !!!cp (198.3);
2618     !!!parse-error (type => 'no SYSTEM literal');
2619     }
2620     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2621 wakaba 1.12 }
2622 wakaba 1.16
2623 wakaba 1.1 !!!next-input-character;
2624 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2625 wakaba 1.1 redo A;
2626     } elsif ($self->{nc} == -1) {
2627 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2628     !!!cp (199);
2629     !!!parse-error (type => 'unclosed DOCTYPE');
2630    
2631     $self->{state} = DATA_STATE;
2632     $self->{s_kwd} = '';
2633     $self->{ct}->{quirks} = 1;
2634     } else {
2635     !!!parse-error (type => 'unclosed md'); ## TODO: type
2636     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2637     }
2638    
2639 wakaba 1.1 ## reconsume
2640 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2641 wakaba 1.1 redo A;
2642 wakaba 1.16 } elsif ($self->{is_xml} and
2643     $self->{ct}->{type} == DOCTYPE_TOKEN and
2644     $self->{nc} == 0x005B) { # [
2645 wakaba 1.12 !!!cp (200.1);
2646     !!!parse-error (type => 'no SYSTEM literal');
2647     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2648     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2649 wakaba 1.13 $self->{in_subset} = 1;
2650 wakaba 1.12 !!!next-input-character;
2651 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2652 wakaba 1.12 redo A;
2653 wakaba 1.1 } else {
2654     !!!parse-error (type => 'string after PUBLIC literal');
2655    
2656 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2657     !!!cp (200);
2658     $self->{ct}->{quirks} = 1;
2659     $self->{state} = BOGUS_DOCTYPE_STATE;
2660     } else {
2661     !!!cp (200.2);
2662     $self->{state} = BOGUS_MD_STATE;
2663     }
2664    
2665 wakaba 1.1 !!!next-input-character;
2666     redo A;
2667     }
2668     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2669     if ($is_space->{$self->{nc}}) {
2670     !!!cp (201);
2671     ## Stay in the state
2672     !!!next-input-character;
2673     redo A;
2674     } elsif ($self->{nc} == 0x0022) { # "
2675     !!!cp (202);
2676     $self->{ct}->{sysid} = ''; # DOCTYPE
2677     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2678     !!!next-input-character;
2679     redo A;
2680     } elsif ($self->{nc} == 0x0027) { # '
2681     !!!cp (203);
2682     $self->{ct}->{sysid} = ''; # DOCTYPE
2683     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2684     !!!next-input-character;
2685     redo A;
2686     } elsif ($self->{nc} == 0x003E) { # >
2687     !!!parse-error (type => 'no SYSTEM literal');
2688     !!!next-input-character;
2689    
2690 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2691     !!!cp (204);
2692     $self->{state} = DATA_STATE;
2693     $self->{s_kwd} = '';
2694     $self->{ct}->{quirks} = 1;
2695     } else {
2696     !!!cp (204.1);
2697     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2698     }
2699 wakaba 1.1
2700 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2701 wakaba 1.1 redo A;
2702     } elsif ($self->{nc} == -1) {
2703 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2704     !!!cp (205);
2705     !!!parse-error (type => 'unclosed DOCTYPE');
2706     $self->{state} = DATA_STATE;
2707     $self->{s_kwd} = '';
2708     $self->{ct}->{quirks} = 1;
2709     } else {
2710     !!!cp (205.1);
2711     !!!parse-error (type => 'unclosed md'); ## TODO: type
2712     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2713     }
2714    
2715 wakaba 1.1 ## reconsume
2716 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2717 wakaba 1.1 redo A;
2718 wakaba 1.16 } elsif ($self->{is_xml} and
2719     $self->{ct}->{type} == DOCTYPE_TOKEN and
2720     $self->{nc} == 0x005B) { # [
2721 wakaba 1.12 !!!cp (206.1);
2722     !!!parse-error (type => 'no SYSTEM literal');
2723    
2724     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2726 wakaba 1.13 $self->{in_subset} = 1;
2727 wakaba 1.12 !!!next-input-character;
2728 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2729 wakaba 1.12 redo A;
2730 wakaba 1.1 } else {
2731     !!!parse-error (type => 'string after SYSTEM');
2732    
2733 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2734     !!!cp (206);
2735     $self->{ct}->{quirks} = 1;
2736     $self->{state} = BOGUS_DOCTYPE_STATE;
2737     } else {
2738     !!!cp (206.2);
2739     $self->{state} = BOGUS_MD_STATE;
2740     }
2741    
2742 wakaba 1.1 !!!next-input-character;
2743     redo A;
2744     }
2745     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2746     if ($self->{nc} == 0x0022) { # "
2747     !!!cp (207);
2748     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2749     !!!next-input-character;
2750     redo A;
2751 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2752 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2753    
2754 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2755     !!!cp (208);
2756     $self->{state} = DATA_STATE;
2757     $self->{s_kwd} = '';
2758     $self->{ct}->{quirks} = 1;
2759     } else {
2760     !!!cp (208.1);
2761     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2762     }
2763    
2764 wakaba 1.1 !!!next-input-character;
2765 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2766 wakaba 1.1 redo A;
2767     } elsif ($self->{nc} == -1) {
2768     !!!parse-error (type => 'unclosed SYSTEM literal');
2769    
2770 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2771     !!!cp (209);
2772     $self->{state} = DATA_STATE;
2773     $self->{s_kwd} = '';
2774     $self->{ct}->{quirks} = 1;
2775     } else {
2776     !!!cp (209.1);
2777     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2778     }
2779    
2780 wakaba 1.1 ## reconsume
2781 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2782 wakaba 1.1 redo A;
2783     } else {
2784     !!!cp (210);
2785 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2786 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2787     length $self->{ct}->{sysid});
2788    
2789     ## Stay in the state
2790     !!!next-input-character;
2791     redo A;
2792     }
2793     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2794     if ($self->{nc} == 0x0027) { # '
2795     !!!cp (211);
2796     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2797     !!!next-input-character;
2798     redo A;
2799 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2800 wakaba 1.1 !!!cp (212);
2801     !!!parse-error (type => 'unclosed SYSTEM literal');
2802    
2803     $self->{state} = DATA_STATE;
2804 wakaba 1.5 $self->{s_kwd} = '';
2805 wakaba 1.1 !!!next-input-character;
2806    
2807     $self->{ct}->{quirks} = 1;
2808     !!!emit ($self->{ct}); # DOCTYPE
2809    
2810     redo A;
2811     } elsif ($self->{nc} == -1) {
2812     !!!parse-error (type => 'unclosed SYSTEM literal');
2813    
2814 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815     !!!cp (213);
2816     $self->{state} = DATA_STATE;
2817     $self->{s_kwd} = '';
2818     $self->{ct}->{quirks} = 1;
2819     } else {
2820     !!!cp (213.1);
2821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822     }
2823    
2824 wakaba 1.1 ## reconsume
2825 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2826 wakaba 1.1 redo A;
2827     } else {
2828     !!!cp (214);
2829 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2830 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2831     length $self->{ct}->{sysid});
2832    
2833     ## Stay in the state
2834     !!!next-input-character;
2835     redo A;
2836     }
2837     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838     if ($is_space->{$self->{nc}}) {
2839 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840     !!!cp (215.1);
2841     $self->{state} = BEFORE_NDATA_STATE;
2842     } else {
2843     !!!cp (215);
2844     ## Stay in the state
2845     }
2846 wakaba 1.1 !!!next-input-character;
2847     redo A;
2848     } elsif ($self->{nc} == 0x003E) { # >
2849 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2850     !!!cp (216);
2851     $self->{state} = DATA_STATE;
2852     $self->{s_kwd} = '';
2853     } else {
2854     !!!cp (216.1);
2855     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856     }
2857    
2858 wakaba 1.1 !!!next-input-character;
2859 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860 wakaba 1.1 redo A;
2861 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862     ($self->{nc} == 0x004E or # N
2863     $self->{nc} == 0x006E)) { # n
2864     !!!cp (216.2);
2865     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866     $self->{state} = NDATA_STATE;
2867     $self->{kwd} = chr $self->{nc};
2868     !!!next-input-character;
2869     redo A;
2870 wakaba 1.1 } elsif ($self->{nc} == -1) {
2871 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872     !!!cp (217);
2873     !!!parse-error (type => 'unclosed DOCTYPE');
2874     $self->{state} = DATA_STATE;
2875     $self->{s_kwd} = '';
2876     $self->{ct}->{quirks} = 1;
2877     } else {
2878     !!!cp (217.1);
2879     !!!parse-error (type => 'unclosed md'); ## TODO: type
2880     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2881     }
2882    
2883 wakaba 1.1 ## reconsume
2884 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2885 wakaba 1.1 redo A;
2886 wakaba 1.16 } elsif ($self->{is_xml} and
2887     $self->{ct}->{type} == DOCTYPE_TOKEN and
2888     $self->{nc} == 0x005B) { # [
2889 wakaba 1.12 !!!cp (218.1);
2890     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2892 wakaba 1.13 $self->{in_subset} = 1;
2893 wakaba 1.12 !!!next-input-character;
2894 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2895 wakaba 1.12 redo A;
2896 wakaba 1.1 } else {
2897     !!!parse-error (type => 'string after SYSTEM literal');
2898    
2899 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900     !!!cp (218);
2901     #$self->{ct}->{quirks} = 1;
2902     $self->{state} = BOGUS_DOCTYPE_STATE;
2903     } else {
2904     !!!cp (218.2);
2905     $self->{state} = BOGUS_MD_STATE;
2906     }
2907    
2908 wakaba 1.1 !!!next-input-character;
2909     redo A;
2910     }
2911 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912     if ($is_space->{$self->{nc}}) {
2913     !!!cp (218.3);
2914     ## Stay in the state.
2915     !!!next-input-character;
2916     redo A;
2917     } elsif ($self->{nc} == 0x003E) { # >
2918     !!!cp (218.4);
2919     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920     !!!next-input-character;
2921     !!!emit ($self->{ct}); # ENTITY
2922     redo A;
2923     } elsif ($self->{nc} == 0x004E or # N
2924     $self->{nc} == 0x006E) { # n
2925     !!!cp (218.5);
2926     $self->{state} = NDATA_STATE;
2927     $self->{kwd} = chr $self->{nc};
2928     !!!next-input-character;
2929     redo A;
2930     } elsif ($self->{nc} == -1) {
2931     !!!cp (218.6);
2932     !!!parse-error (type => 'unclosed md'); ## TODO: type
2933     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934     ## reconsume
2935     !!!emit ($self->{ct}); # ENTITY
2936     redo A;
2937     } else {
2938     !!!cp (218.7);
2939     !!!parse-error (type => 'string after SYSTEM literal');
2940     $self->{state} = BOGUS_MD_STATE;
2941     !!!next-input-character;
2942     redo A;
2943     }
2944 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2945     if ($self->{nc} == 0x003E) { # >
2946     !!!cp (219);
2947     $self->{state} = DATA_STATE;
2948 wakaba 1.5 $self->{s_kwd} = '';
2949 wakaba 1.1 !!!next-input-character;
2950    
2951     !!!emit ($self->{ct}); # DOCTYPE
2952    
2953     redo A;
2954 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2955 wakaba 1.13 !!!cp (220.1);
2956     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2957     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2958     $self->{in_subset} = 1;
2959     !!!next-input-character;
2960     !!!emit ($self->{ct}); # DOCTYPE
2961     redo A;
2962 wakaba 1.1 } elsif ($self->{nc} == -1) {
2963     !!!cp (220);
2964     $self->{state} = DATA_STATE;
2965 wakaba 1.5 $self->{s_kwd} = '';
2966 wakaba 1.1 ## reconsume
2967    
2968     !!!emit ($self->{ct}); # DOCTYPE
2969    
2970     redo A;
2971     } else {
2972     !!!cp (221);
2973     my $s = '';
2974 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2975 wakaba 1.1
2976     ## Stay in the state
2977     !!!next-input-character;
2978     redo A;
2979     }
2980     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2981     ## NOTE: "CDATA section state" in the spec is jointly implemented
2982     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2983     ## and |CDATA_SECTION_MSE2_STATE|.
2984 wakaba 1.10
2985     ## XML5: "CDATA state".
2986 wakaba 1.1
2987     if ($self->{nc} == 0x005D) { # ]
2988     !!!cp (221.1);
2989     $self->{state} = CDATA_SECTION_MSE1_STATE;
2990     !!!next-input-character;
2991     redo A;
2992     } elsif ($self->{nc} == -1) {
2993 wakaba 1.6 if ($self->{is_xml}) {
2994 wakaba 1.8 !!!cp (221.11);
2995 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2996 wakaba 1.8 } else {
2997     !!!cp (221.12);
2998 wakaba 1.6 }
2999    
3000 wakaba 1.1 $self->{state} = DATA_STATE;
3001 wakaba 1.5 $self->{s_kwd} = '';
3002 wakaba 1.10 ## Reconsume.
3003 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3004     !!!cp (221.2);
3005     !!!emit ($self->{ct}); # character
3006     } else {
3007     !!!cp (221.3);
3008     ## No token to emit. $self->{ct} is discarded.
3009     }
3010     redo A;
3011     } else {
3012     !!!cp (221.4);
3013     $self->{ct}->{data} .= chr $self->{nc};
3014     $self->{read_until}->($self->{ct}->{data},
3015     q<]>,
3016     length $self->{ct}->{data});
3017    
3018     ## Stay in the state.
3019     !!!next-input-character;
3020     redo A;
3021     }
3022    
3023     ## ISSUE: "text tokens" in spec.
3024     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3025 wakaba 1.10 ## XML5: "CDATA bracket state".
3026    
3027 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3028     !!!cp (221.5);
3029     $self->{state} = CDATA_SECTION_MSE2_STATE;
3030     !!!next-input-character;
3031     redo A;
3032     } else {
3033     !!!cp (221.6);
3034 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3035 wakaba 1.1 $self->{ct}->{data} .= ']';
3036 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3037 wakaba 1.1 ## Reconsume.
3038     redo A;
3039     }
3040     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3041 wakaba 1.10 ## XML5: "CDATA end state".
3042    
3043 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3044     $self->{state} = DATA_STATE;
3045 wakaba 1.5 $self->{s_kwd} = '';
3046 wakaba 1.1 !!!next-input-character;
3047     if (length $self->{ct}->{data}) { # character
3048     !!!cp (221.7);
3049     !!!emit ($self->{ct}); # character
3050     } else {
3051     !!!cp (221.8);
3052     ## No token to emit. $self->{ct} is discarded.
3053     }
3054     redo A;
3055     } elsif ($self->{nc} == 0x005D) { # ]
3056     !!!cp (221.9); # character
3057     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3058     ## Stay in the state.
3059     !!!next-input-character;
3060     redo A;
3061     } else {
3062     !!!cp (221.11);
3063     $self->{ct}->{data} .= ']]'; # character
3064     $self->{state} = CDATA_SECTION_STATE;
3065 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3066 wakaba 1.1 redo A;
3067     }
3068     } elsif ($self->{state} == ENTITY_STATE) {
3069     if ($is_space->{$self->{nc}} or
3070     {
3071     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3072     $self->{entity_add} => 1,
3073     }->{$self->{nc}}) {
3074     !!!cp (1001);
3075     ## Don't consume
3076     ## No error
3077     ## Return nothing.
3078     #
3079     } elsif ($self->{nc} == 0x0023) { # #
3080     !!!cp (999);
3081     $self->{state} = ENTITY_HASH_STATE;
3082 wakaba 1.12 $self->{kwd} = '#';
3083 wakaba 1.1 !!!next-input-character;
3084     redo A;
3085     } elsif ((0x0041 <= $self->{nc} and
3086     $self->{nc} <= 0x005A) or # A..Z
3087     (0x0061 <= $self->{nc} and
3088     $self->{nc} <= 0x007A)) { # a..z
3089     !!!cp (998);
3090     require Whatpm::_NamedEntityList;
3091     $self->{state} = ENTITY_NAME_STATE;
3092 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3093     $self->{entity__value} = $self->{kwd};
3094 wakaba 1.1 $self->{entity__match} = 0;
3095     !!!next-input-character;
3096     redo A;
3097     } else {
3098     !!!cp (1027);
3099     !!!parse-error (type => 'bare ero');
3100     ## Return nothing.
3101     #
3102     }
3103    
3104     ## NOTE: No character is consumed by the "consume a character
3105     ## reference" algorithm. In other words, there is an "&" character
3106     ## that does not introduce a character reference, which would be
3107     ## appended to the parent element or the attribute value in later
3108     ## processing by the tokenizer.
3109    
3110     if ($self->{prev_state} == DATA_STATE) {
3111     !!!cp (997);
3112     $self->{state} = $self->{prev_state};
3113 wakaba 1.5 $self->{s_kwd} = '';
3114 wakaba 1.1 ## Reconsume.
3115     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3116     line => $self->{line_prev},
3117     column => $self->{column_prev},
3118     });
3119     redo A;
3120     } else {
3121     !!!cp (996);
3122     $self->{ca}->{value} .= '&';
3123     $self->{state} = $self->{prev_state};
3124 wakaba 1.5 $self->{s_kwd} = '';
3125 wakaba 1.1 ## Reconsume.
3126     redo A;
3127     }
3128     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3129     if ($self->{nc} == 0x0078 or # x
3130     $self->{nc} == 0x0058) { # X
3131     !!!cp (995);
3132     $self->{state} = HEXREF_X_STATE;
3133 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3134 wakaba 1.1 !!!next-input-character;
3135     redo A;
3136     } elsif (0x0030 <= $self->{nc} and
3137     $self->{nc} <= 0x0039) { # 0..9
3138     !!!cp (994);
3139     $self->{state} = NCR_NUM_STATE;
3140 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3141 wakaba 1.1 !!!next-input-character;
3142     redo A;
3143     } else {
3144     !!!parse-error (type => 'bare nero',
3145     line => $self->{line_prev},
3146     column => $self->{column_prev} - 1);
3147    
3148     ## NOTE: According to the spec algorithm, nothing is returned,
3149     ## and then "&#" is appended to the parent element or the attribute
3150     ## value in the later processing.
3151    
3152     if ($self->{prev_state} == DATA_STATE) {
3153     !!!cp (1019);
3154     $self->{state} = $self->{prev_state};
3155 wakaba 1.5 $self->{s_kwd} = '';
3156 wakaba 1.1 ## Reconsume.
3157     !!!emit ({type => CHARACTER_TOKEN,
3158     data => '&#',
3159     line => $self->{line_prev},
3160     column => $self->{column_prev} - 1,
3161     });
3162     redo A;
3163     } else {
3164     !!!cp (993);
3165     $self->{ca}->{value} .= '&#';
3166     $self->{state} = $self->{prev_state};
3167 wakaba 1.5 $self->{s_kwd} = '';
3168 wakaba 1.1 ## Reconsume.
3169     redo A;
3170     }
3171     }
3172     } elsif ($self->{state} == NCR_NUM_STATE) {
3173     if (0x0030 <= $self->{nc} and
3174     $self->{nc} <= 0x0039) { # 0..9
3175     !!!cp (1012);
3176 wakaba 1.12 $self->{kwd} *= 10;
3177     $self->{kwd} += $self->{nc} - 0x0030;
3178 wakaba 1.1
3179     ## Stay in the state.
3180     !!!next-input-character;
3181     redo A;
3182     } elsif ($self->{nc} == 0x003B) { # ;
3183     !!!cp (1013);
3184     !!!next-input-character;
3185     #
3186     } else {
3187     !!!cp (1014);
3188     !!!parse-error (type => 'no refc');
3189     ## Reconsume.
3190     #
3191     }
3192    
3193 wakaba 1.12 my $code = $self->{kwd};
3194 wakaba 1.1 my $l = $self->{line_prev};
3195     my $c = $self->{column_prev};
3196     if ($charref_map->{$code}) {
3197     !!!cp (1015);
3198     !!!parse-error (type => 'invalid character reference',
3199     text => (sprintf 'U+%04X', $code),
3200     line => $l, column => $c);
3201     $code = $charref_map->{$code};
3202     } elsif ($code > 0x10FFFF) {
3203     !!!cp (1016);
3204     !!!parse-error (type => 'invalid character reference',
3205     text => (sprintf 'U-%08X', $code),
3206     line => $l, column => $c);
3207     $code = 0xFFFD;
3208     }
3209    
3210     if ($self->{prev_state} == DATA_STATE) {
3211     !!!cp (992);
3212     $self->{state} = $self->{prev_state};
3213 wakaba 1.5 $self->{s_kwd} = '';
3214 wakaba 1.1 ## Reconsume.
3215     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3216 wakaba 1.7 has_reference => 1,
3217 wakaba 1.1 line => $l, column => $c,
3218     });
3219     redo A;
3220     } else {
3221     !!!cp (991);
3222     $self->{ca}->{value} .= chr $code;
3223     $self->{ca}->{has_reference} = 1;
3224     $self->{state} = $self->{prev_state};
3225 wakaba 1.5 $self->{s_kwd} = '';
3226 wakaba 1.1 ## Reconsume.
3227     redo A;
3228     }
3229     } elsif ($self->{state} == HEXREF_X_STATE) {
3230     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3231     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3232     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3233     # 0..9, A..F, a..f
3234     !!!cp (990);
3235     $self->{state} = HEXREF_HEX_STATE;
3236 wakaba 1.12 $self->{kwd} = 0;
3237 wakaba 1.1 ## Reconsume.
3238     redo A;
3239     } else {
3240     !!!parse-error (type => 'bare hcro',
3241     line => $self->{line_prev},
3242     column => $self->{column_prev} - 2);
3243    
3244     ## NOTE: According to the spec algorithm, nothing is returned,
3245     ## and then "&#" followed by "X" or "x" is appended to the parent
3246     ## element or the attribute value in the later processing.
3247    
3248     if ($self->{prev_state} == DATA_STATE) {
3249     !!!cp (1005);
3250     $self->{state} = $self->{prev_state};
3251 wakaba 1.5 $self->{s_kwd} = '';
3252 wakaba 1.1 ## Reconsume.
3253     !!!emit ({type => CHARACTER_TOKEN,
3254 wakaba 1.12 data => '&' . $self->{kwd},
3255 wakaba 1.1 line => $self->{line_prev},
3256 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3257 wakaba 1.1 });
3258     redo A;
3259     } else {
3260     !!!cp (989);
3261 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3262 wakaba 1.1 $self->{state} = $self->{prev_state};
3263 wakaba 1.5 $self->{s_kwd} = '';
3264 wakaba 1.1 ## Reconsume.
3265     redo A;
3266     }
3267     }
3268     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3269     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3270     # 0..9
3271     !!!cp (1002);
3272 wakaba 1.12 $self->{kwd} *= 0x10;
3273     $self->{kwd} += $self->{nc} - 0x0030;
3274 wakaba 1.1 ## Stay in the state.
3275     !!!next-input-character;
3276     redo A;
3277     } elsif (0x0061 <= $self->{nc} and
3278     $self->{nc} <= 0x0066) { # a..f
3279     !!!cp (1003);
3280 wakaba 1.12 $self->{kwd} *= 0x10;
3281     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3282 wakaba 1.1 ## Stay in the state.
3283     !!!next-input-character;
3284     redo A;
3285     } elsif (0x0041 <= $self->{nc} and
3286     $self->{nc} <= 0x0046) { # A..F
3287     !!!cp (1004);
3288 wakaba 1.12 $self->{kwd} *= 0x10;
3289     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3290 wakaba 1.1 ## Stay in the state.
3291     !!!next-input-character;
3292     redo A;
3293     } elsif ($self->{nc} == 0x003B) { # ;
3294     !!!cp (1006);
3295     !!!next-input-character;
3296     #
3297     } else {
3298     !!!cp (1007);
3299     !!!parse-error (type => 'no refc',
3300     line => $self->{line},
3301     column => $self->{column});
3302     ## Reconsume.
3303     #
3304     }
3305    
3306 wakaba 1.12 my $code = $self->{kwd};
3307 wakaba 1.1 my $l = $self->{line_prev};
3308     my $c = $self->{column_prev};
3309     if ($charref_map->{$code}) {
3310     !!!cp (1008);
3311     !!!parse-error (type => 'invalid character reference',
3312     text => (sprintf 'U+%04X', $code),
3313     line => $l, column => $c);
3314     $code = $charref_map->{$code};
3315     } elsif ($code > 0x10FFFF) {
3316     !!!cp (1009);
3317     !!!parse-error (type => 'invalid character reference',
3318     text => (sprintf 'U-%08X', $code),
3319     line => $l, column => $c);
3320     $code = 0xFFFD;
3321     }
3322    
3323     if ($self->{prev_state} == DATA_STATE) {
3324     !!!cp (988);
3325     $self->{state} = $self->{prev_state};
3326 wakaba 1.5 $self->{s_kwd} = '';
3327 wakaba 1.1 ## Reconsume.
3328     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3329 wakaba 1.7 has_reference => 1,
3330 wakaba 1.1 line => $l, column => $c,
3331     });
3332     redo A;
3333     } else {
3334     !!!cp (987);
3335     $self->{ca}->{value} .= chr $code;
3336     $self->{ca}->{has_reference} = 1;
3337     $self->{state} = $self->{prev_state};
3338 wakaba 1.5 $self->{s_kwd} = '';
3339 wakaba 1.1 ## Reconsume.
3340     redo A;
3341     }
3342     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3343 wakaba 1.12 if (length $self->{kwd} < 30 and
3344 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
3345     ((0x0041 <= $self->{nc} and # a
3346     $self->{nc} <= 0x005A) or # x
3347     (0x0061 <= $self->{nc} and # a
3348     $self->{nc} <= 0x007A) or # z
3349     (0x0030 <= $self->{nc} and # 0
3350     $self->{nc} <= 0x0039) or # 9
3351     $self->{nc} == 0x003B)) { # ;
3352     our $EntityChar;
3353 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3354     if (defined $EntityChar->{$self->{kwd}}) {
3355 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3356     !!!cp (1020);
3357 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3358 wakaba 1.1 $self->{entity__match} = 1;
3359     !!!next-input-character;
3360     #
3361     } else {
3362     !!!cp (1021);
3363 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3364 wakaba 1.1 $self->{entity__match} = -1;
3365     ## Stay in the state.
3366     !!!next-input-character;
3367     redo A;
3368     }
3369     } else {
3370     !!!cp (1022);
3371     $self->{entity__value} .= chr $self->{nc};
3372     $self->{entity__match} *= 2;
3373     ## Stay in the state.
3374     !!!next-input-character;
3375     redo A;
3376     }
3377     }
3378    
3379     my $data;
3380     my $has_ref;
3381     if ($self->{entity__match} > 0) {
3382     !!!cp (1023);
3383     $data = $self->{entity__value};
3384     $has_ref = 1;
3385     #
3386     } elsif ($self->{entity__match} < 0) {
3387     !!!parse-error (type => 'no refc');
3388     if ($self->{prev_state} != DATA_STATE and # in attribute
3389     $self->{entity__match} < -1) {
3390     !!!cp (1024);
3391 wakaba 1.12 $data = '&' . $self->{kwd};
3392 wakaba 1.1 #
3393     } else {
3394     !!!cp (1025);
3395     $data = $self->{entity__value};
3396     $has_ref = 1;
3397     #
3398     }
3399     } else {
3400     !!!cp (1026);
3401     !!!parse-error (type => 'bare ero',
3402     line => $self->{line_prev},
3403 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3404     $data = '&' . $self->{kwd};
3405 wakaba 1.1 #
3406     }
3407    
3408     ## NOTE: In these cases, when a character reference is found,
3409     ## it is consumed and a character token is returned, or, otherwise,
3410     ## nothing is consumed and returned, according to the spec algorithm.
3411     ## In this implementation, anything that has been examined by the
3412     ## tokenizer is appended to the parent element or the attribute value
3413     ## as string, either literal string when no character reference or
3414     ## entity-replaced string otherwise, in this stage, since any characters
3415     ## that would not be consumed are appended in the data state or in an
3416     ## appropriate attribute value state anyway.
3417    
3418     if ($self->{prev_state} == DATA_STATE) {
3419     !!!cp (986);
3420     $self->{state} = $self->{prev_state};
3421 wakaba 1.5 $self->{s_kwd} = '';
3422 wakaba 1.1 ## Reconsume.
3423     !!!emit ({type => CHARACTER_TOKEN,
3424     data => $data,
3425 wakaba 1.7 has_reference => $has_ref,
3426 wakaba 1.1 line => $self->{line_prev},
3427 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3428 wakaba 1.1 });
3429     redo A;
3430     } else {
3431     !!!cp (985);
3432     $self->{ca}->{value} .= $data;
3433     $self->{ca}->{has_reference} = 1 if $has_ref;
3434     $self->{state} = $self->{prev_state};
3435 wakaba 1.5 $self->{s_kwd} = '';
3436 wakaba 1.1 ## Reconsume.
3437     redo A;
3438     }
3439 wakaba 1.8
3440     ## XML-only states
3441    
3442     } elsif ($self->{state} == PI_STATE) {
3443 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3444    
3445 wakaba 1.8 if ($is_space->{$self->{nc}} or
3446 wakaba 1.14 $self->{nc} == 0x003F or # ?
3447 wakaba 1.8 $self->{nc} == -1) {
3448 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3449     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3450     ## "DOCTYPE pi state": Parse error, switch to the "data
3451     ## state".
3452 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3453     line => $self->{line_prev},
3454     column => $self->{column_prev}
3455     - 1 * ($self->{nc} != -1));
3456     $self->{state} = BOGUS_COMMENT_STATE;
3457     ## Reconsume.
3458     $self->{ct} = {type => COMMENT_TOKEN,
3459     data => '?',
3460     line => $self->{line_prev},
3461     column => $self->{column_prev}
3462     - 1 * ($self->{nc} != -1),
3463     };
3464     redo A;
3465     } else {
3466 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3467 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3468     target => chr $self->{nc},
3469     data => '',
3470     line => $self->{line_prev},
3471     column => $self->{column_prev} - 1,
3472     };
3473     $self->{state} = PI_TARGET_STATE;
3474     !!!next-input-character;
3475     redo A;
3476     }
3477     } elsif ($self->{state} == PI_TARGET_STATE) {
3478     if ($is_space->{$self->{nc}}) {
3479     $self->{state} = PI_TARGET_AFTER_STATE;
3480     !!!next-input-character;
3481     redo A;
3482     } elsif ($self->{nc} == -1) {
3483     !!!parse-error (type => 'no pic'); ## TODO: type
3484 wakaba 1.13 if ($self->{in_subset}) {
3485     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3486     } else {
3487     $self->{state} = DATA_STATE;
3488     $self->{s_kwd} = '';
3489     }
3490 wakaba 1.8 ## Reconsume.
3491     !!!emit ($self->{ct}); # pi
3492     redo A;
3493     } elsif ($self->{nc} == 0x003F) { # ?
3494     $self->{state} = PI_AFTER_STATE;
3495     !!!next-input-character;
3496     redo A;
3497     } else {
3498     ## XML5: typo ("tag name" -> "target")
3499     $self->{ct}->{target} .= chr $self->{nc}; # pi
3500     !!!next-input-character;
3501     redo A;
3502     }
3503     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3504     if ($is_space->{$self->{nc}}) {
3505     ## Stay in the state.
3506     !!!next-input-character;
3507     redo A;
3508     } else {
3509     $self->{state} = PI_DATA_STATE;
3510     ## Reprocess.
3511     redo A;
3512     }
3513     } elsif ($self->{state} == PI_DATA_STATE) {
3514     if ($self->{nc} == 0x003F) { # ?
3515     $self->{state} = PI_DATA_AFTER_STATE;
3516     !!!next-input-character;
3517     redo A;
3518     } elsif ($self->{nc} == -1) {
3519     !!!parse-error (type => 'no pic'); ## TODO: type
3520 wakaba 1.13 if ($self->{in_subset}) {
3521 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3522 wakaba 1.13 } else {
3523     $self->{state} = DATA_STATE;
3524     $self->{s_kwd} = '';
3525     }
3526 wakaba 1.8 ## Reprocess.
3527     !!!emit ($self->{ct}); # pi
3528     redo A;
3529     } else {
3530     $self->{ct}->{data} .= chr $self->{nc}; # pi
3531     $self->{read_until}->($self->{ct}->{data}, q[?],
3532     length $self->{ct}->{data});
3533     ## Stay in the state.
3534     !!!next-input-character;
3535     ## Reprocess.
3536     redo A;
3537     }
3538     } elsif ($self->{state} == PI_AFTER_STATE) {
3539 wakaba 1.14 ## XML5: Part of "Pi after state".
3540    
3541 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3542 wakaba 1.13 if ($self->{in_subset}) {
3543     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3544     } else {
3545     $self->{state} = DATA_STATE;
3546     $self->{s_kwd} = '';
3547     }
3548 wakaba 1.8 !!!next-input-character;
3549     !!!emit ($self->{ct}); # pi
3550     redo A;
3551     } elsif ($self->{nc} == 0x003F) { # ?
3552     !!!parse-error (type => 'no s after target', ## TODO: type
3553     line => $self->{line_prev},
3554     column => $self->{column_prev}); ## XML5: no error
3555     $self->{ct}->{data} .= '?';
3556     $self->{state} = PI_DATA_AFTER_STATE;
3557     !!!next-input-character;
3558     redo A;
3559     } else {
3560     !!!parse-error (type => 'no s after target', ## TODO: type
3561     line => $self->{line_prev},
3562     column => $self->{column_prev}
3563     + 1 * ($self->{nc} == -1)); ## XML5: no error
3564     $self->{ct}->{data} .= '?'; ## XML5: not appended
3565     $self->{state} = PI_DATA_STATE;
3566     ## Reprocess.
3567     redo A;
3568     }
3569     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3570 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3571    
3572 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3573 wakaba 1.13 if ($self->{in_subset}) {
3574     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3575     } else {
3576     $self->{state} = DATA_STATE;
3577     $self->{s_kwd} = '';
3578     }
3579 wakaba 1.8 !!!next-input-character;
3580     !!!emit ($self->{ct}); # pi
3581     redo A;
3582     } elsif ($self->{nc} == 0x003F) { # ?
3583     $self->{ct}->{data} .= '?';
3584     ## Stay in the state.
3585     !!!next-input-character;
3586     redo A;
3587     } else {
3588     $self->{ct}->{data} .= '?'; ## XML5: not appended
3589     $self->{state} = PI_DATA_STATE;
3590     ## Reprocess.
3591     redo A;
3592     }
3593 wakaba 1.12
3594     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3595     if ($self->{nc} == 0x003C) { # <
3596 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3597 wakaba 1.12 !!!next-input-character;
3598     redo A;
3599     } elsif ($self->{nc} == 0x0025) { # %
3600     ## XML5: Not defined yet.
3601    
3602     ## TODO:
3603     !!!next-input-character;
3604     redo A;
3605     } elsif ($self->{nc} == 0x005D) { # ]
3606 wakaba 1.13 delete $self->{in_subset};
3607 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3608     !!!next-input-character;
3609     redo A;
3610     } elsif ($is_space->{$self->{nc}}) {
3611     ## Stay in the state.
3612     !!!next-input-character;
3613     redo A;
3614     } elsif ($self->{nc} == -1) {
3615     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3616 wakaba 1.13 delete $self->{in_subset};
3617 wakaba 1.12 $self->{state} = DATA_STATE;
3618     $self->{s_kwd} = '';
3619     ## Reconsume.
3620 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3621 wakaba 1.12 redo A;
3622     } else {
3623     unless ($self->{internal_subset_tainted}) {
3624     ## XML5: No parse error.
3625     !!!parse-error (type => 'string in internal subset');
3626     $self->{internal_subset_tainted} = 1;
3627     }
3628     ## Stay in the state.
3629     !!!next-input-character;
3630     redo A;
3631     }
3632     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3633     if ($self->{nc} == 0x003E) { # >
3634     $self->{state} = DATA_STATE;
3635     $self->{s_kwd} = '';
3636     !!!next-input-character;
3637 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3638 wakaba 1.12 redo A;
3639     } elsif ($self->{nc} == -1) {
3640     !!!parse-error (type => 'unclosed DOCTYPE');
3641     $self->{state} = DATA_STATE;
3642     $self->{s_kwd} = '';
3643     ## Reconsume.
3644 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3645 wakaba 1.12 redo A;
3646     } else {
3647     ## XML5: No parse error and stay in the state.
3648     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3649    
3650 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3651     !!!next-input-character;
3652     redo A;
3653     }
3654     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3655     if ($self->{nc} == 0x003E) { # >
3656     $self->{state} = DATA_STATE;
3657     $self->{s_kwd} = '';
3658     !!!next-input-character;
3659     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3660     redo A;
3661     } elsif ($self->{nc} == -1) {
3662     $self->{state} = DATA_STATE;
3663     $self->{s_kwd} = '';
3664     ## Reconsume.
3665     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3666     redo A;
3667     } else {
3668     ## Stay in the state.
3669     !!!next-input-character;
3670     redo A;
3671     }
3672     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3673     if ($self->{nc} == 0x0021) { # !
3674 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3675 wakaba 1.13 !!!next-input-character;
3676     redo A;
3677     } elsif ($self->{nc} == 0x003F) { # ?
3678     $self->{state} = PI_STATE;
3679     !!!next-input-character;
3680     redo A;
3681     } elsif ($self->{nc} == -1) {
3682     !!!parse-error (type => 'bare stago');
3683     $self->{state} = DATA_STATE;
3684     $self->{s_kwd} = '';
3685     ## Reconsume.
3686     redo A;
3687     } else {
3688     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3689     line => $self->{line_prev},
3690     column => $self->{column_prev});
3691     $self->{state} = BOGUS_COMMENT_STATE;
3692     $self->{ct} = {type => COMMENT_TOKEN,
3693     data => '',
3694     }; ## NOTE: Will be discarded.
3695 wakaba 1.12 !!!next-input-character;
3696     redo A;
3697     }
3698 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3699     ## XML5: "DOCTYPE markup declaration state".
3700    
3701     if ($self->{nc} == 0x002D) { # -
3702     $self->{state} = MD_HYPHEN_STATE;
3703     !!!next-input-character;
3704     redo A;
3705 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3706     $self->{nc} == 0x0065) { # e
3707 wakaba 1.14 $self->{state} = MD_E_STATE;
3708     $self->{kwd} = chr $self->{nc};
3709     !!!next-input-character;
3710     redo A;
3711 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3712     $self->{nc} == 0x0061) { # a
3713 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3714     $self->{kwd} = chr $self->{nc};
3715     !!!next-input-character;
3716     redo A;
3717 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3718     $self->{nc} == 0x006E) { # n
3719 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3720     $self->{kwd} = chr $self->{nc};
3721     !!!next-input-character;
3722     redo A;
3723     } else {
3724     #
3725     }
3726    
3727     ## XML5: No parse error.
3728     !!!parse-error (type => 'bogus comment',
3729     line => $self->{line_prev},
3730     column => $self->{column_prev} - 1);
3731     ## Reconsume.
3732     $self->{state} = BOGUS_COMMENT_STATE;
3733     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3734     redo A;
3735     } elsif ($self->{state} == MD_E_STATE) {
3736 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3737     $self->{nc} == 0x006E) { # n
3738 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3739     $self->{kwd} .= chr $self->{nc};
3740     !!!next-input-character;
3741     redo A;
3742 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3743     $self->{nc} == 0x006C) { # l
3744 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3745     $self->{state} = MD_ELEMENT_STATE;
3746     $self->{kwd} .= chr $self->{nc};
3747     !!!next-input-character;
3748     redo A;
3749     } else {
3750     ## XML5: No parse error.
3751     !!!parse-error (type => 'bogus comment',
3752     line => $self->{line_prev},
3753     column => $self->{column_prev} - 2
3754     + 1 * ($self->{nc} == -1));
3755     ## Reconsume.
3756     $self->{state} = BOGUS_COMMENT_STATE;
3757     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3758     redo A;
3759     }
3760     } elsif ($self->{state} == MD_ENTITY_STATE) {
3761 wakaba 1.17 if ($self->{nc} == [
3762     undef,
3763     undef,
3764     0x0054, # T
3765     0x0049, # I
3766     0x0054, # T
3767     ]->[length $self->{kwd}] or
3768     $self->{nc} == [
3769     undef,
3770     undef,
3771     0x0074, # t
3772     0x0069, # i
3773     0x0074, # t
3774     ]->[length $self->{kwd}]) {
3775 wakaba 1.14 ## Stay in the state.
3776     $self->{kwd} .= chr $self->{nc};
3777     !!!next-input-character;
3778     redo A;
3779 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3780     ($self->{nc} == 0x0059 or # Y
3781     $self->{nc} == 0x0079)) { # y
3782     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3783     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3784     text => 'ENTITY',
3785     line => $self->{line_prev},
3786     column => $self->{column_prev} - 4);
3787     }
3788     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3789 wakaba 1.14 line => $self->{line_prev},
3790     column => $self->{column_prev} - 6};
3791     $self->{state} = DOCTYPE_MD_STATE;
3792     !!!next-input-character;
3793     redo A;
3794     } else {
3795     !!!parse-error (type => 'bogus comment',
3796     line => $self->{line_prev},
3797     column => $self->{column_prev} - 1
3798     - (length $self->{kwd})
3799     + 1 * ($self->{nc} == -1));
3800     $self->{state} = BOGUS_COMMENT_STATE;
3801     ## Reconsume.
3802     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3803     redo A;
3804     }
3805     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3806 wakaba 1.17 if ($self->{nc} == [
3807     undef,
3808     undef,
3809     0x0045, # E
3810     0x004D, # M
3811     0x0045, # E
3812     0x004E, # N
3813     ]->[length $self->{kwd}] or
3814     $self->{nc} == [
3815     undef,
3816     undef,
3817     0x0065, # e
3818     0x006D, # m
3819     0x0065, # e
3820     0x006E, # n
3821     ]->[length $self->{kwd}]) {
3822 wakaba 1.14 ## Stay in the state.
3823     $self->{kwd} .= chr $self->{nc};
3824     !!!next-input-character;
3825     redo A;
3826 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3827     ($self->{nc} == 0x0054 or # T
3828     $self->{nc} == 0x0074)) { # t
3829     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3830     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3831     text => 'ELEMENT',
3832     line => $self->{line_prev},
3833     column => $self->{column_prev} - 5);
3834     }
3835 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3836     line => $self->{line_prev},
3837     column => $self->{column_prev} - 6};
3838     $self->{state} = DOCTYPE_MD_STATE;
3839     !!!next-input-character;
3840     redo A;
3841     } else {
3842     !!!parse-error (type => 'bogus comment',
3843     line => $self->{line_prev},
3844     column => $self->{column_prev} - 1
3845     - (length $self->{kwd})
3846     + 1 * ($self->{nc} == -1));
3847     $self->{state} = BOGUS_COMMENT_STATE;
3848     ## Reconsume.
3849     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3850     redo A;
3851     }
3852     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3853 wakaba 1.17 if ($self->{nc} == [
3854     undef,
3855     0x0054, # T
3856     0x0054, # T
3857     0x004C, # L
3858     0x0049, # I
3859     0x0053, # S
3860     ]->[length $self->{kwd}] or
3861     $self->{nc} == [
3862     undef,
3863     0x0074, # t
3864     0x0074, # t
3865     0x006C, # l
3866     0x0069, # i
3867     0x0073, # s
3868     ]->[length $self->{kwd}]) {
3869 wakaba 1.14 ## Stay in the state.
3870     $self->{kwd} .= chr $self->{nc};
3871     !!!next-input-character;
3872     redo A;
3873 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3874     ($self->{nc} == 0x0054 or # T
3875     $self->{nc} == 0x0074)) { # t
3876     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3877     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3878     text => 'ATTLIST',
3879     line => $self->{line_prev},
3880     column => $self->{column_prev} - 5);
3881     }
3882 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3883 wakaba 1.15 attrdefs => [],
3884 wakaba 1.14 line => $self->{line_prev},
3885     column => $self->{column_prev} - 6};
3886     $self->{state} = DOCTYPE_MD_STATE;
3887     !!!next-input-character;
3888     redo A;
3889     } else {
3890     !!!parse-error (type => 'bogus comment',
3891     line => $self->{line_prev},
3892     column => $self->{column_prev} - 1
3893     - (length $self->{kwd})
3894     + 1 * ($self->{nc} == -1));
3895     $self->{state} = BOGUS_COMMENT_STATE;
3896     ## Reconsume.
3897     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3898     redo A;
3899     }
3900     } elsif ($self->{state} == MD_NOTATION_STATE) {
3901 wakaba 1.17 if ($self->{nc} == [
3902     undef,
3903     0x004F, # O
3904     0x0054, # T
3905     0x0041, # A
3906     0x0054, # T
3907     0x0049, # I
3908     0x004F, # O
3909     ]->[length $self->{kwd}] or
3910     $self->{nc} == [
3911     undef,
3912     0x006F, # o
3913     0x0074, # t
3914     0x0061, # a
3915     0x0074, # t
3916     0x0069, # i
3917     0x006F, # o
3918     ]->[length $self->{kwd}]) {
3919 wakaba 1.14 ## Stay in the state.
3920     $self->{kwd} .= chr $self->{nc};
3921     !!!next-input-character;
3922     redo A;
3923 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
3924     ($self->{nc} == 0x004E or # N
3925     $self->{nc} == 0x006E)) { # n
3926     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3927     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3928     text => 'NOTATION',
3929     line => $self->{line_prev},
3930     column => $self->{column_prev} - 6);
3931     }
3932 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3933     line => $self->{line_prev},
3934     column => $self->{column_prev} - 6};
3935     $self->{state} = DOCTYPE_MD_STATE;
3936     !!!next-input-character;
3937     redo A;
3938     } else {
3939     !!!parse-error (type => 'bogus comment',
3940     line => $self->{line_prev},
3941     column => $self->{column_prev} - 1
3942     - (length $self->{kwd})
3943     + 1 * ($self->{nc} == -1));
3944     $self->{state} = BOGUS_COMMENT_STATE;
3945     ## Reconsume.
3946     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3947     redo A;
3948     }
3949     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3950     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3951     ## "DOCTYPE NOTATION state".
3952    
3953     if ($is_space->{$self->{nc}}) {
3954     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3955     $self->{state} = BEFORE_MD_NAME_STATE;
3956     !!!next-input-character;
3957     redo A;
3958     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3959     $self->{nc} == 0x0025) { # %
3960     ## XML5: Switch to the "DOCTYPE bogus comment state".
3961     !!!parse-error (type => 'no space before md name'); ## TODO: type
3962     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3963     !!!next-input-character;
3964     redo A;
3965     } elsif ($self->{nc} == -1) {
3966     !!!parse-error (type => 'unclosed md'); ## TODO: type
3967     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3968     ## Reconsume.
3969     redo A;
3970     } elsif ($self->{nc} == 0x003E) { # >
3971     ## XML5: Switch to the "DOCTYPE bogus comment state".
3972     !!!parse-error (type => 'no md name'); ## TODO: type
3973     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974     !!!next-input-character;
3975     redo A;
3976     } else {
3977     ## XML5: Switch to the "DOCTYPE bogus comment state".
3978     !!!parse-error (type => 'no space before md name'); ## TODO: type
3979     $self->{state} = BEFORE_MD_NAME_STATE;
3980     redo A;
3981     }
3982     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3983     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3984     ## before state", "DOCTYPE ATTLIST name before state".
3985    
3986     if ($is_space->{$self->{nc}}) {
3987     ## Stay in the state.
3988     !!!next-input-character;
3989     redo A;
3990     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3991     $self->{nc} == 0x0025) { # %
3992     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3993     !!!next-input-character;
3994     redo A;
3995     } elsif ($self->{nc} == 0x003E) { # >
3996     ## XML5: Same as "Anything else".
3997     !!!parse-error (type => 'no md name'); ## TODO: type
3998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3999     !!!next-input-character;
4000     redo A;
4001     } elsif ($self->{nc} == -1) {
4002     !!!parse-error (type => 'unclosed md'); ## TODO: type
4003     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4004     ## Reconsume.
4005     redo A;
4006     } else {
4007     ## XML5: [ATTLIST] Not defined yet.
4008     $self->{ct}->{name} .= chr $self->{nc};
4009     $self->{state} = MD_NAME_STATE;
4010     !!!next-input-character;
4011     redo A;
4012     }
4013     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4014     if ($is_space->{$self->{nc}}) {
4015     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4016     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4017     $self->{state} = BEFORE_MD_NAME_STATE;
4018     !!!next-input-character;
4019     redo A;
4020     } elsif ($self->{nc} == 0x003E) { # >
4021     ## XML5: Same as "Anything else".
4022     !!!parse-error (type => 'no md name'); ## TODO: type
4023     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4024     !!!next-input-character;
4025     redo A;
4026     } elsif ($self->{nc} == -1) {
4027     !!!parse-error (type => 'unclosed md');
4028     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4029     ## Reconsume.
4030     redo A;
4031     } else {
4032     ## XML5: No parse error.
4033     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4034     $self->{state} = BOGUS_COMMENT_STATE;
4035     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4036     ## Reconsume.
4037     redo A;
4038     }
4039     } elsif ($self->{state} == MD_NAME_STATE) {
4040     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4041    
4042     if ($is_space->{$self->{nc}}) {
4043 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4044     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4045     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4046     ## TODO: ...
4047     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4048     } else { # ENTITY/NOTATION
4049     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4050     }
4051 wakaba 1.14 !!!next-input-character;
4052     redo A;
4053     } elsif ($self->{nc} == 0x003E) { # >
4054     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4055     #
4056     } else {
4057 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4058 wakaba 1.14 }
4059     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4060     !!!next-input-character;
4061     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4062     redo A;
4063     } elsif ($self->{nc} == -1) {
4064     ## XML5: [ATTLIST] No parse error.
4065     !!!parse-error (type => 'unclosed md');
4066     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4067     ## Reconsume.
4068     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4069     redo A;
4070     } else {
4071     ## XML5: [ATTLIST] Not defined yet.
4072     $self->{ct}->{name} .= chr $self->{nc};
4073     ## Stay in the state.
4074     !!!next-input-character;
4075     redo A;
4076     }
4077     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4078     if ($is_space->{$self->{nc}}) {
4079     ## Stay in the state.
4080     !!!next-input-character;
4081     redo A;
4082     } elsif ($self->{nc} == 0x003E) { # >
4083     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4084     !!!next-input-character;
4085     !!!emit ($self->{ct}); # ATTLIST
4086     redo A;
4087     } elsif ($self->{nc} == -1) {
4088     ## XML5: No parse error.
4089     !!!parse-error (type => 'unclosed md'); ## TODO: type
4090     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4091 wakaba 1.15 !!!emit ($self->{ct});
4092     redo A;
4093     } else {
4094     ## XML5: Not defined yet.
4095     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4096     tokens => [],
4097     line => $self->{line}, column => $self->{column}};
4098     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4099     !!!next-input-character;
4100     redo A;
4101     }
4102     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4103     if ($is_space->{$self->{nc}}) {
4104     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4105     !!!next-input-character;
4106     redo A;
4107     } elsif ($self->{nc} == 0x003E) { # >
4108     ## XML5: Same as "anything else".
4109     !!!parse-error (type => 'no attr type'); ## TODO: type
4110     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4111     !!!next-input-character;
4112     !!!emit ($self->{ct}); # ATTLIST
4113     redo A;
4114     } elsif ($self->{nc} == 0x0028) { # (
4115     ## XML5: Same as "anything else".
4116     !!!parse-error (type => 'no space before paren'); ## TODO: type
4117     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4118     !!!next-input-character;
4119     redo A;
4120     } elsif ($self->{nc} == -1) {
4121     ## XML5: No parse error.
4122     !!!parse-error (type => 'unclosed md'); ## TODO: type
4123     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4124     !!!next-input-character;
4125     !!!emit ($self->{ct}); # ATTLIST
4126     redo A;
4127     } else {
4128     ## XML5: Not defined yet.
4129     $self->{ca}->{name} .= chr $self->{nc};
4130     ## Stay in the state.
4131     !!!next-input-character;
4132     redo A;
4133     }
4134     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4135     if ($is_space->{$self->{nc}}) {
4136     ## Stay in the state.
4137     !!!next-input-character;
4138     redo A;
4139     } elsif ($self->{nc} == 0x003E) { # >
4140     ## XML5: Same as "anything else".
4141     !!!parse-error (type => 'no attr type'); ## TODO: type
4142     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4143     !!!next-input-character;
4144     !!!emit ($self->{ct}); # ATTLIST
4145     redo A;
4146     } elsif ($self->{nc} == 0x0028) { # (
4147     ## XML5: Same as "anything else".
4148     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4149     !!!next-input-character;
4150     redo A;
4151     } elsif ($self->{nc} == -1) {
4152     ## XML5: No parse error.
4153     !!!parse-error (type => 'unclosed md'); ## TODO: type
4154     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4155     !!!next-input-character;
4156     !!!emit ($self->{ct});
4157 wakaba 1.14 redo A;
4158     } else {
4159     ## XML5: Not defined yet.
4160 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4161     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4162     !!!next-input-character;
4163     redo A;
4164     }
4165     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4166     if ($is_space->{$self->{nc}}) {
4167     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4168     !!!next-input-character;
4169     redo A;
4170     } elsif ($self->{nc} == 0x0023) { # #
4171     ## XML5: Same as "anything else".
4172     !!!parse-error (type => 'no space before default value'); ## TODO: type
4173     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4174     !!!next-input-character;
4175     redo A;
4176     } elsif ($self->{nc} == 0x0022) { # "
4177     ## XML5: Same as "anything else".
4178     !!!parse-error (type => 'no space before default value'); ## TODO: type
4179     $self->{ca}->{value} = '';
4180     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4181     !!!next-input-character;
4182     redo A;
4183     } elsif ($self->{nc} == 0x0027) { # '
4184     ## XML5: Same as "anything else".
4185     !!!parse-error (type => 'no space before default value'); ## TODO: type
4186     $self->{ca}->{value} = '';
4187     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4188     !!!next-input-character;
4189     redo A;
4190     } elsif ($self->{nc} == 0x003E) { # >
4191     ## XML5: Same as "anything else".
4192     !!!parse-error (type => 'no attr default'); ## TODO: type
4193     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4194     !!!next-input-character;
4195     !!!emit ($self->{ct}); # ATTLIST
4196     redo A;
4197     } elsif ($self->{nc} == 0x0028) { # (
4198     ## XML5: Same as "anything else".
4199     !!!parse-error (type => 'no space before paren'); ## TODO: type
4200     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4201     !!!next-input-character;
4202     redo A;
4203     } elsif ($self->{nc} == -1) {
4204     ## XML5: No parse error.
4205     !!!parse-error (type => 'unclosed md'); ## TODO: type
4206     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4207     !!!next-input-character;
4208     !!!emit ($self->{ct});
4209     redo A;
4210     } else {
4211     ## XML5: Not defined yet.
4212     $self->{ca}->{type} .= chr $self->{nc};
4213     ## Stay in the state.
4214     !!!next-input-character;
4215     redo A;
4216     }
4217     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4218     if ($is_space->{$self->{nc}}) {
4219     ## Stay in the state.
4220     !!!next-input-character;
4221     redo A;
4222     } elsif ($self->{nc} == 0x0028) { # (
4223     ## XML5: Same as "anything else".
4224     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4225     !!!next-input-character;
4226     redo A;
4227     } elsif ($self->{nc} == 0x0023) { # #
4228     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4229     !!!next-input-character;
4230     redo A;
4231     } elsif ($self->{nc} == 0x0022) { # "
4232     ## XML5: Same as "anything else".
4233     $self->{ca}->{value} = '';
4234     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4235     !!!next-input-character;
4236     redo A;
4237     } elsif ($self->{nc} == 0x0027) { # '
4238     ## XML5: Same as "anything else".
4239     $self->{ca}->{value} = '';
4240     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4241     !!!next-input-character;
4242     redo A;
4243     } elsif ($self->{nc} == 0x003E) { # >
4244     ## XML5: Same as "anything else".
4245     !!!parse-error (type => 'no attr default'); ## TODO: type
4246     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247     !!!next-input-character;
4248     !!!emit ($self->{ct}); # ATTLIST
4249     redo A;
4250     } elsif ($self->{nc} == -1) {
4251     ## XML5: No parse error.
4252     !!!parse-error (type => 'unclosed md'); ## TODO: type
4253     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254     !!!next-input-character;
4255     !!!emit ($self->{ct});
4256     redo A;
4257     } else {
4258     ## XML5: Switch to the "DOCTYPE bogus comment state".
4259     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4260     $self->{ca}->{value} = '';
4261     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4262     ## Reconsume.
4263     redo A;
4264     }
4265     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4266     if ($is_space->{$self->{nc}}) {
4267     ## Stay in the state.
4268     !!!next-input-character;
4269     redo A;
4270     } elsif ($self->{nc} == 0x007C) { # |
4271     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4272     ## Stay in the state.
4273     !!!next-input-character;
4274     redo A;
4275     } elsif ($self->{nc} == 0x0029) { # )
4276     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4277     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4278     !!!next-input-character;
4279     redo A;
4280     } elsif ($self->{nc} == 0x003E) { # >
4281     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4282     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283     !!!next-input-character;
4284     !!!emit ($self->{ct}); # ATTLIST
4285     redo A;
4286     } elsif ($self->{nc} == -1) {
4287     ## XML5: No parse error.
4288     !!!parse-error (type => 'unclosed md'); ## TODO: type
4289     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290     !!!next-input-character;
4291     !!!emit ($self->{ct});
4292     redo A;
4293     } else {
4294     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4295     $self->{state} = ALLOWED_TOKEN_STATE;
4296     !!!next-input-character;
4297     redo A;
4298     }
4299     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4300     if ($is_space->{$self->{nc}}) {
4301     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4302     !!!next-input-character;
4303     redo A;
4304     } elsif ($self->{nc} == 0x007C) { # |
4305     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306     !!!next-input-character;
4307     redo A;
4308     } elsif ($self->{nc} == 0x0029) { # )
4309     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4310     !!!next-input-character;
4311     redo A;
4312     } elsif ($self->{nc} == 0x003E) { # >
4313     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4314     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4315     !!!next-input-character;
4316     !!!emit ($self->{ct}); # ATTLIST
4317     redo A;
4318     } elsif ($self->{nc} == -1) {
4319     ## XML5: No parse error.
4320     !!!parse-error (type => 'unclosed md'); ## TODO: type
4321     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4322     !!!next-input-character;
4323     !!!emit ($self->{ct});
4324     redo A;
4325     } else {
4326     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4327     ## Stay in the state.
4328     !!!next-input-character;
4329     redo A;
4330     }
4331     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4332     if ($is_space->{$self->{nc}}) {
4333     ## Stay in the state.
4334     !!!next-input-character;
4335     redo A;
4336     } elsif ($self->{nc} == 0x007C) { # |
4337     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4338     !!!next-input-character;
4339     redo A;
4340     } elsif ($self->{nc} == 0x0029) { # )
4341     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4342     !!!next-input-character;
4343     redo A;
4344     } elsif ($self->{nc} == 0x003E) { # >
4345     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4346     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4347     !!!next-input-character;
4348     !!!emit ($self->{ct}); # ATTLIST
4349     redo A;
4350     } elsif ($self->{nc} == -1) {
4351     ## XML5: No parse error.
4352     !!!parse-error (type => 'unclosed md'); ## TODO: type
4353     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4354     !!!next-input-character;
4355     !!!emit ($self->{ct});
4356     redo A;
4357     } else {
4358     !!!parse-error (type => 'space in allowed token', ## TODO: type
4359     line => $self->{line_prev},
4360     column => $self->{column_prev});
4361     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4362     $self->{state} = ALLOWED_TOKEN_STATE;
4363     !!!next-input-character;
4364     redo A;
4365     }
4366     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4367     if ($is_space->{$self->{nc}}) {
4368     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4369     !!!next-input-character;
4370     redo A;
4371     } elsif ($self->{nc} == 0x0023) { # #
4372     !!!parse-error (type => 'no space before default value'); ## TODO: type
4373     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4374     !!!next-input-character;
4375     redo A;
4376     } elsif ($self->{nc} == 0x0022) { # "
4377     !!!parse-error (type => 'no space before default value'); ## TODO: type
4378     $self->{ca}->{value} = '';
4379     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4380     !!!next-input-character;
4381     redo A;
4382     } elsif ($self->{nc} == 0x0027) { # '
4383     !!!parse-error (type => 'no space before default value'); ## TODO: type
4384     $self->{ca}->{value} = '';
4385     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4386     !!!next-input-character;
4387     redo A;
4388     } elsif ($self->{nc} == 0x003E) { # >
4389     !!!parse-error (type => 'no attr default'); ## TODO: type
4390     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391     !!!next-input-character;
4392     !!!emit ($self->{ct}); # ATTLIST
4393     redo A;
4394     } elsif ($self->{nc} == -1) {
4395     !!!parse-error (type => 'unclosed md'); ## TODO: type
4396     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4397     !!!next-input-character;
4398     !!!emit ($self->{ct});
4399     redo A;
4400     } else {
4401     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4402     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4403     ## Reconsume.
4404     redo A;
4405     }
4406     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4407     if ($is_space->{$self->{nc}}) {
4408     ## Stay in the state.
4409     !!!next-input-character;
4410     redo A;
4411     } elsif ($self->{nc} == 0x0023) { # #
4412     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4413     !!!next-input-character;
4414     redo A;
4415     } elsif ($self->{nc} == 0x0022) { # "
4416     $self->{ca}->{value} = '';
4417     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4418     !!!next-input-character;
4419     redo A;
4420     } elsif ($self->{nc} == 0x0027) { # '
4421     $self->{ca}->{value} = '';
4422     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4423     !!!next-input-character;
4424     redo A;
4425     } elsif ($self->{nc} == 0x003E) { # >
4426     !!!parse-error (type => 'no attr default'); ## TODO: type
4427     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428     !!!next-input-character;
4429     !!!emit ($self->{ct}); # ATTLIST
4430     redo A;
4431     } elsif ($self->{nc} == -1) {
4432     !!!parse-error (type => 'unclosed md'); ## TODO: type
4433     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434     !!!next-input-character;
4435     !!!emit ($self->{ct});
4436     redo A;
4437     } else {
4438     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4439     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4440     ## Reconsume.
4441     redo A;
4442     }
4443     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4444     if ($is_space->{$self->{nc}}) {
4445     ## XML5: No parse error.
4446     !!!parse-error (type => 'no default type'); ## TODO: type
4447 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4448 wakaba 1.14 ## Reconsume.
4449     redo A;
4450 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4451     ## XML5: Same as "anything else".
4452     $self->{ca}->{value} = '';
4453     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4454     !!!next-input-character;
4455     redo A;
4456     } elsif ($self->{nc} == 0x0027) { # '
4457     ## XML5: Same as "anything else".
4458     $self->{ca}->{value} = '';
4459     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4460     !!!next-input-character;
4461     redo A;
4462     } elsif ($self->{nc} == 0x003E) { # >
4463     ## XML5: Same as "anything else".
4464     !!!parse-error (type => 'no attr default'); ## TODO: type
4465     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4466     !!!next-input-character;
4467     !!!emit ($self->{ct}); # ATTLIST
4468     redo A;
4469     } elsif ($self->{nc} == -1) {
4470     ## XML5: No parse error.
4471     !!!parse-error (type => 'unclosed md'); ## TODO: type
4472     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4473     !!!next-input-character;
4474     !!!emit ($self->{ct});
4475     redo A;
4476     } else {
4477     $self->{ca}->{default} = chr $self->{nc};
4478     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4479     !!!next-input-character;
4480     redo A;
4481 wakaba 1.14 }
4482 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4483     if ($is_space->{$self->{nc}}) {
4484     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4485     !!!next-input-character;
4486     redo A;
4487     } elsif ($self->{nc} == 0x0022) { # "
4488     ## XML5: Same as "anything else".
4489     !!!parse-error (type => 'no space before default value'); ## TODO: type
4490     $self->{ca}->{value} = '';
4491     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4492     !!!next-input-character;
4493     redo A;
4494     } elsif ($self->{nc} == 0x0027) { # '
4495     ## XML5: Same as "anything else".
4496     !!!parse-error (type => 'no space before default value'); ## TODO: type
4497     $self->{ca}->{value} = '';
4498     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4499     !!!next-input-character;
4500     redo A;
4501     } elsif ($self->{nc} == 0x003E) { # >
4502     ## XML5: Same as "anything else".
4503     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4504     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505     !!!next-input-character;
4506     !!!emit ($self->{ct}); # ATTLIST
4507     redo A;
4508     } elsif ($self->{nc} == -1) {
4509     ## XML5: No parse error.
4510     !!!parse-error (type => 'unclosed md'); ## TODO: type
4511     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4512     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513     !!!next-input-character;
4514     !!!emit ($self->{ct});
4515     redo A;
4516     } else {
4517     $self->{ca}->{default} .= chr $self->{nc};
4518     ## Stay in the state.
4519     !!!next-input-character;
4520     redo A;
4521     }
4522     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4523     if ($is_space->{$self->{nc}}) {
4524     ## Stay in the state.
4525     !!!next-input-character;
4526     redo A;
4527     } elsif ($self->{nc} == 0x0022) { # "
4528     $self->{ca}->{value} = '';
4529     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4530     !!!next-input-character;
4531     redo A;
4532     } elsif ($self->{nc} == 0x0027) { # '
4533     $self->{ca}->{value} = '';
4534     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4535     !!!next-input-character;
4536     redo A;
4537     } elsif ($self->{nc} == 0x003E) { # >
4538     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540     !!!next-input-character;
4541     !!!emit ($self->{ct}); # ATTLIST
4542     redo A;
4543     } elsif ($self->{nc} == -1) {
4544     ## XML5: No parse error.
4545     !!!parse-error (type => 'unclosed md'); ## TODO: type
4546     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4547     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4548     !!!next-input-character;
4549     !!!emit ($self->{ct});
4550     redo A;
4551     } else {
4552     ## XML5: Not defined yet.
4553     if ($self->{ca}->{default} eq 'FIXED') {
4554     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4555     } else {
4556     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4557     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4558     }
4559     ## Reconsume.
4560     redo A;
4561     }
4562     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4563     if ($is_space->{$self->{nc}} or
4564     $self->{nc} == -1 or
4565     $self->{nc} == 0x003E) { # >
4566     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4567     ## Reconsume.
4568     redo A;
4569     } else {
4570     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4571     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4572     ## Reconsume.
4573     redo A;
4574 wakaba 1.16 }
4575 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4576     ## ASCII case-insensitive
4577     if ($self->{nc} == [
4578     undef,
4579     0x0044, # D
4580     0x0041, # A
4581     0x0054, # T
4582     ]->[length $self->{kwd}] or
4583     $self->{nc} == [
4584     undef,
4585     0x0064, # d
4586     0x0061, # a
4587     0x0074, # t
4588     ]->[length $self->{kwd}]) {
4589     !!!cp (172.2);
4590     ## Stay in the state.
4591     $self->{kwd} .= chr $self->{nc};
4592     !!!next-input-character;
4593     redo A;
4594     } elsif ((length $self->{kwd}) == 4 and
4595     ($self->{nc} == 0x0041 or # A
4596     $self->{nc} == 0x0061)) { # a
4597     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598     !!!cp (172.3);
4599     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600     text => 'NDATA',
4601     line => $self->{line_prev},
4602     column => $self->{column_prev} - 4);
4603     } else {
4604     !!!cp (172.4);
4605     }
4606     $self->{state} = AFTER_NDATA_STATE;
4607     !!!next-input-character;
4608     redo A;
4609     } else {
4610     !!!parse-error (type => 'string after literal', ## TODO: type
4611     line => $self->{line_prev},
4612     column => $self->{column_prev} + 1
4613     - length $self->{kwd});
4614     !!!cp (172.5);
4615     $self->{state} = BOGUS_MD_STATE;
4616     ## Reconsume.
4617     redo A;
4618     }
4619     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620     if ($is_space->{$self->{nc}}) {
4621     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622     !!!next-input-character;
4623     redo A;
4624     } elsif ($self->{nc} == 0x003E) { # >
4625     !!!parse-error (type => 'no notation name'); ## TODO: type
4626     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627     !!!next-input-character;
4628     !!!emit ($self->{ct}); # ENTITY
4629     redo A;
4630     } elsif ($self->{nc} == -1) {
4631     !!!parse-error (type => 'unclosed md'); ## TODO: type
4632     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633     !!!next-input-character;
4634     !!!emit ($self->{ct}); # ENTITY
4635     redo A;
4636     } else {
4637     !!!parse-error (type => 'string after literal', ## TODO: type
4638     line => $self->{line_prev},
4639     column => $self->{column_prev} + 1
4640     - length $self->{kwd});
4641     $self->{state} = BOGUS_MD_STATE;
4642     ## Reconsume.
4643     redo A;
4644     }
4645     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646     if ($is_space->{$self->{nc}}) {
4647     ## Stay in the state.
4648     !!!next-input-character;
4649     redo A;
4650     } elsif ($self->{nc} == 0x003E) { # >
4651     !!!parse-error (type => 'no notation name'); ## TODO: type
4652     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653     !!!next-input-character;
4654     !!!emit ($self->{ct}); # ENTITY
4655     redo A;
4656     } elsif ($self->{nc} == -1) {
4657     !!!parse-error (type => 'unclosed md'); ## TODO: type
4658     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659     !!!next-input-character;
4660     !!!emit ($self->{ct}); # ENTITY
4661     redo A;
4662     } else {
4663     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664     $self->{state} = NOTATION_NAME_STATE;
4665     !!!next-input-character;
4666     redo A;
4667     }
4668     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669     if ($is_space->{$self->{nc}}) {
4670     $self->{state} = AFTER_NOTATION_NAME_STATE;
4671     !!!next-input-character;
4672     redo A;
4673     } elsif ($self->{nc} == 0x003E) { # >
4674     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675     !!!next-input-character;
4676     !!!emit ($self->{ct}); # ENTITY
4677     redo A;
4678     } elsif ($self->{nc} == -1) {
4679     !!!parse-error (type => 'unclosed md'); ## TODO: type
4680     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681     !!!next-input-character;
4682     !!!emit ($self->{ct}); # ENTITY
4683     redo A;
4684     } else {
4685     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686     ## Stay in the state.
4687     !!!next-input-character;
4688     redo A;
4689     }
4690 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691     if ($self->{nc} == 0x0022) { # "
4692     $self->{state} = AFTER_NOTATION_NAME_STATE;
4693     !!!next-input-character;
4694     redo A;
4695     } elsif ($self->{nc} == 0x0026) { # &
4696     $self->{prev_state} = $self->{state};
4697     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698     $self->{entity_add} = 0x0022; # "
4699     !!!next-input-character;
4700     redo A;
4701     ## TODO: %
4702     } elsif ($self->{nc} == -1) {
4703     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705     ## Reconsume.
4706     !!!emit ($self->{ct}); # ENTITY
4707     redo A;
4708     } else {
4709     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710     !!!next-input-character;
4711     redo A;
4712     }
4713     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714     if ($self->{nc} == 0x0027) { # '
4715     $self->{state} = AFTER_NOTATION_NAME_STATE;
4716     !!!next-input-character;
4717     redo A;
4718     } elsif ($self->{nc} == 0x0026) { # &
4719     $self->{prev_state} = $self->{state};
4720     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721     $self->{entity_add} = 0x0027; # '
4722     !!!next-input-character;
4723     redo A;
4724     ## TODO: %
4725     } elsif ($self->{nc} == -1) {
4726     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728     ## Reconsume.
4729     !!!emit ($self->{ct}); # ENTITY
4730     redo A;
4731     } else {
4732     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733     !!!next-input-character;
4734     redo A;
4735     }
4736     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737     ## TODO: XMLize
4738    
4739     if ($is_space->{$self->{nc}} or
4740     {
4741     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742     $self->{entity_add} => 1,
4743     }->{$self->{nc}}) {
4744     ## Don't consume
4745     ## No error
4746     ## Return nothing.
4747     #
4748     } elsif ($self->{nc} == 0x0023) { # #
4749     $self->{ca} = $self->{ct};
4750     $self->{state} = ENTITY_HASH_STATE;
4751     $self->{kwd} = '#';
4752     !!!next-input-character;
4753     redo A;
4754     } elsif ((0x0041 <= $self->{nc} and
4755     $self->{nc} <= 0x005A) or # A..Z
4756     (0x0061 <= $self->{nc} and
4757     $self->{nc} <= 0x007A)) { # a..z
4758     #
4759     } else {
4760     !!!parse-error (type => 'bare ero');
4761     ## Return nothing.
4762     #
4763     }
4764    
4765     $self->{ct}->{value} .= '&';
4766     $self->{state} = $self->{prev_state};
4767     ## Reconsume.
4768     redo A;
4769 wakaba 1.18 } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770     if ($is_space->{$self->{nc}}) {
4771     ## Stay in the state.
4772     !!!next-input-character;
4773     redo A;
4774     } elsif ($self->{nc} == 0x003E) { # >
4775     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776     !!!next-input-character;
4777     !!!emit ($self->{ct}); # ENTITY
4778     redo A;
4779     } elsif ($self->{nc} == -1) {
4780     !!!parse-error (type => 'unclosed md'); ## TODO: type
4781     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782     !!!next-input-character;
4783     !!!emit ($self->{ct}); # ENTITY
4784     redo A;
4785     } else {
4786     !!!parse-error (type => 'string after notation name'); ## TODO: type
4787     $self->{state} = BOGUS_MD_STATE;
4788     ## Reconsume.
4789     redo A;
4790     }
4791 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
4792     if ($self->{nc} == 0x003E) { # >
4793     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794     !!!next-input-character;
4795     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4796     redo A;
4797     } elsif ($self->{nc} == -1) {
4798     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4799     ## Reconsume.
4800     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4801     redo A;
4802     } else {
4803     ## Stay in the state.
4804     !!!next-input-character;
4805     redo A;
4806     }
4807 wakaba 1.1 } else {
4808     die "$0: $self->{state}: Unknown state";
4809     }
4810     } # A
4811    
4812     die "$0: _get_next_token: unexpected case";
4813     } # _get_next_token
4814    
4815     1;
4816 wakaba 1.19 ## $Date: 2008/10/19 06:14:57 $
4817 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24