/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (hide annotations) (download) (as text)
Sun Oct 19 06:14:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.17: +196 -6 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	19 Oct 2008 06:14:42 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/entities-1.dat" and "xml/entities-2.dat"
	added.  Support for the "#entities" directive.

++ whatpm/t/xml/ChangeLog	19 Oct 2008 06:11:59 -0000
	* entities-1.dat, entities-2.dat: New test data files.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 06:12:27 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (notation_name): New attribute.

	* NanoDOM.pm (public_id, system_id): New attributes.
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 06:13:03 -0000
	* Dumper.pm: Dump text content of Entity nodes.

	* Tokenizer.pm.src: Support for <!ENTITY ... NDATA>.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 06:14:05 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): General and parameter entities
	implemented.

## Whatpm::HTML::Tokenizer: declares the token type constants and makes
## them importable, either by name or all at once via the |:token| tag.
1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
## $VERSION is built from the digit groups of the CVS |Revision| keyword:
## the first group is printed as-is and each following group as two
## digits (e.g. "1.17").
3 wakaba 1.18 our $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be imported individually.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag lists the same names as |@EXPORT_OK|; keep the two
## lists in sync when adding a token type.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
##
## Values stored in the |type| member of a token.  Each constant is
## declared with an empty prototype and returns a distinct integer
## (1..14).  Every name defined here also appears in |@EXPORT_OK| and in
## the |:token| export tag of this package.
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
## From here on the code is compiled into the |Whatpm::HTML| package;
## the token type constants are pulled in through the |:token| tag.
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
##
## The three |CM_*| constants are single bits; the |*_CONTENT_MODEL|
## constants are the combinations tested with |&| against
## |$self->{content_model}|.
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag bits set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
87    
88     ## Tokenizer states
##
## Values assigned to |$self->{state}| and compared against it in
## |_get_next_token|.  The numbers 1 and 12 are not in use; the constants
## that once held them are kept below as commented-out lines.  A trailing
## comment of the form '"... state" in the spec' names the spec state
## that the constant (apparently together with others) implements.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185     sub AFTER_NOTATION_NAME_STATE () { 90 }
186     sub BOGUS_MD_STATE () { 91 }
187 wakaba 1.8
188 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
189     ## list and descriptions)
##
## NOTE: Both literals below denote the same value (a 1 followed by
## eleven 0 bits, i.e. 2048); the underscore in the second literal is
## only a digit separator.
190    
191     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
192     sub FOREIGN_EL () { 0b1_00000000000 }
193    
194     ## Character reference mappings
##
## |$charref_map| maps an integer code point to its replacement code
## point.  It is presumably consulted when a numeric character reference
## is resolved; the lookup itself is outside this excerpt.
##
## - 0x0D is replaced by 0x0A (LINE FEED).
## - The 0x80..0x9F entries follow the Windows-1252 layout; positions
##   with no assignment there (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to
##   U+FFFD.
195    
196     my $charref_map = {
197     0x0D => 0x000A,
198     0x80 => 0x20AC,
199     0x81 => 0xFFFD,
200     0x82 => 0x201A,
201     0x83 => 0x0192,
202     0x84 => 0x201E,
203     0x85 => 0x2026,
204     0x86 => 0x2020,
205     0x87 => 0x2021,
206     0x88 => 0x02C6,
207     0x89 => 0x2030,
208     0x8A => 0x0160,
209     0x8B => 0x2039,
210     0x8C => 0x0152,
211     0x8D => 0xFFFD,
212     0x8E => 0x017D,
213     0x8F => 0xFFFD,
214     0x90 => 0xFFFD,
215     0x91 => 0x2018,
216     0x92 => 0x2019,
217     0x93 => 0x201C,
218     0x94 => 0x201D,
219     0x95 => 0x2022,
220     0x96 => 0x2013,
221     0x97 => 0x2014,
222     0x98 => 0x02DC,
223     0x99 => 0x2122,
224     0x9A => 0x0161,
225     0x9B => 0x203A,
226     0x9C => 0x0153,
227     0x9D => 0xFFFD,
228     0x9E => 0x017E,
229     0x9F => 0x0178,
230     }; # $charref_map
## Every code point listed below is additionally mapped to U+FFFD:
## control characters other than HT, LF, FF and CR, DELETE, the
## surrogate range, 0xFDD0..0xFDDF, and the last two code points of each
## plane.  None of them collides with a key of the literal above.  The
## "ISSUE" remark flags that the 0xFDD0 range stops at 0xFDDF here
## rather than at 0xFDEF.
231     $charref_map->{$_} = 0xFFFD
232     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
233     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
234     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
235     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
236     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
237     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
238     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
239    
240     ## Implementations MUST act as if they used the state machine in the spec.
241    
## _initialize_tokenizer ($self)
##
## Resets the tokenizer-related fields of |$self| to their initial
## values and reads the first input character.  Fields that are only
## listed in commented-out lines are not touched here.
242     sub _initialize_tokenizer ($) {
243     my $self = shift;
244    
245     ## NOTE: Fields set by |new| constructor:
246     #$self->{level}
247     #$self->{set_nc}
248     #$self->{parse_error}
249 wakaba 1.3 #$self->{is_xml} (if XML)
250 wakaba 1.1
251     $self->{state} = DATA_STATE; # MUST start in the data state
252 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
253     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
254 wakaba 1.1 #$self->{entity__value}; # initialized when used
255     #$self->{entity__match}; # initialized when used
256     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model is PCDATA
257     undef $self->{ct}; # current token
258     undef $self->{ca}; # current attribute
259     undef $self->{last_stag_name}; # last emitted start tag name
260     #$self->{prev_state}; # initialized when used
261     delete $self->{self_closing};
262     $self->{char_buffer} = '';
263     $self->{char_buffer_pos} = 0;
264     $self->{nc} = -1; # next input character (-1 is the end-of-file value)
265     #$self->{next_nc}
## |!!!name| lines are macros, presumably expanded when Tokenizer.pm is
## generated from this .src file; this one is expected to load the first
## input character into |$self->{nc}| -- TODO confirm against the
## preprocessor.
266     !!!next-input-character;
267     $self->{token} = []; # queue of pending tokens; |_get_next_token| returns these first
268     # $self->{escape} (not reset here)
269     } # _initialize_tokenizer
270    
271     ## A token has:
272     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
273 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
274 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
275     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
276 wakaba 1.11 ## ->{target} (PI_TOKEN)
277 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
278     ## ->{sysid} (DOCTYPE_TOKEN)
279     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
280     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
281     ## ->{name}
282     ## ->{value}
283     ## ->{has_reference} == 1 or 0
284 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
285     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
286 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
287 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
288 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
289    
290 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
291     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
292     ## while the token is pushed back to the stack.
293    
294     ## Emitted token MUST immediately be handled by the tree construction state.
295    
296     ## Before each step, UA MAY check to see if either one of the scripts in
297     ## "list of scripts that will execute as soon as possible" or the first
298     ## script in the "list of scripts that will execute asynchronously",
299     ## has completed loading. If one has, then it MUST be executed
300     ## and removed from the list.
301    
302     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
303     ## (This requirement was dropped from HTML5 spec, unfortunately.)
304    
## Set of code points treated as white space by the tokenizer, keyed by
## code point so it can be indexed directly with |$self->{nc}|.
## LINE TABULATION (0x000B) and CARRIAGE RETURN (0x000D) are listed only
## as commented-out entries and are therefore not members.
305     my $is_space = {
306     0x0009 => 1, # CHARACTER TABULATION (HT)
307     0x000A => 1, # LINE FEED (LF)
308     #0x000B => 0, # LINE TABULATION (VT)
309 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
310 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
311     0x0020 => 1, # SPACE (SP)
312     };
313    
314     sub _get_next_token ($) {
315     my $self = shift;
316    
317     if ($self->{self_closing}) {
318     !!!parse-error (type => 'nestc', token => $self->{ct});
319     ## NOTE: The |self_closing| flag is only set by start tag token.
320     ## In addition, when a start tag token is emitted, it is always set to
321     ## |ct|.
322     delete $self->{self_closing};
323     }
324    
325     if (@{$self->{token}}) {
326     $self->{self_closing} = $self->{token}->[0]->{self_closing};
327     return shift @{$self->{token}};
328     }
329    
330     A: {
331     if ($self->{state} == PCDATA_STATE) {
332     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
333    
334     if ($self->{nc} == 0x0026) { # &
335     !!!cp (0.1);
336     ## NOTE: In the spec, the tokenizer is switched to the
337     ## "entity data state". In this implementation, the tokenizer
338     ## is switched to the |ENTITY_STATE|, which is an implementation
339     ## of the "consume a character reference" algorithm.
340     $self->{entity_add} = -1;
341     $self->{prev_state} = DATA_STATE;
342     $self->{state} = ENTITY_STATE;
343     !!!next-input-character;
344     redo A;
345     } elsif ($self->{nc} == 0x003C) { # <
346     !!!cp (0.2);
347     $self->{state} = TAG_OPEN_STATE;
348     !!!next-input-character;
349     redo A;
350     } elsif ($self->{nc} == -1) {
351     !!!cp (0.3);
352     !!!emit ({type => END_OF_FILE_TOKEN,
353     line => $self->{line}, column => $self->{column}});
354     last A; ## TODO: ok?
355     } else {
356     !!!cp (0.4);
357     #
358     }
359    
360     # Anything else
361     my $token = {type => CHARACTER_TOKEN,
362     data => chr $self->{nc},
363     line => $self->{line}, column => $self->{column},
364     };
365     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
366    
367     ## Stay in the state.
368     !!!next-input-character;
369     !!!emit ($token);
370     redo A;
371     } elsif ($self->{state} == DATA_STATE) {
372     $self->{s_kwd} = '' unless defined $self->{s_kwd};
373     if ($self->{nc} == 0x0026) { # &
374     $self->{s_kwd} = '';
375     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
376     not $self->{escape}) {
377     !!!cp (1);
378     ## NOTE: In the spec, the tokenizer is switched to the
379     ## "entity data state". In this implementation, the tokenizer
380     ## is switched to the |ENTITY_STATE|, which is an implementation
381     ## of the "consume a character reference" algorithm.
382     $self->{entity_add} = -1;
383     $self->{prev_state} = DATA_STATE;
384     $self->{state} = ENTITY_STATE;
385     !!!next-input-character;
386     redo A;
387     } else {
388     !!!cp (2);
389     #
390     }
391     } elsif ($self->{nc} == 0x002D) { # -
392     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
393 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
394 wakaba 1.1 !!!cp (3);
395     $self->{escape} = 1; # unless $self->{escape};
396     $self->{s_kwd} = '--';
397     #
398 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
399 wakaba 1.1 !!!cp (4);
400     $self->{s_kwd} = '--';
401     #
402 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
403     !!!cp (4.1);
404     $self->{s_kwd} .= '-';
405     #
406 wakaba 1.1 } else {
407     !!!cp (5);
408 wakaba 1.5 $self->{s_kwd} = '-';
409 wakaba 1.1 #
410     }
411     }
412    
413     #
414     } elsif ($self->{nc} == 0x0021) { # !
415     if (length $self->{s_kwd}) {
416     !!!cp (5.1);
417     $self->{s_kwd} .= '!';
418     #
419     } else {
420     !!!cp (5.2);
421     #$self->{s_kwd} = '';
422     #
423     }
424     #
425     } elsif ($self->{nc} == 0x003C) { # <
426     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
427     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
428     not $self->{escape})) {
429     !!!cp (6);
430     $self->{state} = TAG_OPEN_STATE;
431     !!!next-input-character;
432     redo A;
433     } else {
434     !!!cp (7);
435     $self->{s_kwd} = '';
436     #
437     }
438     } elsif ($self->{nc} == 0x003E) { # >
439     if ($self->{escape} and
440     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
441     if ($self->{s_kwd} eq '--') {
442     !!!cp (8);
443     delete $self->{escape};
444 wakaba 1.5 #
445 wakaba 1.1 } else {
446     !!!cp (9);
447 wakaba 1.5 #
448 wakaba 1.1 }
449 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
450     !!!cp (9.1);
451     !!!parse-error (type => 'unmatched mse', ## TODO: type
452     line => $self->{line_prev},
453     column => $self->{column_prev} - 1);
454     #
455 wakaba 1.1 } else {
456     !!!cp (10);
457 wakaba 1.5 #
458 wakaba 1.1 }
459    
460     $self->{s_kwd} = '';
461     #
462 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
463     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
464     !!!cp (10.1);
465     $self->{s_kwd} .= ']';
466     } elsif ($self->{s_kwd} eq ']]') {
467     !!!cp (10.2);
468     #
469     } else {
470     !!!cp (10.3);
471     $self->{s_kwd} = '';
472     }
473     #
474 wakaba 1.1 } elsif ($self->{nc} == -1) {
475     !!!cp (11);
476     $self->{s_kwd} = '';
477     !!!emit ({type => END_OF_FILE_TOKEN,
478     line => $self->{line}, column => $self->{column}});
479     last A; ## TODO: ok?
480     } else {
481     !!!cp (12);
482     $self->{s_kwd} = '';
483     #
484     }
485    
486     # Anything else
487     my $token = {type => CHARACTER_TOKEN,
488     data => chr $self->{nc},
489     line => $self->{line}, column => $self->{column},
490     };
491 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
492 wakaba 1.1 length $token->{data})) {
493     $self->{s_kwd} = '';
494     }
495    
496     ## Stay in the data state.
497 wakaba 1.5 if (not $self->{is_xml} and
498     $self->{content_model} == PCDATA_CONTENT_MODEL) {
499 wakaba 1.1 !!!cp (13);
500     $self->{state} = PCDATA_STATE;
501     } else {
502     !!!cp (14);
503     ## Stay in the state.
504     }
505     !!!next-input-character;
506     !!!emit ($token);
507     redo A;
508     } elsif ($self->{state} == TAG_OPEN_STATE) {
509 wakaba 1.10 ## XML5: "tag state".
510    
511 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
512     if ($self->{nc} == 0x002F) { # /
513     !!!cp (15);
514     !!!next-input-character;
515     $self->{state} = CLOSE_TAG_OPEN_STATE;
516     redo A;
517     } elsif ($self->{nc} == 0x0021) { # !
518     !!!cp (15.1);
519 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
520 wakaba 1.1 #
521     } else {
522     !!!cp (16);
523 wakaba 1.12 $self->{s_kwd} = '';
524 wakaba 1.1 #
525     }
526    
527     ## reconsume
528     $self->{state} = DATA_STATE;
529     !!!emit ({type => CHARACTER_TOKEN, data => '<',
530     line => $self->{line_prev},
531     column => $self->{column_prev},
532     });
533     redo A;
534     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
535     if ($self->{nc} == 0x0021) { # !
536     !!!cp (17);
537     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
538     !!!next-input-character;
539     redo A;
540     } elsif ($self->{nc} == 0x002F) { # /
541     !!!cp (18);
542     $self->{state} = CLOSE_TAG_OPEN_STATE;
543     !!!next-input-character;
544     redo A;
545     } elsif (0x0041 <= $self->{nc} and
546     $self->{nc} <= 0x005A) { # A..Z
547     !!!cp (19);
548     $self->{ct}
549     = {type => START_TAG_TOKEN,
550 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
551 wakaba 1.1 line => $self->{line_prev},
552     column => $self->{column_prev}};
553     $self->{state} = TAG_NAME_STATE;
554     !!!next-input-character;
555     redo A;
556     } elsif (0x0061 <= $self->{nc} and
557     $self->{nc} <= 0x007A) { # a..z
558     !!!cp (20);
559     $self->{ct} = {type => START_TAG_TOKEN,
560     tag_name => chr ($self->{nc}),
561     line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif ($self->{nc} == 0x003E) { # >
567     !!!cp (21);
568     !!!parse-error (type => 'empty start tag',
569     line => $self->{line_prev},
570     column => $self->{column_prev});
571     $self->{state} = DATA_STATE;
572 wakaba 1.5 $self->{s_kwd} = '';
573 wakaba 1.1 !!!next-input-character;
574    
575     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
576     line => $self->{line_prev},
577     column => $self->{column_prev},
578     });
579    
580     redo A;
581     } elsif ($self->{nc} == 0x003F) { # ?
582 wakaba 1.8 if ($self->{is_xml}) {
583     !!!cp (22.1);
584     $self->{state} = PI_STATE;
585     !!!next-input-character;
586     redo A;
587     } else {
588     !!!cp (22);
589     !!!parse-error (type => 'pio',
590     line => $self->{line_prev},
591     column => $self->{column_prev});
592     $self->{state} = BOGUS_COMMENT_STATE;
593     $self->{ct} = {type => COMMENT_TOKEN, data => '',
594     line => $self->{line_prev},
595     column => $self->{column_prev},
596     };
597     ## $self->{nc} is intentionally left as is
598     redo A;
599     }
600 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
601 wakaba 1.1 !!!cp (23);
602     !!!parse-error (type => 'bare stago',
603     line => $self->{line_prev},
604     column => $self->{column_prev});
605     $self->{state} = DATA_STATE;
606 wakaba 1.5 $self->{s_kwd} = '';
607 wakaba 1.1 ## reconsume
608    
609     !!!emit ({type => CHARACTER_TOKEN, data => '<',
610     line => $self->{line_prev},
611     column => $self->{column_prev},
612     });
613    
614     redo A;
615 wakaba 1.9 } else {
616     ## XML5: "<:" is a parse error.
617     !!!cp (23.1);
618     $self->{ct} = {type => START_TAG_TOKEN,
619     tag_name => chr ($self->{nc}),
620     line => $self->{line_prev},
621     column => $self->{column_prev}};
622     $self->{state} = TAG_NAME_STATE;
623     !!!next-input-character;
624     redo A;
625 wakaba 1.1 }
626     } else {
627     die "$0: $self->{content_model} in tag open";
628     }
629     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
630     ## NOTE: The "close tag open state" in the spec is implemented as
631     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
632    
633 wakaba 1.10 ## XML5: "end tag state".
634    
635 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
636     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
637     if (defined $self->{last_stag_name}) {
638     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
639 wakaba 1.12 $self->{kwd} = '';
640 wakaba 1.1 ## Reconsume.
641     redo A;
642     } else {
643     ## No start tag token has ever been emitted
644     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
645     !!!cp (28);
646     $self->{state} = DATA_STATE;
647 wakaba 1.5 $self->{s_kwd} = '';
648 wakaba 1.1 ## Reconsume.
649     !!!emit ({type => CHARACTER_TOKEN, data => '</',
650     line => $l, column => $c,
651     });
652     redo A;
653     }
654     }
655    
656     if (0x0041 <= $self->{nc} and
657     $self->{nc} <= 0x005A) { # A..Z
658     !!!cp (29);
659     $self->{ct}
660     = {type => END_TAG_TOKEN,
661 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
662 wakaba 1.1 line => $l, column => $c};
663     $self->{state} = TAG_NAME_STATE;
664     !!!next-input-character;
665     redo A;
666     } elsif (0x0061 <= $self->{nc} and
667     $self->{nc} <= 0x007A) { # a..z
668     !!!cp (30);
669     $self->{ct} = {type => END_TAG_TOKEN,
670     tag_name => chr ($self->{nc}),
671     line => $l, column => $c};
672     $self->{state} = TAG_NAME_STATE;
673     !!!next-input-character;
674     redo A;
675     } elsif ($self->{nc} == 0x003E) { # >
676     !!!parse-error (type => 'empty end tag',
677     line => $self->{line_prev}, ## "<" in "</>"
678     column => $self->{column_prev} - 1);
679     $self->{state} = DATA_STATE;
680 wakaba 1.5 $self->{s_kwd} = '';
681 wakaba 1.10 if ($self->{is_xml}) {
682     !!!cp (31);
683     ## XML5: No parse error.
684    
685     ## NOTE: This parser raises a parse error, since it supports
686     ## XML1, not XML5.
687    
688     ## NOTE: A short end tag token.
689     my $ct = {type => END_TAG_TOKEN,
690     tag_name => '',
691     line => $self->{line_prev},
692     column => $self->{column_prev} - 1,
693     };
694     !!!next-input-character;
695     !!!emit ($ct);
696     } else {
697     !!!cp (31.1);
698     !!!next-input-character;
699     }
700 wakaba 1.1 redo A;
701     } elsif ($self->{nc} == -1) {
702     !!!cp (32);
703     !!!parse-error (type => 'bare etago');
704 wakaba 1.5 $self->{s_kwd} = '';
705 wakaba 1.1 $self->{state} = DATA_STATE;
706     # reconsume
707    
708     !!!emit ({type => CHARACTER_TOKEN, data => '</',
709     line => $l, column => $c,
710     });
711    
712     redo A;
713 wakaba 1.10 } elsif (not $self->{is_xml} or
714     $is_space->{$self->{nc}}) {
715 wakaba 1.1 !!!cp (33);
716 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
717     line => $self->{line_prev}, # "<" of "</"
718     column => $self->{column_prev} - 1);
719 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
720     $self->{ct} = {type => COMMENT_TOKEN, data => '',
721     line => $self->{line_prev}, # "<" of "</"
722     column => $self->{column_prev} - 1,
723     };
724     ## NOTE: $self->{nc} is intentionally left as is.
725     ## Although the "anything else" case of the spec does not explicitly
726     ## state that the next input character is to be reconsumed,
727     ## it will be included to the |data| of the comment token
728     ## generated from the bogus end tag, as defined in the
729     ## "bogus comment state" entry.
730     redo A;
731 wakaba 1.10 } else {
732     ## XML5: "</:" is a parse error.
733     !!!cp (30.1);
734     $self->{ct} = {type => END_TAG_TOKEN,
735     tag_name => chr ($self->{nc}),
736     line => $l, column => $c};
737     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
738     !!!next-input-character;
739     redo A;
740 wakaba 1.1 }
741     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
742 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
743 wakaba 1.1 if (length $ch) {
744     my $CH = $ch;
745     $ch =~ tr/a-z/A-Z/;
746     my $nch = chr $self->{nc};
747     if ($nch eq $ch or $nch eq $CH) {
748     !!!cp (24);
749     ## Stay in the state.
750 wakaba 1.12 $self->{kwd} .= $nch;
751 wakaba 1.1 !!!next-input-character;
752     redo A;
753     } else {
754     !!!cp (25);
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## Reconsume.
758     !!!emit ({type => CHARACTER_TOKEN,
759 wakaba 1.12 data => '</' . $self->{kwd},
760 wakaba 1.1 line => $self->{line_prev},
761 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
762 wakaba 1.1 });
763     redo A;
764     }
765     } else { # after "<{tag-name}"
766     unless ($is_space->{$self->{nc}} or
767     {
768     0x003E => 1, # >
769     0x002F => 1, # /
770     -1 => 1, # EOF
771     }->{$self->{nc}}) {
772     !!!cp (26);
773     ## Reconsume.
774     $self->{state} = DATA_STATE;
775 wakaba 1.5 $self->{s_kwd} = '';
776 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
777 wakaba 1.12 data => '</' . $self->{kwd},
778 wakaba 1.1 line => $self->{line_prev},
779 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
780 wakaba 1.1 });
781     redo A;
782     } else {
783     !!!cp (27);
784     $self->{ct}
785     = {type => END_TAG_TOKEN,
786     tag_name => $self->{last_stag_name},
787     line => $self->{line_prev},
788 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
789 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
790     ## Reconsume.
791     redo A;
792     }
793     }
794     } elsif ($self->{state} == TAG_NAME_STATE) {
795     if ($is_space->{$self->{nc}}) {
796     !!!cp (34);
797     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
798     !!!next-input-character;
799     redo A;
800     } elsif ($self->{nc} == 0x003E) { # >
801     if ($self->{ct}->{type} == START_TAG_TOKEN) {
802     !!!cp (35);
803     $self->{last_stag_name} = $self->{ct}->{tag_name};
804     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
805     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
806     #if ($self->{ct}->{attributes}) {
807     # ## NOTE: This should never be reached.
808     # !!! cp (36);
809     # !!! parse-error (type => 'end tag attribute');
810     #} else {
811     !!!cp (37);
812     #}
813     } else {
814     die "$0: $self->{ct}->{type}: Unknown token type";
815     }
816     $self->{state} = DATA_STATE;
817 wakaba 1.5 $self->{s_kwd} = '';
818 wakaba 1.1 !!!next-input-character;
819    
820     !!!emit ($self->{ct}); # start tag or end tag
821    
822     redo A;
823     } elsif (0x0041 <= $self->{nc} and
824     $self->{nc} <= 0x005A) { # A..Z
825     !!!cp (38);
826 wakaba 1.4 $self->{ct}->{tag_name}
827     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
828 wakaba 1.1 # start tag or end tag
829     ## Stay in this state
830     !!!next-input-character;
831     redo A;
832     } elsif ($self->{nc} == -1) {
833     !!!parse-error (type => 'unclosed tag');
834     if ($self->{ct}->{type} == START_TAG_TOKEN) {
835     !!!cp (39);
836     $self->{last_stag_name} = $self->{ct}->{tag_name};
837     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
838     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
839     #if ($self->{ct}->{attributes}) {
840     # ## NOTE: This state should never be reached.
841     # !!! cp (40);
842     # !!! parse-error (type => 'end tag attribute');
843     #} else {
844     !!!cp (41);
845     #}
846     } else {
847     die "$0: $self->{ct}->{type}: Unknown token type";
848     }
849     $self->{state} = DATA_STATE;
850 wakaba 1.5 $self->{s_kwd} = '';
851 wakaba 1.1 # reconsume
852    
853     !!!emit ($self->{ct}); # start tag or end tag
854    
855     redo A;
856     } elsif ($self->{nc} == 0x002F) { # /
857     !!!cp (42);
858     $self->{state} = SELF_CLOSING_START_TAG_STATE;
859     !!!next-input-character;
860     redo A;
861     } else {
862     !!!cp (44);
863     $self->{ct}->{tag_name} .= chr $self->{nc};
864     # start tag or end tag
865     ## Stay in the state
866     !!!next-input-character;
867     redo A;
868     }
869     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
870 wakaba 1.11 ## XML5: "Tag attribute name before state".
871    
872 wakaba 1.1 if ($is_space->{$self->{nc}}) {
873     !!!cp (45);
874     ## Stay in the state
875     !!!next-input-character;
876     redo A;
877     } elsif ($self->{nc} == 0x003E) { # >
878     if ($self->{ct}->{type} == START_TAG_TOKEN) {
879     !!!cp (46);
880     $self->{last_stag_name} = $self->{ct}->{tag_name};
881     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
882     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
883     if ($self->{ct}->{attributes}) {
884     !!!cp (47);
885     !!!parse-error (type => 'end tag attribute');
886     } else {
887     !!!cp (48);
888     }
889     } else {
890     die "$0: $self->{ct}->{type}: Unknown token type";
891     }
892     $self->{state} = DATA_STATE;
893 wakaba 1.5 $self->{s_kwd} = '';
894 wakaba 1.1 !!!next-input-character;
895    
896     !!!emit ($self->{ct}); # start tag or end tag
897    
898     redo A;
899     } elsif (0x0041 <= $self->{nc} and
900     $self->{nc} <= 0x005A) { # A..Z
901     !!!cp (49);
902     $self->{ca}
903 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
904 wakaba 1.1 value => '',
905     line => $self->{line}, column => $self->{column}};
906     $self->{state} = ATTRIBUTE_NAME_STATE;
907     !!!next-input-character;
908     redo A;
909     } elsif ($self->{nc} == 0x002F) { # /
910     !!!cp (50);
911     $self->{state} = SELF_CLOSING_START_TAG_STATE;
912     !!!next-input-character;
913     redo A;
914     } elsif ($self->{nc} == -1) {
915     !!!parse-error (type => 'unclosed tag');
916     if ($self->{ct}->{type} == START_TAG_TOKEN) {
917     !!!cp (52);
918     $self->{last_stag_name} = $self->{ct}->{tag_name};
919     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
920     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
921     if ($self->{ct}->{attributes}) {
922     !!!cp (53);
923     !!!parse-error (type => 'end tag attribute');
924     } else {
925     !!!cp (54);
926     }
927     } else {
928     die "$0: $self->{ct}->{type}: Unknown token type";
929     }
930     $self->{state} = DATA_STATE;
931 wakaba 1.5 $self->{s_kwd} = '';
932 wakaba 1.1 # reconsume
933    
934     !!!emit ($self->{ct}); # start tag or end tag
935    
936     redo A;
937     } else {
938     if ({
939     0x0022 => 1, # "
940     0x0027 => 1, # '
941     0x003D => 1, # =
942     }->{$self->{nc}}) {
943     !!!cp (55);
944 wakaba 1.11 ## XML5: Not a parse error.
945 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
946     } else {
947     !!!cp (56);
948 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
949 wakaba 1.1 }
950     $self->{ca}
951     = {name => chr ($self->{nc}),
952     value => '',
953     line => $self->{line}, column => $self->{column}};
954     $self->{state} = ATTRIBUTE_NAME_STATE;
955     !!!next-input-character;
956     redo A;
957     }
958     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
959 wakaba 1.11 ## XML5: "Tag attribute name state".
960    
961 wakaba 1.1 my $before_leave = sub { ## Run on every exit from the attribute name state: commits the attribute held in $self->{ca} to the current token, or reports it as a duplicate.
962     if (exists $self->{ct}->{attributes} # start tag or end tag; NOTE: this nested exists() autovivifies {attributes} when it was undefined
963     ->{$self->{ca}->{name}}) { # MUST
964     !!!cp (57);
965     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
966     ## Discard $self->{ca} # MUST (nothing is stored; the earlier attribute of the same name is kept)
967     } else { ## First attribute with this name on the current token.
968     !!!cp (58);
969     $self->{ct}->{attributes}->{$self->{ca}->{name}}
970     = $self->{ca};
971 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index}; ## Per-token running counter; records the order in which attributes were stored.
972 wakaba 1.1 }
973     }; # $before_leave
974    
975     if ($is_space->{$self->{nc}}) {
976     !!!cp (59);
977     $before_leave->();
978     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
979     !!!next-input-character;
980     redo A;
981     } elsif ($self->{nc} == 0x003D) { # =
982     !!!cp (60);
983     $before_leave->();
984     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
985     !!!next-input-character;
986     redo A;
987     } elsif ($self->{nc} == 0x003E) { # >
988 wakaba 1.11 if ($self->{is_xml}) {
989     !!!cp (60.1);
990     ## XML5: Not a parse error.
991     !!!parse-error (type => 'no attr value'); ## TODO: type
992     } else {
993     !!!cp (60.2);
994     }
995    
996 wakaba 1.1 $before_leave->();
997     if ($self->{ct}->{type} == START_TAG_TOKEN) {
998     !!!cp (61);
999     $self->{last_stag_name} = $self->{ct}->{tag_name};
1000     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1001     !!!cp (62);
1002     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1003     if ($self->{ct}->{attributes}) {
1004     !!!parse-error (type => 'end tag attribute');
1005     }
1006     } else {
1007     die "$0: $self->{ct}->{type}: Unknown token type";
1008     }
1009     $self->{state} = DATA_STATE;
1010 wakaba 1.5 $self->{s_kwd} = '';
1011 wakaba 1.1 !!!next-input-character;
1012    
1013     !!!emit ($self->{ct}); # start tag or end tag
1014    
1015     redo A;
1016     } elsif (0x0041 <= $self->{nc} and
1017     $self->{nc} <= 0x005A) { # A..Z
1018     !!!cp (63);
1019 wakaba 1.4 $self->{ca}->{name}
1020     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1021 wakaba 1.1 ## Stay in the state
1022     !!!next-input-character;
1023     redo A;
1024     } elsif ($self->{nc} == 0x002F) { # /
1025 wakaba 1.11 if ($self->{is_xml}) {
1026     !!!cp (64);
1027     ## XML5: Not a parse error.
1028     !!!parse-error (type => 'no attr value'); ## TODO: type
1029     } else {
1030     !!!cp (64.1);
1031     }
1032    
1033 wakaba 1.1 $before_leave->();
1034     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1035     !!!next-input-character;
1036     redo A;
1037     } elsif ($self->{nc} == -1) {
1038     !!!parse-error (type => 'unclosed tag');
1039     $before_leave->();
1040     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1041     !!!cp (66);
1042     $self->{last_stag_name} = $self->{ct}->{tag_name};
1043     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1044     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1045     if ($self->{ct}->{attributes}) {
1046     !!!cp (67);
1047     !!!parse-error (type => 'end tag attribute');
1048     } else {
1049     ## NOTE: This state should never be reached.
1050     !!!cp (68);
1051     }
1052     } else {
1053     die "$0: $self->{ct}->{type}: Unknown token type";
1054     }
1055     $self->{state} = DATA_STATE;
1056 wakaba 1.5 $self->{s_kwd} = '';
1057 wakaba 1.1 # reconsume
1058    
1059     !!!emit ($self->{ct}); # start tag or end tag
1060    
1061     redo A;
1062     } else {
1063     if ($self->{nc} == 0x0022 or # "
1064     $self->{nc} == 0x0027) { # '
1065     !!!cp (69);
1066 wakaba 1.11 ## XML5: Not a parse error.
1067 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1068     } else {
1069     !!!cp (70);
1070     }
1071     $self->{ca}->{name} .= chr ($self->{nc});
1072     ## Stay in the state
1073     !!!next-input-character;
1074     redo A;
1075     }
1076     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1077 wakaba 1.11 ## XML5: "Tag attribute name after state".
1078    
1079 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1080     !!!cp (71);
1081     ## Stay in the state
1082     !!!next-input-character;
1083     redo A;
1084     } elsif ($self->{nc} == 0x003D) { # =
1085     !!!cp (72);
1086     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1087     !!!next-input-character;
1088     redo A;
1089     } elsif ($self->{nc} == 0x003E) { # >
1090 wakaba 1.11 if ($self->{is_xml}) {
1091     !!!cp (72.1);
1092     ## XML5: Not a parse error.
1093     !!!parse-error (type => 'no attr value'); ## TODO: type
1094     } else {
1095     !!!cp (72.2);
1096     }
1097    
1098 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1099     !!!cp (73);
1100     $self->{last_stag_name} = $self->{ct}->{tag_name};
1101     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1102     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1103     if ($self->{ct}->{attributes}) {
1104     !!!cp (74);
1105     !!!parse-error (type => 'end tag attribute');
1106     } else {
1107     ## NOTE: This state should never be reached.
1108     !!!cp (75);
1109     }
1110     } else {
1111     die "$0: $self->{ct}->{type}: Unknown token type";
1112     }
1113     $self->{state} = DATA_STATE;
1114 wakaba 1.5 $self->{s_kwd} = '';
1115 wakaba 1.1 !!!next-input-character;
1116    
1117     !!!emit ($self->{ct}); # start tag or end tag
1118    
1119     redo A;
1120     } elsif (0x0041 <= $self->{nc} and
1121     $self->{nc} <= 0x005A) { # A..Z
1122     !!!cp (76);
1123     $self->{ca}
1124 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1125 wakaba 1.1 value => '',
1126     line => $self->{line}, column => $self->{column}};
1127     $self->{state} = ATTRIBUTE_NAME_STATE;
1128     !!!next-input-character;
1129     redo A;
1130     } elsif ($self->{nc} == 0x002F) { # /
1131 wakaba 1.11 if ($self->{is_xml}) {
1132     !!!cp (77);
1133     ## XML5: Not a parse error.
1134     !!!parse-error (type => 'no attr value'); ## TODO: type
1135     } else {
1136     !!!cp (77.1);
1137     }
1138    
1139 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1140     !!!next-input-character;
1141     redo A;
1142     } elsif ($self->{nc} == -1) {
1143     !!!parse-error (type => 'unclosed tag');
1144     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1145     !!!cp (79);
1146     $self->{last_stag_name} = $self->{ct}->{tag_name};
1147     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1148     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1149     if ($self->{ct}->{attributes}) {
1150     !!!cp (80);
1151     !!!parse-error (type => 'end tag attribute');
1152     } else {
1153     ## NOTE: This state should never be reached.
1154     !!!cp (81);
1155     }
1156     } else {
1157     die "$0: $self->{ct}->{type}: Unknown token type";
1158     }
1159 wakaba 1.5 $self->{s_kwd} = '';
1160 wakaba 1.1 $self->{state} = DATA_STATE;
1161     # reconsume
1162    
1163     !!!emit ($self->{ct}); # start tag or end tag
1164    
1165     redo A;
1166     } else {
1167 wakaba 1.11 if ($self->{is_xml}) {
1168     !!!cp (78.1);
1169     ## XML5: Not a parse error.
1170     !!!parse-error (type => 'no attr value'); ## TODO: type
1171     } else {
1172     !!!cp (78.2);
1173     }
1174    
1175 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1176     $self->{nc} == 0x0027) { # '
1177     !!!cp (78);
1178 wakaba 1.11 ## XML5: Not a parse error.
1179 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1180     } else {
1181     !!!cp (82);
1182     }
1183     $self->{ca}
1184     = {name => chr ($self->{nc}),
1185     value => '',
1186     line => $self->{line}, column => $self->{column}};
1187     $self->{state} = ATTRIBUTE_NAME_STATE;
1188     !!!next-input-character;
1189     redo A;
1190     }
1191     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1192 wakaba 1.11 ## XML5: "Tag attribute value before state".
1193    
1194 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1195     !!!cp (83);
1196     ## Stay in the state
1197     !!!next-input-character;
1198     redo A;
1199     } elsif ($self->{nc} == 0x0022) { # "
1200     !!!cp (84);
1201     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1202     !!!next-input-character;
1203     redo A;
1204     } elsif ($self->{nc} == 0x0026) { # &
1205     !!!cp (85);
1206     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1207     ## reconsume
1208     redo A;
1209     } elsif ($self->{nc} == 0x0027) { # '
1210     !!!cp (86);
1211     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x003E) { # >
1215     !!!parse-error (type => 'empty unquoted attribute value');
1216     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1217     !!!cp (87);
1218     $self->{last_stag_name} = $self->{ct}->{tag_name};
1219     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1220     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1221     if ($self->{ct}->{attributes}) {
1222     !!!cp (88);
1223     !!!parse-error (type => 'end tag attribute');
1224     } else {
1225     ## NOTE: This state should never be reached.
1226     !!!cp (89);
1227     }
1228     } else {
1229     die "$0: $self->{ct}->{type}: Unknown token type";
1230     }
1231     $self->{state} = DATA_STATE;
1232 wakaba 1.5 $self->{s_kwd} = '';
1233 wakaba 1.1 !!!next-input-character;
1234    
1235     !!!emit ($self->{ct}); # start tag or end tag
1236    
1237     redo A;
1238     } elsif ($self->{nc} == -1) {
1239     !!!parse-error (type => 'unclosed tag');
1240     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1241     !!!cp (90);
1242     $self->{last_stag_name} = $self->{ct}->{tag_name};
1243     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1244     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1245     if ($self->{ct}->{attributes}) {
1246     !!!cp (91);
1247     !!!parse-error (type => 'end tag attribute');
1248     } else {
1249     ## NOTE: This state should never be reached.
1250     !!!cp (92);
1251     }
1252     } else {
1253     die "$0: $self->{ct}->{type}: Unknown token type";
1254     }
1255     $self->{state} = DATA_STATE;
1256 wakaba 1.5 $self->{s_kwd} = '';
1257 wakaba 1.1 ## reconsume
1258    
1259     !!!emit ($self->{ct}); # start tag or end tag
1260    
1261     redo A;
1262     } else {
1263     if ($self->{nc} == 0x003D) { # =
1264     !!!cp (93);
1265 wakaba 1.11 ## XML5: Not a parse error.
1266 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1267 wakaba 1.11 } elsif ($self->{is_xml}) {
1268     !!!cp (93.1);
1269     ## XML5: No parse error.
1270     !!!parse-error (type => 'unquoted attr value'); ## TODO
1271 wakaba 1.1 } else {
1272     !!!cp (94);
1273     }
1274     $self->{ca}->{value} .= chr ($self->{nc});
1275     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1276     !!!next-input-character;
1277     redo A;
1278     }
1279     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1280 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1281     ## ATTLIST attribute value double quoted state".
1282 wakaba 1.11
1283 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1284 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1285     !!!cp (95.1);
1286     ## XML5: "DOCTYPE ATTLIST name after state".
1287     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1288     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1289     } else {
1290     !!!cp (95);
1291     ## XML5: "Tag attribute name before state".
1292     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1293     }
1294 wakaba 1.1 !!!next-input-character;
1295     redo A;
1296     } elsif ($self->{nc} == 0x0026) { # &
1297     !!!cp (96);
1298 wakaba 1.11 ## XML5: Not defined yet.
1299    
1300 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1301     ## "entity in attribute value state". In this implementation, the
1302     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1303     ## implementation of the "consume a character reference" algorithm.
1304     $self->{prev_state} = $self->{state};
1305     $self->{entity_add} = 0x0022; # "
1306     $self->{state} = ENTITY_STATE;
1307     !!!next-input-character;
1308     redo A;
1309     } elsif ($self->{nc} == -1) {
1310     !!!parse-error (type => 'unclosed attribute value');
1311     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1312     !!!cp (97);
1313     $self->{last_stag_name} = $self->{ct}->{tag_name};
1314 wakaba 1.15
1315     $self->{state} = DATA_STATE;
1316     $self->{s_kwd} = '';
1317     ## reconsume
1318     !!!emit ($self->{ct}); # start tag
1319     redo A;
1320 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1321     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1322     if ($self->{ct}->{attributes}) {
1323     !!!cp (98);
1324     !!!parse-error (type => 'end tag attribute');
1325     } else {
1326     ## NOTE: This state should never be reached.
1327     !!!cp (99);
1328     }
1329 wakaba 1.15
1330     $self->{state} = DATA_STATE;
1331     $self->{s_kwd} = '';
1332     ## reconsume
1333     !!!emit ($self->{ct}); # end tag
1334     redo A;
1335     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1336     ## XML5: No parse error above; not defined yet.
1337     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1338     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1339     ## Reconsume.
1340     !!!emit ($self->{ct}); # ATTLIST
1341     redo A;
1342 wakaba 1.1 } else {
1343     die "$0: $self->{ct}->{type}: Unknown token type";
1344     }
1345     } else {
1346 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1347 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1348     !!!cp (100);
1349     ## XML5: Not a parse error.
1350     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1351     } else {
1352     !!!cp (100.1);
1353     }
1354 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1355     $self->{read_until}->($self->{ca}->{value},
1356 wakaba 1.11 q["&<],
1357 wakaba 1.1 length $self->{ca}->{value});
1358    
1359     ## Stay in the state
1360     !!!next-input-character;
1361     redo A;
1362     }
1363     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1364 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1365     ## ATTLIST attribute value single quoted state".
1366 wakaba 1.11
1367 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1368 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1369     !!!cp (101.1);
1370     ## XML5: "DOCTYPE ATTLIST name after state".
1371     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1372     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1373     } else {
1374     !!!cp (101);
1375     ## XML5: "Before attribute name state" (sic).
1376     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1377     }
1378 wakaba 1.1 !!!next-input-character;
1379     redo A;
1380     } elsif ($self->{nc} == 0x0026) { # &
1381     !!!cp (102);
1382 wakaba 1.11 ## XML5: Not defined yet.
1383    
1384 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1385     ## "entity in attribute value state". In this implementation, the
1386     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1387     ## implementation of the "consume a character reference" algorithm.
1388     $self->{entity_add} = 0x0027; # '
1389     $self->{prev_state} = $self->{state};
1390     $self->{state} = ENTITY_STATE;
1391     !!!next-input-character;
1392     redo A;
1393     } elsif ($self->{nc} == -1) {
1394     !!!parse-error (type => 'unclosed attribute value');
1395     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1396     !!!cp (103);
1397     $self->{last_stag_name} = $self->{ct}->{tag_name};
1398 wakaba 1.15
1399     $self->{state} = DATA_STATE;
1400     $self->{s_kwd} = '';
1401     ## reconsume
1402     !!!emit ($self->{ct}); # start tag
1403     redo A;
1404 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1405     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1406     if ($self->{ct}->{attributes}) {
1407     !!!cp (104);
1408     !!!parse-error (type => 'end tag attribute');
1409     } else {
1410     ## NOTE: This state should never be reached.
1411     !!!cp (105);
1412     }
1413 wakaba 1.15
1414     $self->{state} = DATA_STATE;
1415     $self->{s_kwd} = '';
1416     ## reconsume
1417     !!!emit ($self->{ct}); # end tag
1418     redo A;
1419     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1420     ## XML5: No parse error above; not defined yet.
1421     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1422     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1423     ## Reconsume.
1424     !!!emit ($self->{ct}); # ATTLIST
1425     redo A;
1426 wakaba 1.1 } else {
1427     die "$0: $self->{ct}->{type}: Unknown token type";
1428     }
1429     } else {
1430 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1431 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1432     !!!cp (106);
1433     ## XML5: Not a parse error.
1434     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1435     } else {
1436     !!!cp (106.1);
1437     }
1438 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1439     $self->{read_until}->($self->{ca}->{value},
1440 wakaba 1.11 q['&<],
1441 wakaba 1.1 length $self->{ca}->{value});
1442    
1443     ## Stay in the state
1444     !!!next-input-character;
1445     redo A;
1446     }
1447     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1448 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1449    
1450 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1451 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1452     !!!cp (107.1);
1453     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1454     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1455     } else {
1456     !!!cp (107);
1457     ## XML5: "Tag attribute name before state".
1458     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1459     }
1460 wakaba 1.1 !!!next-input-character;
1461     redo A;
1462     } elsif ($self->{nc} == 0x0026) { # &
1463     !!!cp (108);
1464 wakaba 1.11
1465     ## XML5: Not defined yet.
1466    
1467 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1468     ## "entity in attribute value state". In this implementation, the
1469     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1470     ## implementation of the "consume a character reference" algorithm.
1471     $self->{entity_add} = -1;
1472     $self->{prev_state} = $self->{state};
1473     $self->{state} = ENTITY_STATE;
1474     !!!next-input-character;
1475     redo A;
1476     } elsif ($self->{nc} == 0x003E) { # >
1477     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1478     !!!cp (109);
1479     $self->{last_stag_name} = $self->{ct}->{tag_name};
1480 wakaba 1.15
1481     $self->{state} = DATA_STATE;
1482     $self->{s_kwd} = '';
1483     !!!next-input-character;
1484     !!!emit ($self->{ct}); # start tag
1485     redo A;
1486 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1487     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488     if ($self->{ct}->{attributes}) {
1489     !!!cp (110);
1490     !!!parse-error (type => 'end tag attribute');
1491     } else {
1492     ## NOTE: This state should never be reached.
1493     !!!cp (111);
1494     }
1495 wakaba 1.15
1496     $self->{state} = DATA_STATE;
1497     $self->{s_kwd} = '';
1498     !!!next-input-character;
1499     !!!emit ($self->{ct}); # end tag
1500     redo A;
1501     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1502     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1503     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1504     !!!next-input-character;
1505     !!!emit ($self->{ct}); # ATTLIST
1506     redo A;
1507 wakaba 1.1 } else {
1508     die "$0: $self->{ct}->{type}: Unknown token type";
1509     }
1510     } elsif ($self->{nc} == -1) {
1511     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1512     !!!cp (112);
1513 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1514 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1515 wakaba 1.15
1516     $self->{state} = DATA_STATE;
1517     $self->{s_kwd} = '';
1518     ## reconsume
1519     !!!emit ($self->{ct}); # start tag
1520     redo A;
1521 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1522 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1523 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1524     if ($self->{ct}->{attributes}) {
1525     !!!cp (113);
1526     !!!parse-error (type => 'end tag attribute');
1527     } else {
1528     ## NOTE: This state should never be reached.
1529     !!!cp (114);
1530     }
1531 wakaba 1.15
1532     $self->{state} = DATA_STATE;
1533     $self->{s_kwd} = '';
1534     ## reconsume
1535     !!!emit ($self->{ct}); # end tag
1536     redo A;
1537     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1538     !!!parse-error (type => 'unclosed md'); ## TODO: type
1539     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1541     ## Reconsume.
1542     !!!emit ($self->{ct}); # ATTLIST
1543     redo A;
1544 wakaba 1.1 } else {
1545     die "$0: $self->{ct}->{type}: Unknown token type";
1546     }
1547     } else {
1548     if ({
1549     0x0022 => 1, # "
1550     0x0027 => 1, # '
1551     0x003D => 1, # =
1552     }->{$self->{nc}}) {
1553     !!!cp (115);
1554 wakaba 1.11 ## XML5: Not a parse error.
1555 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1556     } else {
1557     !!!cp (116);
1558     }
1559     $self->{ca}->{value} .= chr ($self->{nc});
1560     $self->{read_until}->($self->{ca}->{value},
1561     q["'=& >],
1562     length $self->{ca}->{value});
1563    
1564     ## Stay in the state
1565     !!!next-input-character;
1566     redo A;
1567     }
1568     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1569     if ($is_space->{$self->{nc}}) {
1570     !!!cp (118);
1571     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1572     !!!next-input-character;
1573     redo A;
1574     } elsif ($self->{nc} == 0x003E) { # >
1575     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1576     !!!cp (119);
1577     $self->{last_stag_name} = $self->{ct}->{tag_name};
1578     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1580     if ($self->{ct}->{attributes}) {
1581     !!!cp (120);
1582     !!!parse-error (type => 'end tag attribute');
1583     } else {
1584     ## NOTE: This state should never be reached.
1585     !!!cp (121);
1586     }
1587     } else {
1588     die "$0: $self->{ct}->{type}: Unknown token type";
1589     }
1590     $self->{state} = DATA_STATE;
1591 wakaba 1.5 $self->{s_kwd} = '';
1592 wakaba 1.1 !!!next-input-character;
1593    
1594     !!!emit ($self->{ct}); # start tag or end tag
1595    
1596     redo A;
1597     } elsif ($self->{nc} == 0x002F) { # /
1598     !!!cp (122);
1599     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1600     !!!next-input-character;
1601     redo A;
1602     } elsif ($self->{nc} == -1) {
1603     !!!parse-error (type => 'unclosed tag');
1604     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1605     !!!cp (122.3);
1606     $self->{last_stag_name} = $self->{ct}->{tag_name};
1607     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1608     if ($self->{ct}->{attributes}) {
1609     !!!cp (122.1);
1610     !!!parse-error (type => 'end tag attribute');
1611     } else {
1612     ## NOTE: This state should never be reached.
1613     !!!cp (122.2);
1614     }
1615     } else {
1616     die "$0: $self->{ct}->{type}: Unknown token type";
1617     }
1618     $self->{state} = DATA_STATE;
1619 wakaba 1.5 $self->{s_kwd} = '';
1620 wakaba 1.1 ## Reconsume.
1621     !!!emit ($self->{ct}); # start tag or end tag
1622     redo A;
1623     } else {
1624     !!!cp ('124.1');
1625     !!!parse-error (type => 'no space between attributes');
1626     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1627     ## reconsume
1628     redo A;
1629     }
1630     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1631 wakaba 1.11 ## XML5: "Empty tag state".
1632    
1633 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1634     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1635     !!!cp ('124.2');
1636     !!!parse-error (type => 'nestc', token => $self->{ct});
1637     ## TODO: Different type than slash in start tag
1638     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1639     if ($self->{ct}->{attributes}) {
1640     !!!cp ('124.4');
1641     !!!parse-error (type => 'end tag attribute');
1642     } else {
1643     !!!cp ('124.5');
1644     }
1645     ## TODO: Test |<title></title/>|
1646     } else {
1647     !!!cp ('124.3');
1648     $self->{self_closing} = 1;
1649     }
1650    
1651     $self->{state} = DATA_STATE;
1652 wakaba 1.5 $self->{s_kwd} = '';
1653 wakaba 1.1 !!!next-input-character;
1654    
1655     !!!emit ($self->{ct}); # start tag or end tag
1656    
1657     redo A;
1658     } elsif ($self->{nc} == -1) {
1659     !!!parse-error (type => 'unclosed tag');
1660     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1661     !!!cp (124.7);
1662     $self->{last_stag_name} = $self->{ct}->{tag_name};
1663     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1664     if ($self->{ct}->{attributes}) {
1665     !!!cp (124.5);
1666     !!!parse-error (type => 'end tag attribute');
1667     } else {
1668     ## NOTE: This state should never be reached.
1669     !!!cp (124.6);
1670     }
1671     } else {
1672     die "$0: $self->{ct}->{type}: Unknown token type";
1673     }
1674 wakaba 1.11 ## XML5: "Tag attribute name before state".
1675 wakaba 1.1 $self->{state} = DATA_STATE;
1676 wakaba 1.5 $self->{s_kwd} = '';
1677 wakaba 1.1 ## Reconsume.
1678     !!!emit ($self->{ct}); # start tag or end tag
1679     redo A;
1680     } else {
1681     !!!cp ('124.4');
1682     !!!parse-error (type => 'nestc');
1683     ## TODO: This error type is wrong.
1684     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1685     ## Reconsume.
1686     redo A;
1687     }
1688     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1689 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1690    
1691 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1692     ## consumes characters on a one-by-one basis.
1693    
1694     if ($self->{nc} == 0x003E) { # >
1695 wakaba 1.13 if ($self->{in_subset}) {
1696     !!!cp (123);
1697     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1698     } else {
1699     !!!cp (124);
1700     $self->{state} = DATA_STATE;
1701     $self->{s_kwd} = '';
1702     }
1703 wakaba 1.1 !!!next-input-character;
1704    
1705     !!!emit ($self->{ct}); # comment
1706     redo A;
1707     } elsif ($self->{nc} == -1) {
1708 wakaba 1.13 if ($self->{in_subset}) {
1709     !!!cp (125.1);
1710     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1711     } else {
1712     !!!cp (125);
1713     $self->{state} = DATA_STATE;
1714     $self->{s_kwd} = '';
1715     }
1716 wakaba 1.1 ## reconsume
1717    
1718     !!!emit ($self->{ct}); # comment
1719     redo A;
1720     } else {
1721     !!!cp (126);
1722     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1723     $self->{read_until}->($self->{ct}->{data},
1724     q[>],
1725     length $self->{ct}->{data});
1726    
1727     ## Stay in the state.
1728     !!!next-input-character;
1729     redo A;
1730     }
1731     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1732 wakaba 1.14 ## XML5: "Markup declaration state".
1733 wakaba 1.1
1734     if ($self->{nc} == 0x002D) { # -
1735     !!!cp (133);
1736     $self->{state} = MD_HYPHEN_STATE;
1737     !!!next-input-character;
1738     redo A;
1739     } elsif ($self->{nc} == 0x0044 or # D
1740     $self->{nc} == 0x0064) { # d
1741     ## ASCII case-insensitive.
1742     !!!cp (130);
1743     $self->{state} = MD_DOCTYPE_STATE;
1744 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1745 wakaba 1.1 !!!next-input-character;
1746     redo A;
1747 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1748     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1749     $self->{is_xml}) and
1750 wakaba 1.1 $self->{nc} == 0x005B) { # [
1751     !!!cp (135.4);
1752     $self->{state} = MD_CDATA_STATE;
1753 wakaba 1.12 $self->{kwd} = '[';
1754 wakaba 1.1 !!!next-input-character;
1755     redo A;
1756     } else {
1757     !!!cp (136);
1758     }
1759    
1760     !!!parse-error (type => 'bogus comment',
1761     line => $self->{line_prev},
1762     column => $self->{column_prev} - 1);
1763     ## Reconsume.
1764     $self->{state} = BOGUS_COMMENT_STATE;
1765     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1766     line => $self->{line_prev},
1767     column => $self->{column_prev} - 1,
1768     };
1769     redo A;
1770     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1771     if ($self->{nc} == 0x002D) { # -
1772     !!!cp (127);
1773     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1774     line => $self->{line_prev},
1775     column => $self->{column_prev} - 2,
1776     };
1777 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1778 wakaba 1.1 !!!next-input-character;
1779     redo A;
1780     } else {
1781     !!!cp (128);
1782     !!!parse-error (type => 'bogus comment',
1783     line => $self->{line_prev},
1784     column => $self->{column_prev} - 2);
1785     $self->{state} = BOGUS_COMMENT_STATE;
1786     ## Reconsume.
1787     $self->{ct} = {type => COMMENT_TOKEN,
1788     data => '-',
1789     line => $self->{line_prev},
1790     column => $self->{column_prev} - 2,
1791     };
1792     redo A;
1793     }
1794     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1795     ## ASCII case-insensitive.
1796     if ($self->{nc} == [
1797     undef,
1798     0x004F, # O
1799     0x0043, # C
1800     0x0054, # T
1801     0x0059, # Y
1802     0x0050, # P
1803 wakaba 1.12 ]->[length $self->{kwd}] or
1804 wakaba 1.1 $self->{nc} == [
1805     undef,
1806     0x006F, # o
1807     0x0063, # c
1808     0x0074, # t
1809     0x0079, # y
1810     0x0070, # p
1811 wakaba 1.12 ]->[length $self->{kwd}]) {
1812 wakaba 1.1 !!!cp (131);
1813     ## Stay in the state.
1814 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1815 wakaba 1.1 !!!next-input-character;
1816     redo A;
1817 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1818 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1819     $self->{nc} == 0x0065)) { # e
1820 wakaba 1.12 if ($self->{is_xml} and
1821     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1822 wakaba 1.10 !!!cp (129);
1823     ## XML5: case-sensitive.
1824     !!!parse-error (type => 'lowercase keyword', ## TODO
1825     text => 'DOCTYPE',
1826     line => $self->{line_prev},
1827     column => $self->{column_prev} - 5);
1828     } else {
1829     !!!cp (129.1);
1830     }
1831 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1832     $self->{ct} = {type => DOCTYPE_TOKEN,
1833     quirks => 1,
1834     line => $self->{line_prev},
1835     column => $self->{column_prev} - 7,
1836     };
1837     !!!next-input-character;
1838     redo A;
1839     } else {
1840     !!!cp (132);
1841     !!!parse-error (type => 'bogus comment',
1842     line => $self->{line_prev},
1843 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1844 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1845     ## Reconsume.
1846     $self->{ct} = {type => COMMENT_TOKEN,
1847 wakaba 1.12 data => $self->{kwd},
1848 wakaba 1.1 line => $self->{line_prev},
1849 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1850 wakaba 1.1 };
1851     redo A;
1852     }
1853     } elsif ($self->{state} == MD_CDATA_STATE) {
1854     if ($self->{nc} == {
1855     '[' => 0x0043, # C
1856     '[C' => 0x0044, # D
1857     '[CD' => 0x0041, # A
1858     '[CDA' => 0x0054, # T
1859     '[CDAT' => 0x0041, # A
1860 wakaba 1.12 }->{$self->{kwd}}) {
1861 wakaba 1.1 !!!cp (135.1);
1862     ## Stay in the state.
1863 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1864 wakaba 1.1 !!!next-input-character;
1865     redo A;
1866 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1867 wakaba 1.1 $self->{nc} == 0x005B) { # [
1868 wakaba 1.6 if ($self->{is_xml} and
1869     not $self->{tainted} and
1870     @{$self->{open_elements} or []} == 0) {
1871 wakaba 1.8 !!!cp (135.2);
1872 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1873     line => $self->{line_prev},
1874     column => $self->{column_prev} - 7);
1875     $self->{tainted} = 1;
1876 wakaba 1.8 } else {
1877     !!!cp (135.21);
1878 wakaba 1.6 }
1879    
1880 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1881     data => '',
1882     line => $self->{line_prev},
1883     column => $self->{column_prev} - 7};
1884     $self->{state} = CDATA_SECTION_STATE;
1885     !!!next-input-character;
1886     redo A;
1887     } else {
1888     !!!cp (135.3);
1889     !!!parse-error (type => 'bogus comment',
1890     line => $self->{line_prev},
1891 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1892 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1893     ## Reconsume.
1894     $self->{ct} = {type => COMMENT_TOKEN,
1895 wakaba 1.12 data => $self->{kwd},
1896 wakaba 1.1 line => $self->{line_prev},
1897 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1898 wakaba 1.1 };
1899     redo A;
1900     }
1901     } elsif ($self->{state} == COMMENT_START_STATE) {
1902     if ($self->{nc} == 0x002D) { # -
1903     !!!cp (137);
1904     $self->{state} = COMMENT_START_DASH_STATE;
1905     !!!next-input-character;
1906     redo A;
1907     } elsif ($self->{nc} == 0x003E) { # >
1908     !!!parse-error (type => 'bogus comment');
1909 wakaba 1.13 if ($self->{in_subset}) {
1910     !!!cp (138.1);
1911     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1912     } else {
1913     !!!cp (138);
1914     $self->{state} = DATA_STATE;
1915     $self->{s_kwd} = '';
1916     }
1917 wakaba 1.1 !!!next-input-character;
1918    
1919     !!!emit ($self->{ct}); # comment
1920    
1921     redo A;
1922     } elsif ($self->{nc} == -1) {
1923     !!!parse-error (type => 'unclosed comment');
1924 wakaba 1.13 if ($self->{in_subset}) {
1925     !!!cp (139.1);
1926     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1927     } else {
1928     !!!cp (139);
1929     $self->{state} = DATA_STATE;
1930     $self->{s_kwd} = '';
1931     }
1932 wakaba 1.1 ## reconsume
1933    
1934     !!!emit ($self->{ct}); # comment
1935    
1936     redo A;
1937     } else {
1938     !!!cp (140);
1939     $self->{ct}->{data} # comment
1940     .= chr ($self->{nc});
1941     $self->{state} = COMMENT_STATE;
1942     !!!next-input-character;
1943     redo A;
1944     }
1945     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1946     if ($self->{nc} == 0x002D) { # -
1947     !!!cp (141);
1948     $self->{state} = COMMENT_END_STATE;
1949     !!!next-input-character;
1950     redo A;
1951     } elsif ($self->{nc} == 0x003E) { # >
1952     !!!parse-error (type => 'bogus comment');
1953 wakaba 1.13 if ($self->{in_subset}) {
1954     !!!cp (142.1);
1955     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1956     } else {
1957     !!!cp (142);
1958     $self->{state} = DATA_STATE;
1959     $self->{s_kwd} = '';
1960     }
1961 wakaba 1.1 !!!next-input-character;
1962    
1963     !!!emit ($self->{ct}); # comment
1964    
1965     redo A;
1966     } elsif ($self->{nc} == -1) {
1967     !!!parse-error (type => 'unclosed comment');
1968 wakaba 1.13 if ($self->{in_subset}) {
1969     !!!cp (143.1);
1970     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1971     } else {
1972     !!!cp (143);
1973     $self->{state} = DATA_STATE;
1974     $self->{s_kwd} = '';
1975     }
1976 wakaba 1.1 ## reconsume
1977    
1978     !!!emit ($self->{ct}); # comment
1979    
1980     redo A;
1981     } else {
1982     !!!cp (144);
1983     $self->{ct}->{data} # comment
1984     .= '-' . chr ($self->{nc});
1985     $self->{state} = COMMENT_STATE;
1986     !!!next-input-character;
1987     redo A;
1988     }
1989     } elsif ($self->{state} == COMMENT_STATE) {
1990 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
1991    
1992 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1993     !!!cp (145);
1994     $self->{state} = COMMENT_END_DASH_STATE;
1995     !!!next-input-character;
1996     redo A;
1997     } elsif ($self->{nc} == -1) {
1998     !!!parse-error (type => 'unclosed comment');
1999 wakaba 1.13 if ($self->{in_subset}) {
2000     !!!cp (146.1);
2001     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2002     } else {
2003     !!!cp (146);
2004     $self->{state} = DATA_STATE;
2005     $self->{s_kwd} = '';
2006     }
2007 wakaba 1.1 ## reconsume
2008    
2009     !!!emit ($self->{ct}); # comment
2010    
2011     redo A;
2012     } else {
2013     !!!cp (147);
2014     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2015     $self->{read_until}->($self->{ct}->{data},
2016     q[-],
2017     length $self->{ct}->{data});
2018    
2019     ## Stay in the state
2020     !!!next-input-character;
2021     redo A;
2022     }
2023     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2024 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2025 wakaba 1.10
2026 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2027     !!!cp (148);
2028     $self->{state} = COMMENT_END_STATE;
2029     !!!next-input-character;
2030     redo A;
2031     } elsif ($self->{nc} == -1) {
2032     !!!parse-error (type => 'unclosed comment');
2033 wakaba 1.13 if ($self->{in_subset}) {
2034     !!!cp (149.1);
2035     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2036     } else {
2037     !!!cp (149);
2038     $self->{state} = DATA_STATE;
2039     $self->{s_kwd} = '';
2040     }
2041 wakaba 1.1 ## reconsume
2042    
2043     !!!emit ($self->{ct}); # comment
2044    
2045     redo A;
2046     } else {
2047     !!!cp (150);
2048     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2049     $self->{state} = COMMENT_STATE;
2050     !!!next-input-character;
2051     redo A;
2052     }
2053     } elsif ($self->{state} == COMMENT_END_STATE) {
2054 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2055    
2056 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2057 wakaba 1.13 if ($self->{in_subset}) {
2058     !!!cp (151.1);
2059     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2060     } else {
2061     !!!cp (151);
2062     $self->{state} = DATA_STATE;
2063     $self->{s_kwd} = '';
2064     }
2065 wakaba 1.1 !!!next-input-character;
2066    
2067     !!!emit ($self->{ct}); # comment
2068    
2069     redo A;
2070     } elsif ($self->{nc} == 0x002D) { # -
2071     !!!cp (152);
2072 wakaba 1.10 ## XML5: Not a parse error.
2073 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2074     line => $self->{line_prev},
2075     column => $self->{column_prev});
2076     $self->{ct}->{data} .= '-'; # comment
2077     ## Stay in the state
2078     !!!next-input-character;
2079     redo A;
2080     } elsif ($self->{nc} == -1) {
2081     !!!parse-error (type => 'unclosed comment');
2082 wakaba 1.13 if ($self->{in_subset}) {
2083     !!!cp (153.1);
2084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085     } else {
2086     !!!cp (153);
2087     $self->{state} = DATA_STATE;
2088     $self->{s_kwd} = '';
2089     }
2090 wakaba 1.1 ## reconsume
2091    
2092     !!!emit ($self->{ct}); # comment
2093    
2094     redo A;
2095     } else {
2096     !!!cp (154);
2097 wakaba 1.10 ## XML5: Not a parse error.
2098 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2099     line => $self->{line_prev},
2100     column => $self->{column_prev});
2101     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2102     $self->{state} = COMMENT_STATE;
2103     !!!next-input-character;
2104     redo A;
2105     }
2106     } elsif ($self->{state} == DOCTYPE_STATE) {
2107     if ($is_space->{$self->{nc}}) {
2108     !!!cp (155);
2109     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2110     !!!next-input-character;
2111     redo A;
2112     } else {
2113     !!!cp (156);
2114 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2115 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2116     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2117     ## reconsume
2118     redo A;
2119     }
2120     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2121 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2122    
2123 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2124     !!!cp (157);
2125     ## Stay in the state
2126     !!!next-input-character;
2127     redo A;
2128     } elsif ($self->{nc} == 0x003E) { # >
2129     !!!cp (158);
2130 wakaba 1.12 ## XML5: No parse error.
2131 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2132     $self->{state} = DATA_STATE;
2133 wakaba 1.5 $self->{s_kwd} = '';
2134 wakaba 1.1 !!!next-input-character;
2135    
2136     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2137    
2138     redo A;
2139     } elsif ($self->{nc} == -1) {
2140     !!!cp (159);
2141     !!!parse-error (type => 'no DOCTYPE name');
2142     $self->{state} = DATA_STATE;
2143 wakaba 1.5 $self->{s_kwd} = '';
2144 wakaba 1.1 ## reconsume
2145    
2146     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2147    
2148     redo A;
2149 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2150     !!!cp (159.1);
2151     !!!parse-error (type => 'no DOCTYPE name');
2152     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2153 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2154     $self->{in_subset} = 1;
2155 wakaba 1.12 !!!next-input-character;
2156 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2157 wakaba 1.12 redo A;
2158 wakaba 1.1 } else {
2159     !!!cp (160);
2160     $self->{ct}->{name} = chr $self->{nc};
2161     delete $self->{ct}->{quirks};
2162     $self->{state} = DOCTYPE_NAME_STATE;
2163     !!!next-input-character;
2164     redo A;
2165     }
2166     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2167 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2168    
2169     ## ISSUE: Redundant "First," in the spec.
2170    
2171 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2172     !!!cp (161);
2173     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2174     !!!next-input-character;
2175     redo A;
2176     } elsif ($self->{nc} == 0x003E) { # >
2177     !!!cp (162);
2178     $self->{state} = DATA_STATE;
2179 wakaba 1.5 $self->{s_kwd} = '';
2180 wakaba 1.1 !!!next-input-character;
2181    
2182     !!!emit ($self->{ct}); # DOCTYPE
2183    
2184     redo A;
2185     } elsif ($self->{nc} == -1) {
2186     !!!cp (163);
2187     !!!parse-error (type => 'unclosed DOCTYPE');
2188     $self->{state} = DATA_STATE;
2189 wakaba 1.5 $self->{s_kwd} = '';
2190 wakaba 1.1 ## reconsume
2191    
2192     $self->{ct}->{quirks} = 1;
2193     !!!emit ($self->{ct}); # DOCTYPE
2194    
2195     redo A;
2196 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2197     !!!cp (163.1);
2198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2199 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2200     $self->{in_subset} = 1;
2201 wakaba 1.12 !!!next-input-character;
2202 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2203 wakaba 1.12 redo A;
2204 wakaba 1.1 } else {
2205     !!!cp (164);
2206     $self->{ct}->{name}
2207     .= chr ($self->{nc}); # DOCTYPE
2208     ## Stay in the state
2209     !!!next-input-character;
2210     redo A;
2211     }
2212     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2213 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2214     ## state", but implemented differently.
2215    
2216 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2217     !!!cp (165);
2218     ## Stay in the state
2219     !!!next-input-character;
2220     redo A;
2221     } elsif ($self->{nc} == 0x003E) { # >
2222 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2223     !!!cp (166);
2224     $self->{state} = DATA_STATE;
2225     $self->{s_kwd} = '';
2226     } else {
2227     !!!cp (166.1);
2228     !!!parse-error (type => 'no md def'); ## TODO: type
2229     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2230     }
2231    
2232 wakaba 1.1 !!!next-input-character;
2233 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2234 wakaba 1.1 redo A;
2235     } elsif ($self->{nc} == -1) {
2236 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2237     !!!cp (167);
2238     !!!parse-error (type => 'unclosed DOCTYPE');
2239     $self->{state} = DATA_STATE;
2240     $self->{s_kwd} = '';
2241     $self->{ct}->{quirks} = 1;
2242     } else {
2243     !!!cp (167.12);
2244     !!!parse-error (type => 'unclosed md'); ## TODO: type
2245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2246     }
2247    
2248     ## Reconsume.
2249     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2250 wakaba 1.1 redo A;
2251     } elsif ($self->{nc} == 0x0050 or # P
2252     $self->{nc} == 0x0070) { # p
2253 wakaba 1.12 !!!cp (167.1);
2254 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2255 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2256 wakaba 1.1 !!!next-input-character;
2257     redo A;
2258     } elsif ($self->{nc} == 0x0053 or # S
2259     $self->{nc} == 0x0073) { # s
2260 wakaba 1.12 !!!cp (167.2);
2261 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2262 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2263     !!!next-input-character;
2264     redo A;
2265 wakaba 1.16 ## TODO: " and ' for ENTITY
2266     } elsif ($self->{is_xml} and
2267     $self->{ct}->{type} == DOCTYPE_TOKEN and
2268     $self->{nc} == 0x005B) { # [
2269 wakaba 1.12 !!!cp (167.3);
2270     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2271     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2272 wakaba 1.13 $self->{in_subset} = 1;
2273 wakaba 1.1 !!!next-input-character;
2274 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2275 wakaba 1.1 redo A;
2276     } else {
2277 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2278    
2279     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2280     !!!cp (180);
2281     $self->{ct}->{quirks} = 1;
2282     $self->{state} = BOGUS_DOCTYPE_STATE;
2283     } else {
2284     !!!cp (180.1);
2285     $self->{state} = BOGUS_MD_STATE;
2286     }
2287 wakaba 1.1
2288     !!!next-input-character;
2289     redo A;
2290     }
2291     } elsif ($self->{state} == PUBLIC_STATE) {
2292     ## ASCII case-insensitive
2293     if ($self->{nc} == [
2294     undef,
2295     0x0055, # U
2296     0x0042, # B
2297     0x004C, # L
2298     0x0049, # I
2299 wakaba 1.12 ]->[length $self->{kwd}] or
2300 wakaba 1.1 $self->{nc} == [
2301     undef,
2302     0x0075, # u
2303     0x0062, # b
2304     0x006C, # l
2305     0x0069, # i
2306 wakaba 1.12 ]->[length $self->{kwd}]) {
2307 wakaba 1.1 !!!cp (175);
2308     ## Stay in the state.
2309 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2310 wakaba 1.1 !!!next-input-character;
2311     redo A;
2312 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2313 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2314     $self->{nc} == 0x0063)) { # c
2315 wakaba 1.12 if ($self->{is_xml} and
2316     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2317     !!!cp (168.1);
2318     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2319     text => 'PUBLIC',
2320     line => $self->{line_prev},
2321     column => $self->{column_prev} - 4);
2322     } else {
2323     !!!cp (168);
2324     }
2325 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2326     !!!next-input-character;
2327     redo A;
2328     } else {
2329 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2330 wakaba 1.1 line => $self->{line_prev},
2331 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2332 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2333     !!!cp (169);
2334     $self->{ct}->{quirks} = 1;
2335     $self->{state} = BOGUS_DOCTYPE_STATE;
2336     } else {
2337     !!!cp (169.1);
2338     $self->{state} = BOGUS_MD_STATE;
2339     }
2340 wakaba 1.1 ## Reconsume.
2341     redo A;
2342     }
2343     } elsif ($self->{state} == SYSTEM_STATE) {
2344     ## ASCII case-insensitive
2345     if ($self->{nc} == [
2346     undef,
2347     0x0059, # Y
2348     0x0053, # S
2349     0x0054, # T
2350     0x0045, # E
2351 wakaba 1.12 ]->[length $self->{kwd}] or
2352 wakaba 1.1 $self->{nc} == [
2353     undef,
2354     0x0079, # y
2355     0x0073, # s
2356     0x0074, # t
2357     0x0065, # e
2358 wakaba 1.12 ]->[length $self->{kwd}]) {
2359 wakaba 1.1 !!!cp (170);
2360     ## Stay in the state.
2361 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2362 wakaba 1.1 !!!next-input-character;
2363     redo A;
2364 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2365 wakaba 1.1 ($self->{nc} == 0x004D or # M
2366     $self->{nc} == 0x006D)) { # m
2367 wakaba 1.12 if ($self->{is_xml} and
2368     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2369     !!!cp (171.1);
2370     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2371     text => 'SYSTEM',
2372     line => $self->{line_prev},
2373     column => $self->{column_prev} - 4);
2374     } else {
2375     !!!cp (171);
2376     }
2377 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2378     !!!next-input-character;
2379     redo A;
2380     } else {
2381 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2382 wakaba 1.1 line => $self->{line_prev},
2383 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2384 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2385     !!!cp (172);
2386     $self->{ct}->{quirks} = 1;
2387     $self->{state} = BOGUS_DOCTYPE_STATE;
2388     } else {
2389     !!!cp (172.1);
2390     $self->{state} = BOGUS_MD_STATE;
2391     }
2392 wakaba 1.1 ## Reconsume.
2393     redo A;
2394     }
2395     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2396     if ($is_space->{$self->{nc}}) {
2397     !!!cp (181);
2398     ## Stay in the state
2399     !!!next-input-character;
2400     redo A;
2401     } elsif ($self->{nc} eq 0x0022) { # "
2402     !!!cp (182);
2403     $self->{ct}->{pubid} = ''; # DOCTYPE
2404     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2405     !!!next-input-character;
2406     redo A;
2407     } elsif ($self->{nc} eq 0x0027) { # '
2408     !!!cp (183);
2409     $self->{ct}->{pubid} = ''; # DOCTYPE
2410     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2411     !!!next-input-character;
2412     redo A;
2413     } elsif ($self->{nc} eq 0x003E) { # >
2414     !!!parse-error (type => 'no PUBLIC literal');
2415 wakaba 1.16
2416     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2417     !!!cp (184);
2418     $self->{state} = DATA_STATE;
2419     $self->{s_kwd} = '';
2420     $self->{ct}->{quirks} = 1;
2421     } else {
2422     !!!cp (184.1);
2423     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2424     }
2425    
2426 wakaba 1.1 !!!next-input-character;
2427 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2428 wakaba 1.1 redo A;
2429     } elsif ($self->{nc} == -1) {
2430 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2431     !!!cp (185);
2432     !!!parse-error (type => 'unclosed DOCTYPE');
2433     $self->{state} = DATA_STATE;
2434     $self->{s_kwd} = '';
2435     $self->{ct}->{quirks} = 1;
2436     } else {
2437     !!!cp (185.1);
2438     !!!parse-error (type => 'unclosed md'); ## TODO: type
2439     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2440     }
2441    
2442 wakaba 1.1 ## reconsume
2443     !!!emit ($self->{ct}); # DOCTYPE
2444     redo A;
2445 wakaba 1.16 } elsif ($self->{is_xml} and
2446     $self->{ct}->{type} == DOCTYPE_TOKEN and
2447     $self->{nc} == 0x005B) { # [
2448 wakaba 1.12 !!!cp (186.1);
2449     !!!parse-error (type => 'no PUBLIC literal');
2450     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2451     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2452 wakaba 1.13 $self->{in_subset} = 1;
2453 wakaba 1.12 !!!next-input-character;
2454 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2455 wakaba 1.12 redo A;
2456 wakaba 1.1 } else {
2457     !!!parse-error (type => 'string after PUBLIC');
2458    
2459 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2460     !!!cp (186);
2461     $self->{ct}->{quirks} = 1;
2462     $self->{state} = BOGUS_DOCTYPE_STATE;
2463     } else {
2464     !!!cp (186.2);
2465     $self->{state} = BOGUS_MD_STATE;
2466     }
2467    
2468 wakaba 1.1 !!!next-input-character;
2469     redo A;
2470     }
2471     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2472     if ($self->{nc} == 0x0022) { # "
2473     !!!cp (187);
2474     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2475     !!!next-input-character;
2476     redo A;
2477     } elsif ($self->{nc} == 0x003E) { # >
2478     !!!parse-error (type => 'unclosed PUBLIC literal');
2479    
2480 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2481     !!!cp (188);
2482     $self->{state} = DATA_STATE;
2483     $self->{s_kwd} = '';
2484     $self->{ct}->{quirks} = 1;
2485     } else {
2486     !!!cp (188.1);
2487     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2488     }
2489    
2490 wakaba 1.1 !!!next-input-character;
2491 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2492 wakaba 1.1 redo A;
2493     } elsif ($self->{nc} == -1) {
2494     !!!parse-error (type => 'unclosed PUBLIC literal');
2495    
2496 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2497     !!!cp (189);
2498     $self->{state} = DATA_STATE;
2499     $self->{s_kwd} = '';
2500     $self->{ct}->{quirks} = 1;
2501     } else {
2502     !!!cp (189.1);
2503     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2504     }
2505    
2506     ## Reconsume.
2507 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2508     redo A;
2509     } else {
2510     !!!cp (190);
2511 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2512 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2513     length $self->{ct}->{pubid});
2514    
2515     ## Stay in the state
2516     !!!next-input-character;
2517     redo A;
2518     }
2519     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2520     if ($self->{nc} == 0x0027) { # '
2521     !!!cp (191);
2522     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2523     !!!next-input-character;
2524     redo A;
2525     } elsif ($self->{nc} == 0x003E) { # >
2526     !!!parse-error (type => 'unclosed PUBLIC literal');
2527    
2528 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2529     !!!cp (192);
2530     $self->{state} = DATA_STATE;
2531     $self->{s_kwd} = '';
2532     $self->{ct}->{quirks} = 1;
2533     } else {
2534     !!!cp (192.1);
2535     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2536     }
2537    
2538 wakaba 1.1 !!!next-input-character;
2539 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2540 wakaba 1.1 redo A;
2541     } elsif ($self->{nc} == -1) {
2542     !!!parse-error (type => 'unclosed PUBLIC literal');
2543    
2544 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2545     !!!cp (193);
2546     $self->{state} = DATA_STATE;
2547     $self->{s_kwd} = '';
2548     $self->{ct}->{quirks} = 1;
2549     } else {
2550     !!!cp (193.1);
2551     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552     }
2553    
2554 wakaba 1.1 ## reconsume
2555 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2556 wakaba 1.1 redo A;
2557     } else {
2558     !!!cp (194);
2559 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2560 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2561     length $self->{ct}->{pubid});
2562    
2563     ## Stay in the state
2564     !!!next-input-character;
2565     redo A;
2566     }
2567     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2568     if ($is_space->{$self->{nc}}) {
2569     !!!cp (195);
2570     ## Stay in the state
2571     !!!next-input-character;
2572     redo A;
2573     } elsif ($self->{nc} == 0x0022) { # "
2574     !!!cp (196);
2575 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2576 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2577     !!!next-input-character;
2578     redo A;
2579     } elsif ($self->{nc} == 0x0027) { # '
2580     !!!cp (197);
2581 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2582 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2583     !!!next-input-character;
2584     redo A;
2585     } elsif ($self->{nc} == 0x003E) { # >
2586 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2587     if ($self->{is_xml}) {
2588     !!!cp (198.1);
2589     !!!parse-error (type => 'no SYSTEM literal');
2590     } else {
2591     !!!cp (198);
2592     }
2593     $self->{state} = DATA_STATE;
2594     $self->{s_kwd} = '';
2595 wakaba 1.12 } else {
2596 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2597     !!!cp (198.2);
2598     } else {
2599     !!!cp (198.3);
2600     !!!parse-error (type => 'no SYSTEM literal');
2601     }
2602     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2603 wakaba 1.12 }
2604 wakaba 1.16
2605 wakaba 1.1 !!!next-input-character;
2606 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2607 wakaba 1.1 redo A;
2608     } elsif ($self->{nc} == -1) {
2609 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2610     !!!cp (199);
2611     !!!parse-error (type => 'unclosed DOCTYPE');
2612    
2613     $self->{state} = DATA_STATE;
2614     $self->{s_kwd} = '';
2615     $self->{ct}->{quirks} = 1;
2616     } else {
2617     !!!parse-error (type => 'unclosed md'); ## TODO: type
2618     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2619     }
2620    
2621 wakaba 1.1 ## reconsume
2622 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2623 wakaba 1.1 redo A;
2624 wakaba 1.16 } elsif ($self->{is_xml} and
2625     $self->{ct}->{type} == DOCTYPE_TOKEN and
2626     $self->{nc} == 0x005B) { # [
2627 wakaba 1.12 !!!cp (200.1);
2628     !!!parse-error (type => 'no SYSTEM literal');
2629     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2630     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2631 wakaba 1.13 $self->{in_subset} = 1;
2632 wakaba 1.12 !!!next-input-character;
2633 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2634 wakaba 1.12 redo A;
2635 wakaba 1.1 } else {
2636     !!!parse-error (type => 'string after PUBLIC literal');
2637    
2638 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2639     !!!cp (200);
2640     $self->{ct}->{quirks} = 1;
2641     $self->{state} = BOGUS_DOCTYPE_STATE;
2642     } else {
2643     !!!cp (200.2);
2644     $self->{state} = BOGUS_MD_STATE;
2645     }
2646    
2647 wakaba 1.1 !!!next-input-character;
2648     redo A;
2649     }
2650     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
         ## Before the system identifier of the current token
         ## (DOCTYPE, ENTITY, or NOTATION).  White space is skipped.  A
         ## quotation mark resets |sysid| to the empty string and switches
         ## to the matching quoted state.  ">" and EOF are parse errors
         ## that emit the current token as is.  In XML, "[" after a
         ## DOCTYPE token opens the internal subset.  Anything else is a
         ## parse error that switches to one of the bogus states.
2651     if ($is_space->{$self->{nc}}) {
2652     !!!cp (201);
2653     ## Stay in the state
2654     !!!next-input-character;
2655     redo A;
2656     } elsif ($self->{nc} == 0x0022) { # "
2657     !!!cp (202);
2658     $self->{ct}->{sysid} = ''; # DOCTYPE
2659     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2660     !!!next-input-character;
2661     redo A;
2662     } elsif ($self->{nc} == 0x0027) { # '
2663     !!!cp (203);
2664     $self->{ct}->{sysid} = ''; # DOCTYPE
2665     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2666     !!!next-input-character;
2667     redo A;
2668     } elsif ($self->{nc} == 0x003E) { # >
2669     !!!parse-error (type => 'no SYSTEM literal');
2670     !!!next-input-character;
2671    
         ## A DOCTYPE token returns to the data state (in quirks mode);
         ## any other token type returns to the internal subset.
2672 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2673     !!!cp (204);
2674     $self->{state} = DATA_STATE;
2675     $self->{s_kwd} = '';
2676     $self->{ct}->{quirks} = 1;
2677     } else {
2678     !!!cp (204.1);
2679     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2680     }
2681 wakaba 1.1 
2682 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2683 wakaba 1.1 redo A;
2684     } elsif ($self->{nc} == -1) {
2685 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2686     !!!cp (205);
2687     !!!parse-error (type => 'unclosed DOCTYPE');
2688     $self->{state} = DATA_STATE;
2689     $self->{s_kwd} = '';
2690     $self->{ct}->{quirks} = 1;
2691     } else {
2692     !!!cp (205.1);
2693     !!!parse-error (type => 'unclosed md'); ## TODO: type
2694     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2695     }
2696    
2697 wakaba 1.1 ## reconsume
2698 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2699 wakaba 1.1 redo A;
2700 wakaba 1.16 } elsif ($self->{is_xml} and
2701     $self->{ct}->{type} == DOCTYPE_TOKEN and
2702     $self->{nc} == 0x005B) { # [
2703 wakaba 1.12 !!!cp (206.1);
2704     !!!parse-error (type => 'no SYSTEM literal');
2705    
         ## The DOCTYPE token is emitted now, flagged as having an
         ## internal subset; |in_subset| stays set while inside it.
2706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2708 wakaba 1.13 $self->{in_subset} = 1;
2709 wakaba 1.12 !!!next-input-character;
2710 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2711 wakaba 1.12 redo A;
2712 wakaba 1.1 } else {
2713     !!!parse-error (type => 'string after SYSTEM');
2714    
2715 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2716     !!!cp (206);
2717     $self->{ct}->{quirks} = 1;
2718     $self->{state} = BOGUS_DOCTYPE_STATE;
2719     } else {
2720     !!!cp (206.2);
2721     $self->{state} = BOGUS_MD_STATE;
2722     }
2723    
2724 wakaba 1.1 !!!next-input-character;
2725     redo A;
2726     }
2727     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
         ## Inside a system identifier delimited by U+0022.  Characters
         ## are appended to |sysid| until the closing quotation mark.
         ## In non-XML mode ">" ends the literal early with a parse
         ## error; EOF is always a parse error.  Both of those emit the
         ## current token.
2728     if ($self->{nc} == 0x0022) { # "
2729     !!!cp (207);
2730     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2731     !!!next-input-character;
2732     redo A;
2733 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2734 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2735    
2736 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2737     !!!cp (208);
2738     $self->{state} = DATA_STATE;
2739     $self->{s_kwd} = '';
2740     $self->{ct}->{quirks} = 1;
2741     } else {
2742     !!!cp (208.1);
2743     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2744     }
2745    
2746 wakaba 1.1 !!!next-input-character;
2747 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2748 wakaba 1.1 redo A;
2749     } elsif ($self->{nc} == -1) {
2750     !!!parse-error (type => 'unclosed SYSTEM literal');
2751    
2752 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2753     !!!cp (209);
2754     $self->{state} = DATA_STATE;
2755     $self->{s_kwd} = '';
2756     $self->{ct}->{quirks} = 1;
2757     } else {
2758     !!!cp (209.1);
2759     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2760     }
2761    
2762 wakaba 1.1 ## reconsume
2763 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2764 wakaba 1.1 redo A;
2765     } else {
2766     !!!cp (210);
2767 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
         ## NOTE(review): |read_until| is defined elsewhere; presumably
         ## it appends a run of following characters to |sysid| up to
         ## (not including) one of the listed delimiters -- confirm.
2768 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2769     length $self->{ct}->{sysid});
2770    
2771     ## Stay in the state
2772     !!!next-input-character;
2773     redo A;
2774     }
2775     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
         ## Inside a system identifier delimited by U+0027.  Characters
         ## are appended to |sysid| until the closing apostrophe.  In
         ## non-XML mode ">" ends the literal early with a parse error;
         ## EOF is always a parse error.
2776     if ($self->{nc} == 0x0027) { # '
2777     !!!cp (211);
2778     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2779     !!!next-input-character;
2780     redo A;
2781 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
         ## NOTE(review): Unlike the same branch of the double-quoted
         ## state, the token type is not tested here and the data state
         ## is always entered.  Presumably harmless because ENTITY and
         ## NOTATION tokens only occur in XML mode -- confirm.
2782 wakaba 1.1 !!!cp (212);
2783     !!!parse-error (type => 'unclosed SYSTEM literal');
2784    
2785     $self->{state} = DATA_STATE;
2786 wakaba 1.5 $self->{s_kwd} = '';
2787 wakaba 1.1 !!!next-input-character;
2788    
2789     $self->{ct}->{quirks} = 1;
2790     !!!emit ($self->{ct}); # DOCTYPE
2791    
2792     redo A;
2793     } elsif ($self->{nc} == -1) {
2794     !!!parse-error (type => 'unclosed SYSTEM literal');
2795    
2796 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2797     !!!cp (213);
2798     $self->{state} = DATA_STATE;
2799     $self->{s_kwd} = '';
2800     $self->{ct}->{quirks} = 1;
2801     } else {
2802     !!!cp (213.1);
2803     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2804     }
2805    
2806 wakaba 1.1 ## reconsume
2807 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2808 wakaba 1.1 redo A;
2809     } else {
2810     !!!cp (214);
2811 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2812 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2813     length $self->{ct}->{sysid});
2814    
2815     ## Stay in the state
2816     !!!next-input-character;
2817     redo A;
2818     }
2819     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
         ## After the closing quote of a system identifier.  ">" emits
         ## the current token.  For a general entity token, white space
         ## leads to the "before NDATA" state, and "N"/"n" without
         ## preceding white space starts the NDATA keyword with a parse
         ## error.  In XML, "[" after a DOCTYPE token opens the internal
         ## subset.  Anything else is a parse error.
2820     if ($is_space->{$self->{nc}}) {
2821 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2822     !!!cp (215.1);
2823     $self->{state} = BEFORE_NDATA_STATE;
2824     } else {
2825     !!!cp (215);
2826     ## Stay in the state
2827     }
2828 wakaba 1.1 !!!next-input-character;
2829     redo A;
2830     } elsif ($self->{nc} == 0x003E) { # >
2831 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2832     !!!cp (216);
2833     $self->{state} = DATA_STATE;
2834     $self->{s_kwd} = '';
2835     } else {
2836     !!!cp (216.1);
2837     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2838     }
2839    
2840 wakaba 1.1 !!!next-input-character;
2841 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2842 wakaba 1.1 redo A;
2843 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2844     ($self->{nc} == 0x004E or # N
2845     $self->{nc} == 0x006E)) { # n
2846     !!!cp (216.2);
2847     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2848     $self->{state} = NDATA_STATE;
         ## |kwd| collects the keyword characters seen so far.
2849     $self->{kwd} = chr $self->{nc};
2850     !!!next-input-character;
2851     redo A;
2852 wakaba 1.1 } elsif ($self->{nc} == -1) {
2853 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2854     !!!cp (217);
2855     !!!parse-error (type => 'unclosed DOCTYPE');
2856     $self->{state} = DATA_STATE;
2857     $self->{s_kwd} = '';
2858     $self->{ct}->{quirks} = 1;
2859     } else {
2860     !!!cp (217.1);
2861     !!!parse-error (type => 'unclosed md'); ## TODO: type
2862     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2863     }
2864    
2865 wakaba 1.1 ## reconsume
2866 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867 wakaba 1.1 redo A;
2868 wakaba 1.16 } elsif ($self->{is_xml} and
2869     $self->{ct}->{type} == DOCTYPE_TOKEN and
2870     $self->{nc} == 0x005B) { # [
2871 wakaba 1.12 !!!cp (218.1);
2872     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2873     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2874 wakaba 1.13 $self->{in_subset} = 1;
2875 wakaba 1.12 !!!next-input-character;
2876 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2877 wakaba 1.12 redo A;
2878 wakaba 1.1 } else {
2879     !!!parse-error (type => 'string after SYSTEM literal');
2880    
2881 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2882     !!!cp (218);
         ## The quirks flag is deliberately left unset here (the
         ## assignment below is commented out).
2883     #$self->{ct}->{quirks} = 1;
2884     $self->{state} = BOGUS_DOCTYPE_STATE;
2885     } else {
2886     !!!cp (218.2);
2887     $self->{state} = BOGUS_MD_STATE;
2888     }
2889    
2890 wakaba 1.1 !!!next-input-character;
2891     redo A;
2892     }
2893 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
         ## Entered from the "after system identifier" state for a
         ## general entity token once white space has been seen.  White
         ## space is skipped; ">" emits the entity token; "N" or "n"
         ## starts collecting a keyword in |kwd| (NDATA state); EOF and
         ## any other character are parse errors.
2894     if ($is_space->{$self->{nc}}) {
2895     !!!cp (218.3);
2896     ## Stay in the state.
2897     !!!next-input-character;
2898     redo A;
2899     } elsif ($self->{nc} == 0x003E) { # >
2900     !!!cp (218.4);
2901     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902     !!!next-input-character;
2903     !!!emit ($self->{ct}); # ENTITY
2904     redo A;
2905     } elsif ($self->{nc} == 0x004E or # N
2906     $self->{nc} == 0x006E) { # n
2907     !!!cp (218.5);
2908     $self->{state} = NDATA_STATE;
2909     $self->{kwd} = chr $self->{nc};
2910     !!!next-input-character;
2911     redo A;
2912     } elsif ($self->{nc} == -1) {
2913     !!!cp (218.6);
2914     !!!parse-error (type => 'unclosed md'); ## TODO: type
2915     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2916     ## reconsume
2917     !!!emit ($self->{ct}); # ENTITY
2918     redo A;
2919     } else {
2920     !!!cp (218.7);
2921     !!!parse-error (type => 'string after SYSTEM literal');
2922     $self->{state} = BOGUS_MD_STATE;
2923     !!!next-input-character;
2924     redo A;
2925     }
2926 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
         ## Error recovery inside a DOCTYPE: input is discarded until
         ## ">" (emit the DOCTYPE token and return to the data state),
         ## "[" in XML (emit the token and enter the internal subset),
         ## or EOF (emit the token; EOF is reconsumed in the data state).
2927     if ($self->{nc} == 0x003E) { # >
2928     !!!cp (219);
2929     $self->{state} = DATA_STATE;
2930 wakaba 1.5 $self->{s_kwd} = '';
2931 wakaba 1.1 !!!next-input-character;
2932    
2933     !!!emit ($self->{ct}); # DOCTYPE
2934    
2935     redo A;
2936 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2937 wakaba 1.13 !!!cp (220.1);
2938     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2940     $self->{in_subset} = 1;
2941     !!!next-input-character;
2942     !!!emit ($self->{ct}); # DOCTYPE
2943     redo A;
2944 wakaba 1.1 } elsif ($self->{nc} == -1) {
2945     !!!cp (220);
2946     $self->{state} = DATA_STATE;
2947 wakaba 1.5 $self->{s_kwd} = '';
2948 wakaba 1.1 ## reconsume
2949    
2950     !!!emit ($self->{ct}); # DOCTYPE
2951    
2952     redo A;
2953     } else {
2954     !!!cp (221);
         ## The skipped characters are read into a scratch variable
         ## that is never used afterwards.
2955     my $s = '';
2956 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2957 wakaba 1.1 
2958     ## Stay in the state
2959     !!!next-input-character;
2960     redo A;
2961     }
2962     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2963     ## NOTE: "CDATA section state" in the spec is jointly implemented
2964     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2965     ## and |CDATA_SECTION_MSE2_STATE|.
2966 wakaba 1.10 
2967     ## XML5: "CDATA state".
2968 wakaba 1.1 
         ## Characters are accumulated in the |data| of the current
         ## (character) token until the first "]" of a possible "]]>".
2969     if ($self->{nc} == 0x005D) { # ]
2970     !!!cp (221.1);
2971     $self->{state} = CDATA_SECTION_MSE1_STATE;
2972     !!!next-input-character;
2973     redo A;
2974     } elsif ($self->{nc} == -1) {
         ## An unterminated CDATA section is a parse error in XML only.
2975 wakaba 1.6 if ($self->{is_xml}) {
2976 wakaba 1.8 !!!cp (221.11);
2977 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2978 wakaba 1.8 } else {
2979     !!!cp (221.12);
2980 wakaba 1.6 }
2981    
2982 wakaba 1.1 $self->{state} = DATA_STATE;
2983 wakaba 1.5 $self->{s_kwd} = '';
2984 wakaba 1.10 ## Reconsume.
2985 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2986     !!!cp (221.2);
2987     !!!emit ($self->{ct}); # character
2988     } else {
2989     !!!cp (221.3);
2990     ## No token to emit. $self->{ct} is discarded.
2991     }
2992     redo A;
2993     } else {
2994     !!!cp (221.4);
2995     $self->{ct}->{data} .= chr $self->{nc};
2996     $self->{read_until}->($self->{ct}->{data},
2997     q<]>,
2998     length $self->{ct}->{data});
2999    
3000     ## Stay in the state.
3001     !!!next-input-character;
3002     redo A;
3003     }
3004    
3005     ## ISSUE: "text tokens" in spec.
3006     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3007 wakaba 1.10 ## XML5: "CDATA bracket state".
3008    
         ## One "]" has been consumed but not yet added to |data|.  A
         ## second "]" moves on to the MSE2 state; otherwise the pending
         ## "]" is appended as literal data and the current character
         ## (including EOF) is reconsumed in the CDATA section state.
3009 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3010     !!!cp (221.5);
3011     $self->{state} = CDATA_SECTION_MSE2_STATE;
3012     !!!next-input-character;
3013     redo A;
3014     } else {
3015     !!!cp (221.6);
3016 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3017 wakaba 1.1 $self->{ct}->{data} .= ']';
3018 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3019 wakaba 1.1 ## Reconsume.
3020     redo A;
3021     }
3022     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3023 wakaba 1.10 ## XML5: "CDATA end state".
3024    
         ## "]]" has been consumed but not yet added to |data|.  ">"
         ## closes the section and emits the character token unless its
         ## |data| is empty.  Another "]" appends one "]" to |data| and
         ## keeps two pending.  Anything else appends both pending
         ## brackets and reconsumes in the CDATA section state.
3025 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3026     $self->{state} = DATA_STATE;
3027 wakaba 1.5 $self->{s_kwd} = '';
3028 wakaba 1.1 !!!next-input-character;
3029     if (length $self->{ct}->{data}) { # character
3030     !!!cp (221.7);
3031     !!!emit ($self->{ct}); # character
3032     } else {
3033     !!!cp (221.8);
3034     ## No token to emit. $self->{ct} is discarded.
3035     }
3036     redo A;
3037     } elsif ($self->{nc} == 0x005D) { # ]
3038     !!!cp (221.9); # character
3039     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3040     ## Stay in the state.
3041     !!!next-input-character;
3042     redo A;
3043     } else {
         ## NOTE(review): Checkpoint id 221.11 is also used in the EOF
         ## branch of the CDATA section state; if checkpoint ids are
         ## meant to be unique, one of the two needs a new id -- confirm.
3044     !!!cp (221.11);
3045     $self->{ct}->{data} .= ']]'; # character
3046     $self->{state} = CDATA_SECTION_STATE;
3047 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3048 wakaba 1.1 redo A;
3049     }
3050     } elsif ($self->{state} == ENTITY_STATE) {
         ## Entered after an "&".  White space, "<", "&", EOF, and the
         ## character whose code is in |entity_add| mean "not a
         ## reference" without an error.  "#" starts a numeric reference
         ## and an ASCII letter starts a named reference.  Anything else
         ## is a 'bare ero' parse error.  Branches that do not |redo|
         ## fall through to the code after the chain, which outputs the
         ## literal "&".
3051     if ($is_space->{$self->{nc}} or
3052     {
3053     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3054     $self->{entity_add} => 1,
3055     }->{$self->{nc}}) {
3056     !!!cp (1001);
3057     ## Don't consume
3058     ## No error
3059     ## Return nothing.
3060     #
3061     } elsif ($self->{nc} == 0x0023) { # #
3062     !!!cp (999);
3063     $self->{state} = ENTITY_HASH_STATE;
3064 wakaba 1.12 $self->{kwd} = '#';
3065 wakaba 1.1 !!!next-input-character;
3066     redo A;
3067     } elsif ((0x0041 <= $self->{nc} and
3068     $self->{nc} <= 0x005A) or # A..Z
3069     (0x0061 <= $self->{nc} and
3070     $self->{nc} <= 0x007A)) { # a..z
3071     !!!cp (998);
         ## The named entity table is loaded on first use.
3072     require Whatpm::_NamedEntityList;
3073     $self->{state} = ENTITY_NAME_STATE;
3074 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3075     $self->{entity__value} = $self->{kwd};
3076 wakaba 1.1 $self->{entity__match} = 0;
3077     !!!next-input-character;
3078     redo A;
3079     } else {
3080     !!!cp (1027);
3081     !!!parse-error (type => 'bare ero');
3082     ## Return nothing.
3083     #
3084     }
3085    
3086     ## NOTE: No character is consumed by the "consume a character
3087     ## reference" algorithm. In other words, there is an "&" character
3088     ## that does not introduce a character reference, which would be
3089     ## appended to the parent element or the attribute value in later
3090     ## process of the tokenizer.
3091    
         ## When the reference started in the data state the "&" becomes
         ## a character token; otherwise it is appended to the value of
         ## |ca|.  Either way the tokenizer returns to |prev_state|.
3092     if ($self->{prev_state} == DATA_STATE) {
3093     !!!cp (997);
3094     $self->{state} = $self->{prev_state};
3095 wakaba 1.5 $self->{s_kwd} = '';
3096 wakaba 1.1 ## Reconsume.
3097     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3098     line => $self->{line_prev},
3099     column => $self->{column_prev},
3100     });
3101     redo A;
3102     } else {
3103     !!!cp (996);
3104     $self->{ca}->{value} .= '&';
3105     $self->{state} = $self->{prev_state};
3106 wakaba 1.5 $self->{s_kwd} = '';
3107 wakaba 1.1 ## Reconsume.
3108     redo A;
3109     }
3110     } elsif ($self->{state} == ENTITY_HASH_STATE) {
         ## Entered after "&#" (|kwd| is "#").  "x" or "X" is appended
         ## to |kwd| and starts a hexadecimal reference.  A decimal digit
         ## starts a decimal reference; from then on |kwd| holds the
         ## numeric value instead of a string.  Anything else is a
         ## 'bare nero' parse error and "&#" is output literally.
3111     if ($self->{nc} == 0x0078 or # x
3112     $self->{nc} == 0x0058) { # X
3113     !!!cp (995);
3114     $self->{state} = HEXREF_X_STATE;
3115 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3116 wakaba 1.1 !!!next-input-character;
3117     redo A;
3118     } elsif (0x0030 <= $self->{nc} and
3119     $self->{nc} <= 0x0039) { # 0..9
3120     !!!cp (994);
3121     $self->{state} = NCR_NUM_STATE;
3122 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3123 wakaba 1.1 !!!next-input-character;
3124     redo A;
3125     } else {
         ## The error position is moved back by one column so that it
         ## refers to the "&" rather than the "#".
3126     !!!parse-error (type => 'bare nero',
3127     line => $self->{line_prev},
3128     column => $self->{column_prev} - 1);
3129    
3130     ## NOTE: According to the spec algorithm, nothing is returned,
3131     ## and then "&#" is appended to the parent element or the attribute
3132     ## value in the later processing.
3133    
3134     if ($self->{prev_state} == DATA_STATE) {
3135     !!!cp (1019);
3136     $self->{state} = $self->{prev_state};
3137 wakaba 1.5 $self->{s_kwd} = '';
3138 wakaba 1.1 ## Reconsume.
3139     !!!emit ({type => CHARACTER_TOKEN,
3140     data => '&#',
3141     line => $self->{line_prev},
3142     column => $self->{column_prev} - 1,
3143     });
3144     redo A;
3145     } else {
3146     !!!cp (993);
3147     $self->{ca}->{value} .= '&#';
3148     $self->{state} = $self->{prev_state};
3149 wakaba 1.5 $self->{s_kwd} = '';
3150 wakaba 1.1 ## Reconsume.
3151     redo A;
3152     }
3153     }
3154     } elsif ($self->{state} == NCR_NUM_STATE) {
         ## Decimal numeric character reference.  |kwd| holds the value
         ## accumulated so far.  Digits extend it; ";" is consumed and
         ## ends the reference; any other character ends it with a
         ## 'no refc' parse error and is reconsumed afterwards.
3155     if (0x0030 <= $self->{nc} and
3156     $self->{nc} <= 0x0039) { # 0..9
3157     !!!cp (1012);
3158 wakaba 1.12 $self->{kwd} *= 10;
3159     $self->{kwd} += $self->{nc} - 0x0030;
3160 wakaba 1.1 
3161     ## Stay in the state.
3162     !!!next-input-character;
3163     redo A;
3164     } elsif ($self->{nc} == 0x003B) { # ;
3165     !!!cp (1013);
3166     !!!next-input-character;
3167     #
3168     } else {
3169     !!!cp (1014);
3170     !!!parse-error (type => 'no refc');
3171     ## Reconsume.
3172     #
3173     }
3174    
         ## Code points listed in |$charref_map| are reported and
         ## replaced by their mapped value; values above U+10FFFF are
         ## reported and replaced by U+FFFD.
3175 wakaba 1.12 my $code = $self->{kwd};
3176 wakaba 1.1 my $l = $self->{line_prev};
3177     my $c = $self->{column_prev};
3178     if ($charref_map->{$code}) {
3179     !!!cp (1015);
3180     !!!parse-error (type => 'invalid character reference',
3181     text => (sprintf 'U+%04X', $code),
3182     line => $l, column => $c);
3183     $code = $charref_map->{$code};
3184     } elsif ($code > 0x10FFFF) {
3185     !!!cp (1016);
3186     !!!parse-error (type => 'invalid character reference',
3187     text => (sprintf 'U-%08X', $code),
3188     line => $l, column => $c);
3189     $code = 0xFFFD;
3190     }
3191    
         ## The character becomes a character token (data state) or is
         ## appended to the value of |ca|; both are flagged as coming
         ## from a reference.
3192     if ($self->{prev_state} == DATA_STATE) {
3193     !!!cp (992);
3194     $self->{state} = $self->{prev_state};
3195 wakaba 1.5 $self->{s_kwd} = '';
3196 wakaba 1.1 ## Reconsume.
3197     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3198 wakaba 1.7 has_reference => 1,
3199 wakaba 1.1 line => $l, column => $c,
3200     });
3201     redo A;
3202     } else {
3203     !!!cp (991);
3204     $self->{ca}->{value} .= chr $code;
3205     $self->{ca}->{has_reference} = 1;
3206     $self->{state} = $self->{prev_state};
3207 wakaba 1.5 $self->{s_kwd} = '';
3208 wakaba 1.1 ## Reconsume.
3209     redo A;
3210     }
3211     } elsif ($self->{state} == HEXREF_X_STATE) {
         ## Entered after "&#x" or "&#X" (|kwd| is "#x" or "#X").  A
         ## hexadecimal digit resets |kwd| to the number 0 and is
         ## reconsumed in the HEXREF_HEX state.  Anything else is a
         ## 'bare hcro' parse error and "&" followed by |kwd| is output
         ## literally.
3212     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3213     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3214     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3215     # 0..9, A..F, a..f
3216     !!!cp (990);
3217     $self->{state} = HEXREF_HEX_STATE;
3218 wakaba 1.12 $self->{kwd} = 0;
3219 wakaba 1.1 ## Reconsume.
3220     redo A;
3221     } else {
         ## The error position is moved back by two columns, i.e. over
         ## the "#" and the "x"/"X", so that it refers to the "&".
3222     !!!parse-error (type => 'bare hcro',
3223     line => $self->{line_prev},
3224     column => $self->{column_prev} - 2);
3225    
3226     ## NOTE: According to the spec algorithm, nothing is returned,
3227     ## and then "&#" followed by "X" or "x" is appended to the parent
3228     ## element or the attribute value in the later processing.
3229    
3230     if ($self->{prev_state} == DATA_STATE) {
3231     !!!cp (1005);
3232     $self->{state} = $self->{prev_state};
3233 wakaba 1.5 $self->{s_kwd} = '';
3234 wakaba 1.1 ## Reconsume.
3235     !!!emit ({type => CHARACTER_TOKEN,
3236 wakaba 1.12 data => '&' . $self->{kwd},
3237 wakaba 1.1 line => $self->{line_prev},
3238 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3239 wakaba 1.1 });
3240     redo A;
3241     } else {
3242     !!!cp (989);
3243 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3244 wakaba 1.1 $self->{state} = $self->{prev_state};
3245 wakaba 1.5 $self->{s_kwd} = '';
3246 wakaba 1.1 ## Reconsume.
3247     redo A;
3248     }
3249     }
3250     } elsif ($self->{state} == HEXREF_HEX_STATE) {
         ## Hexadecimal numeric character reference.  |kwd| holds the
         ## value accumulated so far.  Hexadecimal digits extend it; ";"
         ## is consumed and ends the reference; any other character ends
         ## it with a 'no refc' parse error and is reconsumed afterwards.
3251     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3252     # 0..9
3253     !!!cp (1002);
3254 wakaba 1.12 $self->{kwd} *= 0x10;
3255     $self->{kwd} += $self->{nc} - 0x0030;
3256 wakaba 1.1 ## Stay in the state.
3257     !!!next-input-character;
3258     redo A;
3259     } elsif (0x0061 <= $self->{nc} and
3260     $self->{nc} <= 0x0066) { # a..f
3261     !!!cp (1003);
3262 wakaba 1.12 $self->{kwd} *= 0x10;
         ## "a" (0x61) yields 1 + 9 = 10, ..., "f" (0x66) yields 15.
3263     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3264 wakaba 1.1 ## Stay in the state.
3265     !!!next-input-character;
3266     redo A;
3267     } elsif (0x0041 <= $self->{nc} and
3268     $self->{nc} <= 0x0046) { # A..F
3269     !!!cp (1004);
3270 wakaba 1.12 $self->{kwd} *= 0x10;
         ## "A" (0x41) yields 1 + 9 = 10, ..., "F" (0x46) yields 15.
3271     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3272 wakaba 1.1 ## Stay in the state.
3273     !!!next-input-character;
3274     redo A;
3275     } elsif ($self->{nc} == 0x003B) { # ;
3276     !!!cp (1006);
3277     !!!next-input-character;
3278     #
3279     } else {
3280     !!!cp (1007);
3281     !!!parse-error (type => 'no refc',
3282     line => $self->{line},
3283     column => $self->{column});
3284     ## Reconsume.
3285     #
3286     }
3287    
         ## Code points listed in |$charref_map| are reported and
         ## replaced by their mapped value; values above U+10FFFF are
         ## reported and replaced by U+FFFD.
3288 wakaba 1.12 my $code = $self->{kwd};
3289 wakaba 1.1 my $l = $self->{line_prev};
3290     my $c = $self->{column_prev};
3291     if ($charref_map->{$code}) {
3292     !!!cp (1008);
3293     !!!parse-error (type => 'invalid character reference',
3294     text => (sprintf 'U+%04X', $code),
3295     line => $l, column => $c);
3296     $code = $charref_map->{$code};
3297     } elsif ($code > 0x10FFFF) {
3298     !!!cp (1009);
3299     !!!parse-error (type => 'invalid character reference',
3300     text => (sprintf 'U-%08X', $code),
3301     line => $l, column => $c);
3302     $code = 0xFFFD;
3303     }
3304    
3305     if ($self->{prev_state} == DATA_STATE) {
3306     !!!cp (988);
3307     $self->{state} = $self->{prev_state};
3308 wakaba 1.5 $self->{s_kwd} = '';
3309 wakaba 1.1 ## Reconsume.
3310     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3311 wakaba 1.7 has_reference => 1,
3312 wakaba 1.1 line => $l, column => $c,
3313     });
3314     redo A;
3315     } else {
3316     !!!cp (987);
3317     $self->{ca}->{value} .= chr $code;
3318     $self->{ca}->{has_reference} = 1;
3319     $self->{state} = $self->{prev_state};
3320 wakaba 1.5 $self->{s_kwd} = '';
3321 wakaba 1.1 ## Reconsume.
3322     redo A;
3323     }
3324     } elsif ($self->{state} == ENTITY_NAME_STATE) {
         ## Named character reference.  |kwd| is the name read so far
         ## (without "&").  |entity__value| is the replacement text of
         ## the latest matching name, followed by any characters read
         ## after that match.  |entity__match| is 0 while nothing has
         ## matched, 1 after a match that ends with ";", and negative
         ## after a match without ";".  It is doubled for every
         ## character read after the latest match, so a value below -1
         ## means that extra characters followed the match.
3325 wakaba 1.12 if (length $self->{kwd} < 30 and
3326 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
3327     ((0x0041 <= $self->{nc} and # A
3328     $self->{nc} <= 0x005A) or # Z
3329     (0x0061 <= $self->{nc} and # a
3330     $self->{nc} <= 0x007A) or # z
3331     (0x0030 <= $self->{nc} and # 0
3332     $self->{nc} <= 0x0039) or # 9
3333     $self->{nc} == 0x003B)) { # ;
3334     our $EntityChar;
3335 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3336     if (defined $EntityChar->{$self->{kwd}}) {
3337 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3338     !!!cp (1020);
3339 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3340 wakaba 1.1 $self->{entity__match} = 1;
3341     !!!next-input-character;
3342     #
3343     } else {
3344     !!!cp (1021);
3345 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3346 wakaba 1.1 $self->{entity__match} = -1;
3347     ## Stay in the state.
3348     !!!next-input-character;
3349     redo A;
3350     }
3351     } else {
3352     !!!cp (1022);
3353     $self->{entity__value} .= chr $self->{nc};
3354     $self->{entity__match} *= 2;
3355     ## Stay in the state.
3356     !!!next-input-character;
3357     redo A;
3358     }
3359     }
3360    
         ## Decide what to output: the replacement text (|$has_ref| set)
         ## or the literal "&" followed by the name.
3361     my $data;
3362     my $has_ref;
3363     if ($self->{entity__match} > 0) {
3364     !!!cp (1023);
3365     $data = $self->{entity__value};
3366     $has_ref = 1;
3367     #
3368     } elsif ($self->{entity__match} < 0) {
3369     !!!parse-error (type => 'no refc');
3370     if ($self->{prev_state} != DATA_STATE and # in attribute
3371     $self->{entity__match} < -1) {
3372     !!!cp (1024);
3373 wakaba 1.12 $data = '&' . $self->{kwd};
3374 wakaba 1.1 #
3375     } else {
3376     !!!cp (1025);
3377     $data = $self->{entity__value};
3378     $has_ref = 1;
3379     #
3380     }
3381     } else {
3382     !!!cp (1026);
3383     !!!parse-error (type => 'bare ero',
3384     line => $self->{line_prev},
3385 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3386     $data = '&' . $self->{kwd};
3387 wakaba 1.1 #
3388     }
3389    
3390     ## NOTE: In these cases, when a character reference is found,
3391     ## it is consumed and a character token is returned, or, otherwise,
3392     ## nothing is consumed and returned, according to the spec algorithm.
3393     ## In this implementation, anything that has been examined by the
3394     ## tokenizer is appended to the parent element or the attribute value
3395     ## as string, either literal string when no character reference or
3396     ## entity-replaced string otherwise, in this stage, since any characters
3397     ## that would not be consumed are appended in the data state or in an
3398     ## appropriate attribute value state anyway.
3399    
3400     if ($self->{prev_state} == DATA_STATE) {
3401     !!!cp (986);
3402     $self->{state} = $self->{prev_state};
3403 wakaba 1.5 $self->{s_kwd} = '';
3404 wakaba 1.1 ## Reconsume.
3405     !!!emit ({type => CHARACTER_TOKEN,
3406     data => $data,
3407 wakaba 1.7 has_reference => $has_ref,
3408 wakaba 1.1 line => $self->{line_prev},
3409 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3410 wakaba 1.1 });
3411     redo A;
3412     } else {
3413     !!!cp (985);
3414     $self->{ca}->{value} .= $data;
3415     $self->{ca}->{has_reference} = 1 if $has_ref;
3416     $self->{state} = $self->{prev_state};
3417 wakaba 1.5 $self->{s_kwd} = '';
3418 wakaba 1.1 ## Reconsume.
3419     redo A;
3420     }
3421 wakaba 1.8
3422     ## XML-only states
3423    
3424     } elsif ($self->{state} == PI_STATE) {
3425 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3426    
         ## First character after "<?".  White space, "?", and EOF are a
         ## parse error; the construct is then handled as a bogus comment
         ## whose data starts with "?".  Any other character becomes the
         ## first character of the target of a new PI token.
3427 wakaba 1.8 if ($is_space->{$self->{nc}} or
3428 wakaba 1.14 $self->{nc} == 0x003F or # ?
3429 wakaba 1.8 $self->{nc} == -1) {
3430 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3431     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3432     ## "DOCTYPE pi state": Parse error, switch to the "data
3433     ## state".
         ## The reported column is one less than |column_prev| unless
         ## the current character is EOF.
3434 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3435     line => $self->{line_prev},
3436     column => $self->{column_prev}
3437     - 1 * ($self->{nc} != -1));
3438     $self->{state} = BOGUS_COMMENT_STATE;
3439     ## Reconsume.
3440     $self->{ct} = {type => COMMENT_TOKEN,
3441     data => '?',
3442     line => $self->{line_prev},
3443     column => $self->{column_prev}
3444     - 1 * ($self->{nc} != -1),
3445     };
3446     redo A;
3447     } else {
3448 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3449 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3450     target => chr $self->{nc},
3451     data => '',
3452     line => $self->{line_prev},
3453     column => $self->{column_prev} - 1,
3454     };
3455     $self->{state} = PI_TARGET_STATE;
3456     !!!next-input-character;
3457     redo A;
3458     }
3459     } elsif ($self->{state} == PI_TARGET_STATE) {
         ## Reading the target of a processing instruction.  White space
         ## ends the target; "?" may begin the closing "?>"; EOF is a
         ## parse error that emits the PI token as is; any other
         ## character is appended to the target.
3460     if ($is_space->{$self->{nc}}) {
3461     $self->{state} = PI_TARGET_AFTER_STATE;
3462     !!!next-input-character;
3463     redo A;
3464     } elsif ($self->{nc} == -1) {
3465     !!!parse-error (type => 'no pic'); ## TODO: type
         ## Return to the internal subset when |in_subset| is set,
         ## otherwise to the data state.
3466 wakaba 1.13 if ($self->{in_subset}) {
3467     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3468     } else {
3469     $self->{state} = DATA_STATE;
3470     $self->{s_kwd} = '';
3471     }
3472 wakaba 1.8 ## Reconsume.
3473     !!!emit ($self->{ct}); # pi
3474     redo A;
3475     } elsif ($self->{nc} == 0x003F) { # ?
3476     $self->{state} = PI_AFTER_STATE;
3477     !!!next-input-character;
3478     redo A;
3479     } else {
3480     ## XML5: typo ("tag name" -> "target")
3481     $self->{ct}->{target} .= chr $self->{nc}; # pi
3482     !!!next-input-character;
3483     redo A;
3484     }
3485     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
         ## Skips the white space between the target and the data of a
         ## processing instruction.  The first non-space character
         ## (including EOF) is reprocessed in the PI data state.
3486     if ($is_space->{$self->{nc}}) {
3487     ## Stay in the state.
3488     !!!next-input-character;
3489     redo A;
3490     } else {
3491     $self->{state} = PI_DATA_STATE;
3492     ## Reprocess.
3493     redo A;
3494     }
3495     } elsif ($self->{state} == PI_DATA_STATE) {
         ## Reading the data of a processing instruction.  "?" may begin
         ## the closing "?>"; EOF is a parse error that emits the PI
         ## token as is; any other character is appended to |data|.
3496     if ($self->{nc} == 0x003F) { # ?
3497     $self->{state} = PI_DATA_AFTER_STATE;
3498     !!!next-input-character;
3499     redo A;
3500     } elsif ($self->{nc} == -1) {
3501     !!!parse-error (type => 'no pic'); ## TODO: type
3502 wakaba 1.13 if ($self->{in_subset}) {
3503 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3504 wakaba 1.13 } else {
3505     $self->{state} = DATA_STATE;
3506     $self->{s_kwd} = '';
3507     }
3508 wakaba 1.8 ## Reprocess.
3509     !!!emit ($self->{ct}); # pi
3510     redo A;
3511     } else {
3512     $self->{ct}->{data} .= chr $self->{nc}; # pi
3513     $self->{read_until}->($self->{ct}->{data}, q[?],
3514     length $self->{ct}->{data});
3515     ## Stay in the state.
3516     !!!next-input-character;
3517     ## The next character is processed in this state.
3518     redo A;
3519     }
3520     } elsif ($self->{state} == PI_AFTER_STATE) {
3521 wakaba 1.14 ## XML5: Part of "Pi after state".
3522    
         ## Entered when "?" directly follows the target.  ">" closes
         ## the processing instruction and emits the PI token.  Anything
         ## else is a parse error ('no s after target') and the "?" is
         ## taken as the first character of the data.
3523 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3524 wakaba 1.13 if ($self->{in_subset}) {
3525     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3526     } else {
3527     $self->{state} = DATA_STATE;
3528     $self->{s_kwd} = '';
3529     }
3530 wakaba 1.8 !!!next-input-character;
3531     !!!emit ($self->{ct}); # pi
3532     redo A;
3533     } elsif ($self->{nc} == 0x003F) { # ?
3534     !!!parse-error (type => 'no s after target', ## TODO: type
3535     line => $self->{line_prev},
3536     column => $self->{column_prev}); ## XML5: no error
3537     $self->{ct}->{data} .= '?';
3538     $self->{state} = PI_DATA_AFTER_STATE;
3539     !!!next-input-character;
3540     redo A;
3541     } else {
         ## The reported column is |column_prev|, plus one when the
         ## current character is EOF.
3542     !!!parse-error (type => 'no s after target', ## TODO: type
3543     line => $self->{line_prev},
3544     column => $self->{column_prev}
3545     + 1 * ($self->{nc} == -1)); ## XML5: no error
3546     $self->{ct}->{data} .= '?'; ## XML5: not appended
3547     $self->{state} = PI_DATA_STATE;
3548     ## Reprocess.
3549     redo A;
3550     }
3551     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3552 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3553    
         ## A "?" has been seen within the data but is not yet part of
         ## |data|.  ">" closes the processing instruction and emits the
         ## PI token.  Another "?" appends one "?" and keeps one pending.
         ## Anything else appends the pending "?" and is reprocessed in
         ## the PI data state.
3554 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3555 wakaba 1.13 if ($self->{in_subset}) {
3556     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3557     } else {
3558     $self->{state} = DATA_STATE;
3559     $self->{s_kwd} = '';
3560     }
3561 wakaba 1.8 !!!next-input-character;
3562     !!!emit ($self->{ct}); # pi
3563     redo A;
3564     } elsif ($self->{nc} == 0x003F) { # ?
3565     $self->{ct}->{data} .= '?';
3566     ## Stay in the state.
3567     !!!next-input-character;
3568     redo A;
3569     } else {
3570     $self->{ct}->{data} .= '?'; ## XML5: not appended
3571     $self->{state} = PI_DATA_STATE;
3572     ## Reprocess.
3573     redo A;
3574     }
3575 wakaba 1.12
3576     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
         ## Top level of the DOCTYPE internal subset.  "<" starts a
         ## markup construct, "]" ends the subset, white space is
         ## skipped, and EOF is a parse error that emits an
         ## end-of-DOCTYPE token.  Any other character is skipped, with
         ## a parse error for the first such character only.
3577     if ($self->{nc} == 0x003C) { # <
3578 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3579 wakaba 1.12 !!!next-input-character;
3580     redo A;
3581     } elsif ($self->{nc} == 0x0025) { # %
3582     ## XML5: Not defined yet.
3583    
3584     ## TODO: Parameter entity references are not handled yet; the "%" is just skipped.
3585     !!!next-input-character;
3586     redo A;
3587     } elsif ($self->{nc} == 0x005D) { # ]
3588 wakaba 1.13 delete $self->{in_subset};
3589 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3590     !!!next-input-character;
3591     redo A;
3592     } elsif ($is_space->{$self->{nc}}) {
3593     ## Stay in the state.
3594     !!!next-input-character;
3595     redo A;
3596     } elsif ($self->{nc} == -1) {
3597     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3598 wakaba 1.13 delete $self->{in_subset};
3599 wakaba 1.12 $self->{state} = DATA_STATE;
3600     $self->{s_kwd} = '';
3601     ## Reconsume.
3602 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3603 wakaba 1.12 redo A;
3604     } else {
         ## |internal_subset_tainted| limits the error to one report.
3605     unless ($self->{internal_subset_tainted}) {
3606     ## XML5: No parse error.
3607     !!!parse-error (type => 'string in internal subset');
3608     $self->{internal_subset_tainted} = 1;
3609     }
3610     ## Stay in the state.
3611     !!!next-input-character;
3612     redo A;
3613     }
3614     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
         ## After the "]" that closes the internal subset.
         ##   ">"   - switch to the data state and emit END_OF_DOCTYPE_TOKEN.
         ##   EOF   - parse error, then the same as ">" but EOF is reconsumed.
         ##   other - parse error; skip characters in
         ##           BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE.
3615     if ($self->{nc} == 0x003E) { # >
3616     $self->{state} = DATA_STATE;
3617     $self->{s_kwd} = '';
3618     !!!next-input-character;
3619 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3620 wakaba 1.12 redo A;
3621     } elsif ($self->{nc} == -1) {
3622     !!!parse-error (type => 'unclosed DOCTYPE');
3623     $self->{state} = DATA_STATE;
3624     $self->{s_kwd} = '';
3625     ## Reconsume.
3626 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3627 wakaba 1.12 redo A;
3628     } else {
3629     ## XML5: No parse error and stay in the state.
3630     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3631    
3632 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3633     !!!next-input-character;
3634     redo A;
3635     }
3636     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
         ## Discards characters until ">" or EOF, then switches to the data
         ## state and emits END_OF_DOCTYPE_TOKEN.  No parse error is reported
         ## in this state.
3637     if ($self->{nc} == 0x003E) { # >
3638     $self->{state} = DATA_STATE;
3639     $self->{s_kwd} = '';
3640     !!!next-input-character;
3641     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3642     redo A;
3643     } elsif ($self->{nc} == -1) {
3644     $self->{state} = DATA_STATE;
3645     $self->{s_kwd} = '';
3646     ## Reconsume.
3647     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3648     redo A;
3649     } else {
3650     ## Stay in the state.
3651     !!!next-input-character;
3652     redo A;
3653     }
3654     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
         ## After "<" inside the internal subset.
         ##   "!"   - markup declaration or comment.
         ##   "?"   - processing instruction.
         ##   EOF   - parse error; switch to the data state and reconsume.
         ##   other - parse error located at the "<"; the rest is consumed as
         ##           a bogus comment whose token is later discarded.
3655     if ($self->{nc} == 0x0021) { # !
3656 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3657 wakaba 1.13 !!!next-input-character;
3658     redo A;
3659     } elsif ($self->{nc} == 0x003F) { # ?
3660     $self->{state} = PI_STATE;
3661     !!!next-input-character;
3662     redo A;
3663     } elsif ($self->{nc} == -1) {
3664     !!!parse-error (type => 'bare stago');
3665     $self->{state} = DATA_STATE;
3666     $self->{s_kwd} = '';
3667     ## Reconsume.
3668     redo A;
3669     } else {
3670     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3671     line => $self->{line_prev},
3672     column => $self->{column_prev});
3673     $self->{state} = BOGUS_COMMENT_STATE;
3674     $self->{ct} = {type => COMMENT_TOKEN,
3675     data => '',
3676     }; ## NOTE: Will be discarded.
3677 wakaba 1.12 !!!next-input-character;
3678     redo A;
3679     }
3680 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
         ## After "<!" inside the internal subset.  Dispatches on the first
         ## character, case-insensitively for letters:
         ##   "-"     - MD_HYPHEN_STATE.
         ##   "E"/"e" - MD_E_STATE (ENTITY or ELEMENT).
         ##   "A"/"a" - MD_ATTLIST_STATE.
         ##   "N"/"n" - MD_NOTATION_STATE.
         ## For the letter cases |kwd| is initialised with the character as
         ## written, so later states can detect non-uppercase spellings.
         ## Anything else falls through to the bogus comment handling below.
3681     ## XML5: "DOCTYPE markup declaration state".
3682    
3683     if ($self->{nc} == 0x002D) { # -
3684     $self->{state} = MD_HYPHEN_STATE;
3685     !!!next-input-character;
3686     redo A;
3687 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3688     $self->{nc} == 0x0065) { # e
3689 wakaba 1.14 $self->{state} = MD_E_STATE;
3690     $self->{kwd} = chr $self->{nc};
3691     !!!next-input-character;
3692     redo A;
3693 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3694     $self->{nc} == 0x0061) { # a
3695 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3696     $self->{kwd} = chr $self->{nc};
3697     !!!next-input-character;
3698     redo A;
3699 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3700     $self->{nc} == 0x006E) { # n
3701 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3702     $self->{kwd} = chr $self->{nc};
3703     !!!next-input-character;
3704     redo A;
3705     } else {
3706     #
3707     }
3708    
3709     ## XML5: No parse error.
3710     !!!parse-error (type => 'bogus comment',
3711     line => $self->{line_prev},
3712     column => $self->{column_prev} - 1);
3713     ## Reconsume.
3714     $self->{state} = BOGUS_COMMENT_STATE;
3715     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3716     redo A;
3717     } elsif ($self->{state} == MD_E_STATE) {
         ## After "<!E" (any case).  "N"/"n" selects the ENTITY keyword and
         ## "L"/"l" the ELEMENT keyword; the character is appended to |kwd|
         ## as written.  Anything else is a parse error and the input is
         ## reconsumed as a bogus comment.  The error column is shifted by
         ## one when the current character is EOF.
3718 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3719     $self->{nc} == 0x006E) { # n
3720 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3721     $self->{kwd} .= chr $self->{nc};
3722     !!!next-input-character;
3723     redo A;
3724 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3725     $self->{nc} == 0x006C) { # l
3726 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3727     $self->{state} = MD_ELEMENT_STATE;
3728     $self->{kwd} .= chr $self->{nc};
3729     !!!next-input-character;
3730     redo A;
3731     } else {
3732     ## XML5: No parse error.
3733     !!!parse-error (type => 'bogus comment',
3734     line => $self->{line_prev},
3735     column => $self->{column_prev} - 2
3736     + 1 * ($self->{nc} == -1));
3737     ## Reconsume.
3738     $self->{state} = BOGUS_COMMENT_STATE;
3739     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3740     redo A;
3741     }
3742     } elsif ($self->{state} == MD_ENTITY_STATE) {
         ## Matches the rest of the "ENTITY" keyword, case-insensitively.
         ## |kwd| holds the characters matched so far as written; its length
         ## indexes the two lookup tables below (upper- and lowercase) to get
         ## the next expected character.  Indexes 0 and 1 are undef because
         ## this state is entered with two characters already in |kwd|.
         ## When the final "Y"/"y" arrives, a parse error is reported unless
         ## the keyword was spelled exactly "ENTITY", and a
         ## GENERAL_ENTITY_TOKEN is started.  Any mismatch is reconsumed as a
         ## bogus comment.
3743 wakaba 1.17 if ($self->{nc} == [
3744     undef,
3745     undef,
3746     0x0054, # T
3747     0x0049, # I
3748     0x0054, # T
3749     ]->[length $self->{kwd}] or
3750     $self->{nc} == [
3751     undef,
3752     undef,
3753     0x0074, # t
3754     0x0069, # i
3755     0x0074, # t
3756     ]->[length $self->{kwd}]) {
3757 wakaba 1.14 ## Stay in the state.
3758     $self->{kwd} .= chr $self->{nc};
3759     !!!next-input-character;
3760     redo A;
3761 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3762     ($self->{nc} == 0x0059 or # Y
3763     $self->{nc} == 0x0079)) { # y
3764     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3765     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3766     text => 'ENTITY',
3767     line => $self->{line_prev},
3768     column => $self->{column_prev} - 4);
3769     }
3770     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3771 wakaba 1.14 line => $self->{line_prev},
3772     column => $self->{column_prev} - 6};
3773     $self->{state} = DOCTYPE_MD_STATE;
3774     !!!next-input-character;
3775     redo A;
3776     } else {
3777     !!!parse-error (type => 'bogus comment',
3778     line => $self->{line_prev},
3779     column => $self->{column_prev} - 1
3780     - (length $self->{kwd})
3781     + 1 * ($self->{nc} == -1));
3782     $self->{state} = BOGUS_COMMENT_STATE;
3783     ## Reconsume.
3784     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3785     redo A;
3786     }
3787     } elsif ($self->{state} == MD_ELEMENT_STATE) {
         ## Matches the rest of the "ELEMENT" keyword, case-insensitively.
         ## |kwd| holds the characters matched so far as written; its length
         ## indexes the two lookup tables below (upper- and lowercase) to get
         ## the next expected character.  Indexes 0 and 1 are undef because
         ## this state is entered with two characters already in |kwd|.
         ## When the final "T"/"t" arrives, a parse error is reported unless
         ## the keyword was spelled exactly "ELEMENT", and an ELEMENT_TOKEN
         ## is started.  Any mismatch is reconsumed as a bogus comment.
3788 wakaba 1.17 if ($self->{nc} == [
3789     undef,
3790     undef,
3791     0x0045, # E
3792     0x004D, # M
3793     0x0045, # E
3794     0x004E, # N
3795     ]->[length $self->{kwd}] or
3796     $self->{nc} == [
3797     undef,
3798     undef,
3799     0x0065, # e
3800     0x006D, # m
3801     0x0065, # e
3802     0x006E, # n
3803     ]->[length $self->{kwd}]) {
3804 wakaba 1.14 ## Stay in the state.
3805     $self->{kwd} .= chr $self->{nc};
3806     !!!next-input-character;
3807     redo A;
3808 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3809     ($self->{nc} == 0x0054 or # T
3810     $self->{nc} == 0x0074)) { # t
3811     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3812     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3813     text => 'ELEMENT',
3814     line => $self->{line_prev},
3815     column => $self->{column_prev} - 5);
3816     }
3817 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3818     line => $self->{line_prev},
         ## |column_prev| is the column of the "N" in "<!ELEMEN", so the "<"
         ## that starts the declaration is 7 columns back.  (The ENTITY
         ## branch uses 6 because its keyword is one letter shorter.)
3819     column => $self->{column_prev} - 7};
3820     $self->{state} = DOCTYPE_MD_STATE;
3821     !!!next-input-character;
3822     redo A;
3823     } else {
3824     !!!parse-error (type => 'bogus comment',
3825     line => $self->{line_prev},
3826     column => $self->{column_prev} - 1
3827     - (length $self->{kwd})
3828     + 1 * ($self->{nc} == -1));
3829     $self->{state} = BOGUS_COMMENT_STATE;
3830     ## Reconsume.
3831     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3832     redo A;
3833     }
3834     } elsif ($self->{state} == MD_ATTLIST_STATE) {
         ## Matches the rest of the "ATTLIST" keyword, case-insensitively.
         ## |kwd| holds the characters matched so far as written; its length
         ## indexes the two lookup tables below (upper- and lowercase) to get
         ## the next expected character.  Index 0 is undef because this state
         ## is entered with one character already in |kwd|.
         ## When the final "T"/"t" arrives, a parse error is reported unless
         ## the keyword was spelled exactly "ATTLIST", and an ATTLIST_TOKEN
         ## with an empty |attrdefs| list is started.  Any mismatch is
         ## reconsumed as a bogus comment.
3835 wakaba 1.17 if ($self->{nc} == [
3836     undef,
3837     0x0054, # T
3838     0x0054, # T
3839     0x004C, # L
3840     0x0049, # I
3841     0x0053, # S
3842     ]->[length $self->{kwd}] or
3843     $self->{nc} == [
3844     undef,
3845     0x0074, # t
3846     0x0074, # t
3847     0x006C, # l
3848     0x0069, # i
3849     0x0073, # s
3850     ]->[length $self->{kwd}]) {
3851 wakaba 1.14 ## Stay in the state.
3852     $self->{kwd} .= chr $self->{nc};
3853     !!!next-input-character;
3854     redo A;
3855 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3856     ($self->{nc} == 0x0054 or # T
3857     $self->{nc} == 0x0074)) { # t
3858     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3859     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3860     text => 'ATTLIST',
3861     line => $self->{line_prev},
3862     column => $self->{column_prev} - 5);
3863     }
3864 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3865 wakaba 1.15 attrdefs => [],
3866 wakaba 1.14 line => $self->{line_prev},
         ## |column_prev| is the column of the "S" in "<!ATTLIS", so the "<"
         ## that starts the declaration is 7 columns back.  (The ENTITY
         ## branch uses 6 because its keyword is one letter shorter.)
3867     column => $self->{column_prev} - 7};
3868     $self->{state} = DOCTYPE_MD_STATE;
3869     !!!next-input-character;
3870     redo A;
3871     } else {
3872     !!!parse-error (type => 'bogus comment',
3873     line => $self->{line_prev},
3874     column => $self->{column_prev} - 1
3875     - (length $self->{kwd})
3876     + 1 * ($self->{nc} == -1));
3877     $self->{state} = BOGUS_COMMENT_STATE;
3878     ## Reconsume.
3879     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3880     redo A;
3881     }
3882     } elsif ($self->{state} == MD_NOTATION_STATE) {
         ## Matches the rest of the "NOTATION" keyword, case-insensitively.
         ## |kwd| holds the characters matched so far as written; its length
         ## indexes the two lookup tables below (upper- and lowercase) to get
         ## the next expected character.  Index 0 is undef because this state
         ## is entered with one character already in |kwd|.
         ## When the final "N"/"n" arrives, a parse error is reported unless
         ## the keyword was spelled exactly "NOTATION", and a NOTATION_TOKEN
         ## is started.  Any mismatch is reconsumed as a bogus comment.
3883 wakaba 1.17 if ($self->{nc} == [
3884     undef,
3885     0x004F, # O
3886     0x0054, # T
3887     0x0041, # A
3888     0x0054, # T
3889     0x0049, # I
3890     0x004F, # O
3891     ]->[length $self->{kwd}] or
3892     $self->{nc} == [
3893     undef,
3894     0x006F, # o
3895     0x0074, # t
3896     0x0061, # a
3897     0x0074, # t
3898     0x0069, # i
3899     0x006F, # o
3900     ]->[length $self->{kwd}]) {
3901 wakaba 1.14 ## Stay in the state.
3902     $self->{kwd} .= chr $self->{nc};
3903     !!!next-input-character;
3904     redo A;
3905 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
3906     ($self->{nc} == 0x004E or # N
3907     $self->{nc} == 0x006E)) { # n
3908     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3909     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3910     text => 'NOTATION',
3911     line => $self->{line_prev},
3912     column => $self->{column_prev} - 6);
3913     }
3914 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3915     line => $self->{line_prev},
         ## |column_prev| is the column of the "O" in "<!NOTATIO", so the "<"
         ## that starts the declaration is 8 columns back.  (The ENTITY
         ## branch uses 6 because its keyword is two letters shorter.)
3916     column => $self->{column_prev} - 8};
3917     $self->{state} = DOCTYPE_MD_STATE;
3918     !!!next-input-character;
3919     redo A;
3920     } else {
3921     !!!parse-error (type => 'bogus comment',
3922     line => $self->{line_prev},
3923     column => $self->{column_prev} - 1
3924     - (length $self->{kwd})
3925     + 1 * ($self->{nc} == -1));
3926     $self->{state} = BOGUS_COMMENT_STATE;
3927     ## Reconsume.
3928     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3929     redo A;
3930     }
3931     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
         ## Immediately after a markup declaration keyword.
         ##   space - proceed to BEFORE_MD_NAME_STATE.
         ##   "%"   - only for a general entity token: parse error (missing
         ##           space), then DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE.
         ##   EOF   - parse error; back to the internal subset state, which
         ##           reconsumes the EOF.  The current token is not emitted.
         ##   ">"   - parse error (no name); back to the internal subset
         ##           state.  The current token is not emitted.
         ##   other - parse error (missing space); the character is
         ##           reprocessed in BEFORE_MD_NAME_STATE.
3932     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3933     ## "DOCTYPE NOTATION state".
3934    
3935     if ($is_space->{$self->{nc}}) {
3936     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3937     $self->{state} = BEFORE_MD_NAME_STATE;
3938     !!!next-input-character;
3939     redo A;
3940     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3941     $self->{nc} == 0x0025) { # %
3942     ## XML5: Switch to the "DOCTYPE bogus comment state".
3943     !!!parse-error (type => 'no space before md name'); ## TODO: type
3944     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3945     !!!next-input-character;
3946     redo A;
3947     } elsif ($self->{nc} == -1) {
3948     !!!parse-error (type => 'unclosed md'); ## TODO: type
3949     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3950     ## Reconsume.
3951     redo A;
3952     } elsif ($self->{nc} == 0x003E) { # >
3953     ## XML5: Switch to the "DOCTYPE bogus comment state".
3954     !!!parse-error (type => 'no md name'); ## TODO: type
3955     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3956     !!!next-input-character;
3957     redo A;
3958     } else {
3959     ## XML5: Switch to the "DOCTYPE bogus comment state".
3960     !!!parse-error (type => 'no space before md name'); ## TODO: type
3961     $self->{state} = BEFORE_MD_NAME_STATE;
3962     redo A;
3963     }
3964     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
         ## Skips white space before the name of a markup declaration.
         ##   space - skipped.
         ##   "%"   - only for a general entity token: go to
         ##           DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE.
         ##   ">"   - parse error (no name); back to the internal subset
         ##           state.  The current token is not emitted.
         ##   EOF   - parse error; back to the internal subset state, which
         ##           reconsumes the EOF.  The current token is not emitted.
         ##   other - first character of the name; continue in MD_NAME_STATE.
3965     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3966     ## before state", "DOCTYPE ATTLIST name before state".
3967    
3968     if ($is_space->{$self->{nc}}) {
3969     ## Stay in the state.
3970     !!!next-input-character;
3971     redo A;
3972     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3973     $self->{nc} == 0x0025) { # %
3974     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3975     !!!next-input-character;
3976     redo A;
3977     } elsif ($self->{nc} == 0x003E) { # >
3978     ## XML5: Same as "Anything else".
3979     !!!parse-error (type => 'no md name'); ## TODO: type
3980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3981     !!!next-input-character;
3982     redo A;
3983     } elsif ($self->{nc} == -1) {
3984     !!!parse-error (type => 'unclosed md'); ## TODO: type
3985     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3986     ## Reconsume.
3987     redo A;
3988     } else {
3989     ## XML5: [ATTLIST] Not defined yet.
3990     $self->{ct}->{name} .= chr $self->{nc};
3991     $self->{state} = MD_NAME_STATE;
3992     !!!next-input-character;
3993     redo A;
3994     }
3995     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
         ## After the "%" of a parameter entity declaration.
         ##   space - retype the current token as PARAMETER_ENTITY_TOKEN and
         ##           go on to read the entity name.
         ##   ">"   - parse error (no name); back to the internal subset
         ##           state.  The current token is not emitted.
         ##   EOF   - parse error; back to the internal subset state, which
         ##           reconsumes the EOF.
         ##   other - parse error; the current token is replaced by a comment
         ##           token and the input is reconsumed as a bogus comment.
3996     if ($is_space->{$self->{nc}}) {
3997     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3998     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3999     $self->{state} = BEFORE_MD_NAME_STATE;
4000     !!!next-input-character;
4001     redo A;
4002     } elsif ($self->{nc} == 0x003E) { # >
4003     ## XML5: Same as "Anything else".
4004     !!!parse-error (type => 'no md name'); ## TODO: type
4005     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4006     !!!next-input-character;
4007     redo A;
4008     } elsif ($self->{nc} == -1) {
4009     !!!parse-error (type => 'unclosed md');
4010     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4011     ## Reconsume.
4012     redo A;
4013     } else {
4014     ## XML5: No parse error.
4015     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4016     $self->{state} = BOGUS_COMMENT_STATE;
4017     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4018     ## Reconsume.
4019     redo A;
4020     }
4021     } elsif ($self->{state} == MD_NAME_STATE) {
         ## Accumulates the name of a markup declaration in |ct|->{name}.
         ##   space - the name is complete; the next state depends on the
         ##           token type.  ELEMENT currently shares the ATTLIST path
         ##           (marked TODO below).
         ##   ">"   - emit the token; a parse error is reported unless it is
         ##           an ATTLIST token.
         ##   EOF   - parse error; emit the token and let the internal subset
         ##           state reconsume the EOF.
         ##   other - appended to the name.
4022     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4023    
4024     if ($is_space->{$self->{nc}}) {
4025 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4026     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4027     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4028     ## TODO: ...
4029     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4030     } else { # ENTITY/NOTATION
4031     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4032     }
4033 wakaba 1.14 !!!next-input-character;
4034     redo A;
4035     } elsif ($self->{nc} == 0x003E) { # >
4036     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4037     #
4038     } else {
4039 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4040 wakaba 1.14 }
4041     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4042     !!!next-input-character;
4043     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4044     redo A;
4045     } elsif ($self->{nc} == -1) {
4046     ## XML5: [ATTLIST] No parse error.
4047     !!!parse-error (type => 'unclosed md');
4048     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4049     ## Reconsume.
4050     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4051     redo A;
4052     } else {
4053     ## XML5: [ATTLIST] Not defined yet.
4054     $self->{ct}->{name} .= chr $self->{nc};
4055     ## Stay in the state.
4056     !!!next-input-character;
4057     redo A;
4058     }
4059     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
         ## Between attribute definitions of an ATTLIST declaration.
         ##   space - skipped.
         ##   ">"   - emit the current token.
         ##   EOF   - parse error; emit the token.  The input is not
         ##           advanced, so the internal subset state sees the EOF.
         ##   other - start a new attribute definition in |ca|, recording the
         ##           current line and column, and read its name.
4060     if ($is_space->{$self->{nc}}) {
4061     ## Stay in the state.
4062     !!!next-input-character;
4063     redo A;
4064     } elsif ($self->{nc} == 0x003E) { # >
4065     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066     !!!next-input-character;
4067     !!!emit ($self->{ct}); # ATTLIST
4068     redo A;
4069     } elsif ($self->{nc} == -1) {
4070     ## XML5: No parse error.
4071     !!!parse-error (type => 'unclosed md'); ## TODO: type
4072     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4073 wakaba 1.15 !!!emit ($self->{ct});
4074     redo A;
4075     } else {
4076     ## XML5: Not defined yet.
4077     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4078     tokens => [],
4079     line => $self->{line}, column => $self->{column}};
4080     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4081     !!!next-input-character;
4082     redo A;
4083     }
4084     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
         ## Accumulates the attribute name of the current attribute
         ## definition in |ca|->{name}.
         ##   space - the name is complete.
         ##   ">"   - parse error (no type); emit the ATTLIST token.
         ##   "("   - parse error (missing space); start of an allowed-token
         ##           group.
         ##   EOF   - parse error; emit the token.
         ##   other - appended to the name.
4085     if ($is_space->{$self->{nc}}) {
4086     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4087     !!!next-input-character;
4088     redo A;
4089     } elsif ($self->{nc} == 0x003E) { # >
4090     ## XML5: Same as "anything else".
4091     !!!parse-error (type => 'no attr type'); ## TODO: type
4092     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4093     !!!next-input-character;
4094     !!!emit ($self->{ct}); # ATTLIST
4095     redo A;
4096     } elsif ($self->{nc} == 0x0028) { # (
4097     ## XML5: Same as "anything else".
4098     !!!parse-error (type => 'no space before paren'); ## TODO: type
4099     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4100     !!!next-input-character;
4101     redo A;
4102     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4103     ## XML5: No parse error.
4104     !!!parse-error (type => 'unclosed md'); ## TODO: type
4105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4106     !!!next-input-character;
4107     !!!emit ($self->{ct}); # ATTLIST
4108     redo A;
4109     } else {
4110     ## XML5: Not defined yet.
4111     $self->{ca}->{name} .= chr $self->{nc};
4112     ## Stay in the state.
4113     !!!next-input-character;
4114     redo A;
4115     }
4116     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
         ## After the attribute name and at least one white space character.
         ##   space - skipped.
         ##   ">"   - parse error (no type); emit the ATTLIST token.
         ##   "("   - start of an allowed-token group.
         ##   EOF   - parse error; emit the token.
         ##   other - first character of the attribute type, stored in
         ##           |ca|->{type}.
4117     if ($is_space->{$self->{nc}}) {
4118     ## Stay in the state.
4119     !!!next-input-character;
4120     redo A;
4121     } elsif ($self->{nc} == 0x003E) { # >
4122     ## XML5: Same as "anything else".
4123     !!!parse-error (type => 'no attr type'); ## TODO: type
4124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125     !!!next-input-character;
4126     !!!emit ($self->{ct}); # ATTLIST
4127     redo A;
4128     } elsif ($self->{nc} == 0x0028) { # (
4129     ## XML5: Same as "anything else".
4130     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4131     !!!next-input-character;
4132     redo A;
4133     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4134     ## XML5: No parse error.
4135     !!!parse-error (type => 'unclosed md'); ## TODO: type
4136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137     !!!next-input-character;
4138     !!!emit ($self->{ct});
4139 wakaba 1.14 redo A;
4140     } else {
4141     ## XML5: Not defined yet.
4142 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4143     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4144     !!!next-input-character;
4145     redo A;
4146     }
4147     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
         ## Accumulates the attribute type keyword in |ca|->{type}.
         ##   space      - the type is complete.
         ##   "#"        - parse error (missing space); start of a default
         ##                declaration keyword.
         ##   '"' or "'" - parse error (missing space); start of a quoted
         ##                default value, with |ca|->{value} reset to ''.
         ##   ">"        - parse error (no default); emit the ATTLIST token.
         ##   "("        - parse error (missing space); start of an
         ##                allowed-token group.
         ##   EOF        - parse error; emit the token.
         ##   other      - appended to the type.
4148     if ($is_space->{$self->{nc}}) {
4149     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4150     !!!next-input-character;
4151     redo A;
4152     } elsif ($self->{nc} == 0x0023) { # #
4153     ## XML5: Same as "anything else".
4154     !!!parse-error (type => 'no space before default value'); ## TODO: type
4155     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4156     !!!next-input-character;
4157     redo A;
4158     } elsif ($self->{nc} == 0x0022) { # "
4159     ## XML5: Same as "anything else".
4160     !!!parse-error (type => 'no space before default value'); ## TODO: type
4161     $self->{ca}->{value} = '';
4162     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4163     !!!next-input-character;
4164     redo A;
4165     } elsif ($self->{nc} == 0x0027) { # '
4166     ## XML5: Same as "anything else".
4167     !!!parse-error (type => 'no space before default value'); ## TODO: type
4168     $self->{ca}->{value} = '';
4169     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4170     !!!next-input-character;
4171     redo A;
4172     } elsif ($self->{nc} == 0x003E) { # >
4173     ## XML5: Same as "anything else".
4174     !!!parse-error (type => 'no attr default'); ## TODO: type
4175     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4176     !!!next-input-character;
4177     !!!emit ($self->{ct}); # ATTLIST
4178     redo A;
4179     } elsif ($self->{nc} == 0x0028) { # (
4180     ## XML5: Same as "anything else".
4181     !!!parse-error (type => 'no space before paren'); ## TODO: type
4182     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4183     !!!next-input-character;
4184     redo A;
4185     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4186     ## XML5: No parse error.
4187     !!!parse-error (type => 'unclosed md'); ## TODO: type
4188     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4189     !!!next-input-character;
4190     !!!emit ($self->{ct});
4191     redo A;
4192     } else {
4193     ## XML5: Not defined yet.
4194     $self->{ca}->{type} .= chr $self->{nc};
4195     ## Stay in the state.
4196     !!!next-input-character;
4197     redo A;
4198     }
4199     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
         ## After the attribute type and at least one white space character.
         ##   space      - skipped.
         ##   "("        - start of an allowed-token group.
         ##   "#"        - start of a default declaration keyword.
         ##   '"' or "'" - start of a quoted default value, with
         ##                |ca|->{value} reset to ''.
         ##   ">"        - parse error (no default); emit the ATTLIST token.
         ##   EOF        - parse error; emit the token.
         ##   other      - parse error; the character is reconsumed as the
         ##                start of an unquoted default value.
4200     if ($is_space->{$self->{nc}}) {
4201     ## Stay in the state.
4202     !!!next-input-character;
4203     redo A;
4204     } elsif ($self->{nc} == 0x0028) { # (
4205     ## XML5: Same as "anything else".
4206     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4207     !!!next-input-character;
4208     redo A;
4209     } elsif ($self->{nc} == 0x0023) { # #
4210     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4211     !!!next-input-character;
4212     redo A;
4213     } elsif ($self->{nc} == 0x0022) { # "
4214     ## XML5: Same as "anything else".
4215     $self->{ca}->{value} = '';
4216     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4217     !!!next-input-character;
4218     redo A;
4219     } elsif ($self->{nc} == 0x0027) { # '
4220     ## XML5: Same as "anything else".
4221     $self->{ca}->{value} = '';
4222     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4223     !!!next-input-character;
4224     redo A;
4225     } elsif ($self->{nc} == 0x003E) { # >
4226     ## XML5: Same as "anything else".
4227     !!!parse-error (type => 'no attr default'); ## TODO: type
4228     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4229     !!!next-input-character;
4230     !!!emit ($self->{ct}); # ATTLIST
4231     redo A;
4232     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4233     ## XML5: No parse error.
4234     !!!parse-error (type => 'unclosed md'); ## TODO: type
4235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4236     !!!next-input-character;
4237     !!!emit ($self->{ct});
4238     redo A;
4239     } else {
4240     ## XML5: Switch to the "DOCTYPE bogus comment state".
4241     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4242     $self->{ca}->{value} = '';
4243     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4244     ## Reconsume.
4245     redo A;
4246     }
4247     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
         ## Inside an allowed-token group, before a token (after "(" or "|").
         ##   space - skipped.
         ##   "|"   - parse error (empty token); keep waiting for a token.
         ##   ")"   - parse error (empty token); the group is closed.
         ##   ">"   - parse error (group not closed); emit the ATTLIST token.
         ##   EOF   - parse error; emit the token.
         ##   other - start a new entry in |ca|->{tokens}.
4248     if ($is_space->{$self->{nc}}) {
4249     ## Stay in the state.
4250     !!!next-input-character;
4251     redo A;
4252     } elsif ($self->{nc} == 0x007C) { # |
4253     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4254     ## Stay in the state.
4255     !!!next-input-character;
4256     redo A;
4257     } elsif ($self->{nc} == 0x0029) { # )
4258     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4259     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4260     !!!next-input-character;
4261     redo A;
4262     } elsif ($self->{nc} == 0x003E) { # >
4263     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4264     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4265     !!!next-input-character;
4266     !!!emit ($self->{ct}); # ATTLIST
4267     redo A;
4268     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4269     ## XML5: No parse error.
4270     !!!parse-error (type => 'unclosed md'); ## TODO: type
4271     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4272     !!!next-input-character;
4273     !!!emit ($self->{ct});
4274     redo A;
4275     } else {
4276     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4277     $self->{state} = ALLOWED_TOKEN_STATE;
4278     !!!next-input-character;
4279     redo A;
4280     }
4281     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
         ## Accumulates the current allowed token, which is the last entry
         ## of |ca|->{tokens}.
         ##   space - the token may be complete (AFTER_ALLOWED_TOKEN_STATE).
         ##   "|"   - another token follows.
         ##   ")"   - the group is closed.
         ##   ">"   - parse error (group not closed); emit the ATTLIST token.
         ##   EOF   - parse error; emit the token.
         ##   other - appended to the current allowed token.
4282     if ($is_space->{$self->{nc}}) {
4283     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4284     !!!next-input-character;
4285     redo A;
4286     } elsif ($self->{nc} == 0x007C) { # |
4287     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4288     !!!next-input-character;
4289     redo A;
4290     } elsif ($self->{nc} == 0x0029) { # )
4291     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4292     !!!next-input-character;
4293     redo A;
4294     } elsif ($self->{nc} == 0x003E) { # >
4295     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4296     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4297     !!!next-input-character;
4298     !!!emit ($self->{ct}); # ATTLIST
4299     redo A;
4300     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4301     ## XML5: No parse error.
4302     !!!parse-error (type => 'unclosed md'); ## TODO: type
4303     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4304     !!!next-input-character;
4305     !!!emit ($self->{ct});
4306     redo A;
4307     } else {
4308     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4309     ## Stay in the state.
4310     !!!next-input-character;
4311     redo A;
4312     }
4313     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
         ## After an allowed token followed by white space.
         ##   space - skipped.
         ##   "|"   - another token follows.
         ##   ")"   - the group is closed.
         ##   ">"   - parse error (group not closed); emit the ATTLIST token.
         ##   EOF   - parse error; emit the token.
         ##   other - parse error: the white space was inside a token.  A
         ##           single space plus the character is appended to the
         ##           last token, however much white space was skipped.
4314     if ($is_space->{$self->{nc}}) {
4315     ## Stay in the state.
4316     !!!next-input-character;
4317     redo A;
4318     } elsif ($self->{nc} == 0x007C) { # |
4319     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4320     !!!next-input-character;
4321     redo A;
4322     } elsif ($self->{nc} == 0x0029) { # )
4323     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4324     !!!next-input-character;
4325     redo A;
4326     } elsif ($self->{nc} == 0x003E) { # >
4327     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4328     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4329     !!!next-input-character;
4330     !!!emit ($self->{ct}); # ATTLIST
4331     redo A;
4332     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4333     ## XML5: No parse error.
4334     !!!parse-error (type => 'unclosed md'); ## TODO: type
4335     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4336     !!!next-input-character;
4337     !!!emit ($self->{ct});
4338     redo A;
4339     } else {
4340     !!!parse-error (type => 'space in allowed token', ## TODO: type
4341     line => $self->{line_prev},
4342     column => $self->{column_prev});
4343     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4344     $self->{state} = ALLOWED_TOKEN_STATE;
4345     !!!next-input-character;
4346     redo A;
4347     }
4348     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
         ## Immediately after the ")" that closes an allowed-token group.
         ## White space is expected here; every other continuation except
         ## EOF is handled after a parse error.
         ##   space      - go to BEFORE_ATTR_DEFAULT_STATE.
         ##   "#"        - start of a default declaration keyword.
         ##   '"' or "'" - start of a quoted default value, with
         ##                |ca|->{value} reset to ''.
         ##   ">"        - no default; emit the ATTLIST token.
         ##   EOF        - parse error; emit the token.
         ##   other      - reconsumed as an unquoted default value.
         ##                NOTE(review): |ca|->{value} is not reset to ''
         ##                here, unlike the matching branch of
         ##                DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE --
         ##                confirm.
4349     if ($is_space->{$self->{nc}}) {
4350     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4351     !!!next-input-character;
4352     redo A;
4353     } elsif ($self->{nc} == 0x0023) { # #
4354     !!!parse-error (type => 'no space before default value'); ## TODO: type
4355     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4356     !!!next-input-character;
4357     redo A;
4358     } elsif ($self->{nc} == 0x0022) { # "
4359     !!!parse-error (type => 'no space before default value'); ## TODO: type
4360     $self->{ca}->{value} = '';
4361     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4362     !!!next-input-character;
4363     redo A;
4364     } elsif ($self->{nc} == 0x0027) { # '
4365     !!!parse-error (type => 'no space before default value'); ## TODO: type
4366     $self->{ca}->{value} = '';
4367     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4368     !!!next-input-character;
4369     redo A;
4370     } elsif ($self->{nc} == 0x003E) { # >
4371     !!!parse-error (type => 'no attr default'); ## TODO: type
4372     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4373     !!!next-input-character;
4374     !!!emit ($self->{ct}); # ATTLIST
4375     redo A;
4376     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4377     !!!parse-error (type => 'unclosed md'); ## TODO: type
4378     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4379     !!!next-input-character;
4380     !!!emit ($self->{ct});
4381     redo A;
4382     } else {
4383     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4384     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4385     ## Reconsume.
4386     redo A;
4387     }
4388     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
         ## After an allowed-token group and at least one white space
         ## character, before the default declaration.
         ##   space      - skipped.
         ##   "#"        - start of a default declaration keyword.
         ##   '"' or "'" - start of a quoted default value, with
         ##                |ca|->{value} reset to ''.
         ##   ">"        - parse error (no default); emit the ATTLIST token.
         ##   EOF        - parse error; emit the token.
         ##   other      - parse error; reconsumed as an unquoted default
         ##                value.
         ##                NOTE(review): |ca|->{value} is not reset to ''
         ##                here, unlike the matching branch of
         ##                DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE --
         ##                confirm.
4389     if ($is_space->{$self->{nc}}) {
4390     ## Stay in the state.
4391     !!!next-input-character;
4392     redo A;
4393     } elsif ($self->{nc} == 0x0023) { # #
4394     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4395     !!!next-input-character;
4396     redo A;
4397     } elsif ($self->{nc} == 0x0022) { # "
4398     $self->{ca}->{value} = '';
4399     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4400     !!!next-input-character;
4401     redo A;
4402     } elsif ($self->{nc} == 0x0027) { # '
4403     $self->{ca}->{value} = '';
4404     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4405     !!!next-input-character;
4406     redo A;
4407     } elsif ($self->{nc} == 0x003E) { # >
4408     !!!parse-error (type => 'no attr default'); ## TODO: type
4409     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4410     !!!next-input-character;
4411     !!!emit ($self->{ct}); # ATTLIST
4412     redo A;
4413     } elsif ($self->{nc} == -1) {
         ## NOTE(review): this EOF branch advances the input, whereas the
         ## EOF branch of MD_NAME_STATE only reconsumes -- confirm that
         ## reading past EOF is intended.
4414     !!!parse-error (type => 'unclosed md'); ## TODO: type
4415     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4416     !!!next-input-character;
4417     !!!emit ($self->{ct});
4418     redo A;
4419     } else {
4420     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4421     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4422     ## Reconsume.
4423     redo A;
4424     }
4425     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4426     if ($is_space->{$self->{nc}}) {
4427     ## XML5: No parse error.
4428     !!!parse-error (type => 'no default type'); ## TODO: type
4429 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4430 wakaba 1.14 ## Reconsume.
4431     redo A;
4432 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4433     ## XML5: Same as "anything else".
4434     $self->{ca}->{value} = '';
4435     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4436     !!!next-input-character;
4437     redo A;
4438     } elsif ($self->{nc} == 0x0027) { # '
4439     ## XML5: Same as "anything else".
4440     $self->{ca}->{value} = '';
4441     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4442     !!!next-input-character;
4443     redo A;
4444     } elsif ($self->{nc} == 0x003E) { # >
4445     ## XML5: Same as "anything else".
4446     !!!parse-error (type => 'no attr default'); ## TODO: type
4447     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4448     !!!next-input-character;
4449     !!!emit ($self->{ct}); # ATTLIST
4450     redo A;
4451     } elsif ($self->{nc} == -1) {
4452     ## XML5: No parse error.
4453     !!!parse-error (type => 'unclosed md'); ## TODO: type
4454     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4455     !!!next-input-character;
4456     !!!emit ($self->{ct});
4457     redo A;
4458     } else {
4459     $self->{ca}->{default} = chr $self->{nc};
4460     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4461     !!!next-input-character;
4462     redo A;
4463 wakaba 1.14 }
4464 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4465     if ($is_space->{$self->{nc}}) {
4466     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4467     !!!next-input-character;
4468     redo A;
4469     } elsif ($self->{nc} == 0x0022) { # "
4470     ## XML5: Same as "anything else".
4471     !!!parse-error (type => 'no space before default value'); ## TODO: type
4472     $self->{ca}->{value} = '';
4473     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4474     !!!next-input-character;
4475     redo A;
4476     } elsif ($self->{nc} == 0x0027) { # '
4477     ## XML5: Same as "anything else".
4478     !!!parse-error (type => 'no space before default value'); ## TODO: type
4479     $self->{ca}->{value} = '';
4480     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4481     !!!next-input-character;
4482     redo A;
4483     } elsif ($self->{nc} == 0x003E) { # >
4484     ## XML5: Same as "anything else".
4485     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4486     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4487     !!!next-input-character;
4488     !!!emit ($self->{ct}); # ATTLIST
4489     redo A;
4490     } elsif ($self->{nc} == -1) {
4491     ## XML5: No parse error.
4492     !!!parse-error (type => 'unclosed md'); ## TODO: type
4493     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4494     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4495     !!!next-input-character;
4496     !!!emit ($self->{ct});
4497     redo A;
4498     } else {
4499     $self->{ca}->{default} .= chr $self->{nc};
4500     ## Stay in the state.
4501     !!!next-input-character;
4502     redo A;
4503     }
4504     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4505     if ($is_space->{$self->{nc}}) {
4506     ## Stay in the state.
4507     !!!next-input-character;
4508     redo A;
4509     } elsif ($self->{nc} == 0x0022) { # "
4510     $self->{ca}->{value} = '';
4511     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4512     !!!next-input-character;
4513     redo A;
4514     } elsif ($self->{nc} == 0x0027) { # '
4515     $self->{ca}->{value} = '';
4516     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4517     !!!next-input-character;
4518     redo A;
4519     } elsif ($self->{nc} == 0x003E) { # >
4520     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4521     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4522     !!!next-input-character;
4523     !!!emit ($self->{ct}); # ATTLIST
4524     redo A;
4525     } elsif ($self->{nc} == -1) {
4526     ## XML5: No parse error.
4527     !!!parse-error (type => 'unclosed md'); ## TODO: type
4528     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4529     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4530     !!!next-input-character;
4531     !!!emit ($self->{ct});
4532     redo A;
4533     } else {
4534     ## XML5: Not defined yet.
4535     if ($self->{ca}->{default} eq 'FIXED') {
4536     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4537     } else {
4538     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4540     }
4541     ## Reconsume.
4542     redo A;
4543     }
4544     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4545     if ($is_space->{$self->{nc}} or
4546     $self->{nc} == -1 or
4547     $self->{nc} == 0x003E) { # >
4548     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4549     ## Reconsume.
4550     redo A;
4551     } else {
4552     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4553     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4554     ## Reconsume.
4555     redo A;
4556 wakaba 1.16 }
4557 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4558     ## ASCII case-insensitive
4559     if ($self->{nc} == [
4560     undef,
4561     0x0044, # D
4562     0x0041, # A
4563     0x0054, # T
4564     ]->[length $self->{kwd}] or
4565     $self->{nc} == [
4566     undef,
4567     0x0064, # d
4568     0x0061, # a
4569     0x0074, # t
4570     ]->[length $self->{kwd}]) {
4571     !!!cp (172.2);
4572     ## Stay in the state.
4573     $self->{kwd} .= chr $self->{nc};
4574     !!!next-input-character;
4575     redo A;
4576     } elsif ((length $self->{kwd}) == 4 and
4577     ($self->{nc} == 0x0041 or # A
4578     $self->{nc} == 0x0061)) { # a
4579     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4580     !!!cp (172.3);
4581     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4582     text => 'NDATA',
4583     line => $self->{line_prev},
4584     column => $self->{column_prev} - 4);
4585     } else {
4586     !!!cp (172.4);
4587     }
4588     $self->{state} = AFTER_NDATA_STATE;
4589     !!!next-input-character;
4590     redo A;
4591     } else {
4592     !!!parse-error (type => 'string after literal', ## TODO: type
4593     line => $self->{line_prev},
4594     column => $self->{column_prev} + 1
4595     - length $self->{kwd});
4596     !!!cp (172.5);
4597     $self->{state} = BOGUS_MD_STATE;
4598     ## Reconsume.
4599     redo A;
4600     }
4601     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4602     if ($is_space->{$self->{nc}}) {
4603     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4604     !!!next-input-character;
4605     redo A;
4606     } elsif ($self->{nc} == 0x003E) { # >
4607     !!!parse-error (type => 'no notation name'); ## TODO: type
4608     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4609     !!!next-input-character;
4610     !!!emit ($self->{ct}); # ENTITY
4611     redo A;
4612     } elsif ($self->{nc} == -1) {
4613     !!!parse-error (type => 'unclosed md'); ## TODO: type
4614     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4615     !!!next-input-character;
4616     !!!emit ($self->{ct}); # ENTITY
4617     redo A;
4618     } else {
4619     !!!parse-error (type => 'string after literal', ## TODO: type
4620     line => $self->{line_prev},
4621     column => $self->{column_prev} + 1
4622     - length $self->{kwd});
4623     $self->{state} = BOGUS_MD_STATE;
4624     ## Reconsume.
4625     redo A;
4626     }
4627     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4628     if ($is_space->{$self->{nc}}) {
4629     ## Stay in the state.
4630     !!!next-input-character;
4631     redo A;
4632     } elsif ($self->{nc} == 0x003E) { # >
4633     !!!parse-error (type => 'no notation name'); ## TODO: type
4634     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4635     !!!next-input-character;
4636     !!!emit ($self->{ct}); # ENTITY
4637     redo A;
4638     } elsif ($self->{nc} == -1) {
4639     !!!parse-error (type => 'unclosed md'); ## TODO: type
4640     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4641     !!!next-input-character;
4642     !!!emit ($self->{ct}); # ENTITY
4643     redo A;
4644     } else {
4645     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4646     $self->{state} = NOTATION_NAME_STATE;
4647     !!!next-input-character;
4648     redo A;
4649     }
4650     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4651     if ($is_space->{$self->{nc}}) {
4652     $self->{state} = AFTER_NOTATION_NAME_STATE;
4653     !!!next-input-character;
4654     redo A;
4655     } elsif ($self->{nc} == 0x003E) { # >
4656     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4657     !!!next-input-character;
4658     !!!emit ($self->{ct}); # ENTITY
4659     redo A;
4660     } elsif ($self->{nc} == -1) {
4661     !!!parse-error (type => 'unclosed md'); ## TODO: type
4662     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4663     !!!next-input-character;
4664     !!!emit ($self->{ct}); # ENTITY
4665     redo A;
4666     } else {
4667     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4668     ## Stay in the state.
4669     !!!next-input-character;
4670     redo A;
4671     }
4672     } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4673     if ($is_space->{$self->{nc}}) {
4674     ## Stay in the state.
4675     !!!next-input-character;
4676     redo A;
4677     } elsif ($self->{nc} == 0x003E) { # >
4678     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4679     !!!next-input-character;
4680     !!!emit ($self->{ct}); # ENTITY
4681     redo A;
4682     } elsif ($self->{nc} == -1) {
4683     !!!parse-error (type => 'unclosed md'); ## TODO: type
4684     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4685     !!!next-input-character;
4686     !!!emit ($self->{ct}); # ENTITY
4687     redo A;
4688     } else {
4689     !!!parse-error (type => 'string after notation name'); ## TODO: type
4690     $self->{state} = BOGUS_MD_STATE;
4691     ## Reconsume.
4692     redo A;
4693     }
4694    
4695 wakaba 1.16
4696     } elsif ($self->{state} == BOGUS_MD_STATE) {
4697     if ($self->{nc} == 0x003E) { # >
4698     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4699     !!!next-input-character;
4700     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4701     redo A;
4702     } elsif ($self->{nc} == -1) {
4703     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4704     ## Reconsume.
4705     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4706     redo A;
4707     } else {
4708     ## Stay in the state.
4709     !!!next-input-character;
4710     redo A;
4711     }
4712 wakaba 1.1 } else {
4713     die "$0: $self->{state}: Unknown state";
4714     }
4715     } # A
4716    
4717     die "$0: _get_next_token: unexpected case";
4718     } # _get_next_token
4719    
4720     1;
4721 wakaba 1.18 ## $Date: 2008/10/19 04:39:25 $
4722 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24