/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (hide annotations) (download) (as text)
Sun Oct 19 10:12:54 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.21: +25 -16 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	19 Oct 2008 10:12:26 -0000
	* XML-Parser.t: "xml/entrefs-2.dat" added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 10:12:39 -0000
	* entrefs-2.dat: New test data file.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 10:11:55 -0000
	* Tokenizer.pm.src: Raise a parse error for '&' that does not
	introduce a reference in XML.  Support for non-ASCII entity
	reference names.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.22 our $VERSION=do{my @r=(q$Revision: 1.21 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Exporter setup.  Every token-type constant defined in this package can
## be imported individually (it is listed in @EXPORT_OK) or all at once
## through the ":token" export tag.  The two lists below carry the same
## names and need to be kept in sync when a token type is added.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric identifiers stored in a token's |->{type}| field.  Each one is
## a sub with an empty prototype, which lets Perl treat it as an inlinable
## constant.  The values are distinct small integers (1..14).
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The three CM_* constants are single-bit flags; the *_CONTENT_MODEL
## constants below are bitwise combinations of them and are the values
## stored in |$self->{content_model}|.  The tokenizer tests individual
## capabilities with |&| (e.g. |$self->{content_model} & CM_ENTITY|).
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
## PLAINTEXT has no flag set: neither "&" nor "<" is treated as markup.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Identifiers for the values of |$self->{state}| (and
## |$self->{prev_state}|).  The numbers are arbitrary but must be unique.
## Values 1 and 12 belonged to states that are no longer separate states;
## their definitions are kept below as commented-out lines, and the note
## further down explains which states replaced them.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
## PCDATA_STATE is not one of the six reference states above; the
## tokenizer handles it like DATA_STATE but only for the PCDATA content
## model.
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
## Tokenizer states used only for XML input: processing instructions,
## the DOCTYPE internal subset, and the markup declarations inside it.
## Numbering continues from the HTML states above and must stay unique.
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## Bit masks shared with the tree constructor.
## NOTE(review): both literals below evaluate to the same value
## (a 1 followed by eleven 0 bits, i.e. 2048); the "_" in the second one
## is only a digit separator.  Confirm against the definitions in
## Whatpm::HTML that the two constants are meant to coincide.
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205    
## Replacement table keyed by code point: a key found here is mapped to
## the code point given as its value.  Presumably consulted when a numeric
## character reference is resolved -- the lookup site is outside this part
## of the file, so confirm there.
##
##   - 0x0D (CR) is mapped to 0x0A (LF).
##   - 0x80..0x9F are mapped to the characters that Windows-1252 assigns
##     to those bytes, or to U+FFFD where Windows-1252 leaves the byte
##     unassigned.
##   - The statement after the hash literal maps the remaining listed
##     control characters, surrogates and noncharacters to U+FFFD.  None
##     of those keys overlaps with the explicit entries above.
206     my $charref_map = {
207     0x0D => 0x000A,
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
## The ISSUE note below records an open question: Unicode defines
## U+FDD0..U+FDEF as noncharacters, but only U+FDD0..U+FDDF is listed here.
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## _initialize_tokenizer ($self)
##
## Resets the tokenizer-related fields of $self to their starting values:
## the state machine goes back to DATA_STATE with the PCDATA content
## model, the current token/attribute and the last start tag name are
## cleared, the character buffer and the queue of pending tokens are
## emptied, and the next-input-character slot is primed.
##
## Fields marked "initialized when used" are deliberately left untouched
## here; the commented-out assignments only document that they exist.
252     sub _initialize_tokenizer ($) {
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing};
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
## -1 is the value the tokenizer compares against to detect end of file.
274     $self->{nc} = -1; # next input character
275     #$self->{next_nc}
## |!!!next-input-character| is a source macro, not plain Perl; presumably
## it is expanded when the .pm is generated from this .src file and loads
## the first input character into |$self->{nc}| -- confirm against the
## macro definition.
276     !!!next-input-character;
## Queue of already-built tokens; |_get_next_token| returns from the front
## of this list before running the state machine.
277     $self->{token} = [];
278     # $self->{escape}
279     } # _initialize_tokenizer
280    
281     ## A token has:
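282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, ABORT_TOKEN, or one of the XML-only tokens (END_OF_DOCTYPE_TOKEN, ATTLIST_TOKEN, ELEMENT_TOKEN, GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, NOTATION_TOKEN)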
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## A token's own |->{self_closing}| is used to save the value of
302     ## |$self->{self_closing}| while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
## Lookup table keyed by code point; the tokenizer tests
## |$is_space->{$self->{nc}}| to decide whether the next input character
## counts as white space.  VT and CR are intentionally absent (their
## entries are kept as commented-out lines).
## NOTE(review): CR is presumably normalized away before characters reach
## the tokenizer -- confirm in the input-reading code.
315     my $is_space = {
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec not explicitly
736     ## states that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951     0x003D => 1, # =
952     }->{$self->{nc}}) {
953     !!!cp (55);
954 wakaba 1.11 ## XML5: Not a parse error.
955 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
956     } else {
957     !!!cp (56);
958 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
959 wakaba 1.1 }
960     $self->{ca}
961     = {name => chr ($self->{nc}),
962     value => '',
963     line => $self->{line}, column => $self->{column}};
964     $self->{state} = ATTRIBUTE_NAME_STATE;
965     !!!next-input-character;
966     redo A;
967     }
968     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 wakaba 1.11 ## XML5: "Tag attribute name state".
970    
971 wakaba 1.1 my $before_leave = sub {
972     if (exists $self->{ct}->{attributes} # start tag or end tag
973     ->{$self->{ca}->{name}}) { # MUST
974     !!!cp (57);
975     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976     ## Discard $self->{ca} # MUST
977     } else {
978     !!!cp (58);
979     $self->{ct}->{attributes}->{$self->{ca}->{name}}
980     = $self->{ca};
981 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 wakaba 1.1 }
983     }; # $before_leave
984    
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (59);
987     $before_leave->();
988     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003D) { # =
992     !!!cp (60);
993     $before_leave->();
994     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995     !!!next-input-character;
996     redo A;
997     } elsif ($self->{nc} == 0x003E) { # >
998 wakaba 1.11 if ($self->{is_xml}) {
999     !!!cp (60.1);
1000     ## XML5: Not a parse error.
1001     !!!parse-error (type => 'no attr value'); ## TODO: type
1002     } else {
1003     !!!cp (60.2);
1004     }
1005    
1006 wakaba 1.1 $before_leave->();
1007     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008     !!!cp (61);
1009     $self->{last_stag_name} = $self->{ct}->{tag_name};
1010     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011     !!!cp (62);
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013     if ($self->{ct}->{attributes}) {
1014     !!!parse-error (type => 'end tag attribute');
1015     }
1016     } else {
1017     die "$0: $self->{ct}->{type}: Unknown token type";
1018     }
1019     $self->{state} = DATA_STATE;
1020 wakaba 1.5 $self->{s_kwd} = '';
1021 wakaba 1.1 !!!next-input-character;
1022    
1023     !!!emit ($self->{ct}); # start tag or end tag
1024    
1025     redo A;
1026     } elsif (0x0041 <= $self->{nc} and
1027     $self->{nc} <= 0x005A) { # A..Z
1028     !!!cp (63);
1029 wakaba 1.4 $self->{ca}->{name}
1030     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 wakaba 1.1 ## Stay in the state
1032     !!!next-input-character;
1033     redo A;
1034     } elsif ($self->{nc} == 0x002F) { # /
1035 wakaba 1.11 if ($self->{is_xml}) {
1036     !!!cp (64);
1037     ## XML5: Not a parse error.
1038     !!!parse-error (type => 'no attr value'); ## TODO: type
1039     } else {
1040     !!!cp (64.1);
1041     }
1042    
1043 wakaba 1.1 $before_leave->();
1044     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045     !!!next-input-character;
1046     redo A;
1047     } elsif ($self->{nc} == -1) {
1048     !!!parse-error (type => 'unclosed tag');
1049     $before_leave->();
1050     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051     !!!cp (66);
1052     $self->{last_stag_name} = $self->{ct}->{tag_name};
1053     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055     if ($self->{ct}->{attributes}) {
1056     !!!cp (67);
1057     !!!parse-error (type => 'end tag attribute');
1058     } else {
1059     ## NOTE: This state should never be reached.
1060     !!!cp (68);
1061     }
1062     } else {
1063     die "$0: $self->{ct}->{type}: Unknown token type";
1064     }
1065     $self->{state} = DATA_STATE;
1066 wakaba 1.5 $self->{s_kwd} = '';
1067 wakaba 1.1 # reconsume
1068    
1069     !!!emit ($self->{ct}); # start tag or end tag
1070    
1071     redo A;
1072     } else {
1073     if ($self->{nc} == 0x0022 or # "
1074     $self->{nc} == 0x0027) { # '
1075     !!!cp (69);
1076 wakaba 1.11 ## XML5: Not a parse error.
1077 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1078     } else {
1079     !!!cp (70);
1080     }
1081     $self->{ca}->{name} .= chr ($self->{nc});
1082     ## Stay in the state
1083     !!!next-input-character;
1084     redo A;
1085     }
1086     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 wakaba 1.11 ## XML5: "Tag attribute name after state".
1088    
1089 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1090     !!!cp (71);
1091     ## Stay in the state
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003D) { # =
1095     !!!cp (72);
1096     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003E) { # >
1100 wakaba 1.11 if ($self->{is_xml}) {
1101     !!!cp (72.1);
1102     ## XML5: Not a parse error.
1103     !!!parse-error (type => 'no attr value'); ## TODO: type
1104     } else {
1105     !!!cp (72.2);
1106     }
1107    
1108 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109     !!!cp (73);
1110     $self->{last_stag_name} = $self->{ct}->{tag_name};
1111     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     !!!cp (74);
1115     !!!parse-error (type => 'end tag attribute');
1116     } else {
1117     ## NOTE: This state should never be reached.
1118     !!!cp (75);
1119     }
1120     } else {
1121     die "$0: $self->{ct}->{type}: Unknown token type";
1122     }
1123     $self->{state} = DATA_STATE;
1124 wakaba 1.5 $self->{s_kwd} = '';
1125 wakaba 1.1 !!!next-input-character;
1126    
1127     !!!emit ($self->{ct}); # start tag or end tag
1128    
1129     redo A;
1130     } elsif (0x0041 <= $self->{nc} and
1131     $self->{nc} <= 0x005A) { # A..Z
1132     !!!cp (76);
1133     $self->{ca}
1134 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 wakaba 1.1 value => '',
1136     line => $self->{line}, column => $self->{column}};
1137     $self->{state} = ATTRIBUTE_NAME_STATE;
1138     !!!next-input-character;
1139     redo A;
1140     } elsif ($self->{nc} == 0x002F) { # /
1141 wakaba 1.11 if ($self->{is_xml}) {
1142     !!!cp (77);
1143     ## XML5: Not a parse error.
1144     !!!parse-error (type => 'no attr value'); ## TODO: type
1145     } else {
1146     !!!cp (77.1);
1147     }
1148    
1149 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150     !!!next-input-character;
1151     redo A;
1152     } elsif ($self->{nc} == -1) {
1153     !!!parse-error (type => 'unclosed tag');
1154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155     !!!cp (79);
1156     $self->{last_stag_name} = $self->{ct}->{tag_name};
1157     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159     if ($self->{ct}->{attributes}) {
1160     !!!cp (80);
1161     !!!parse-error (type => 'end tag attribute');
1162     } else {
1163     ## NOTE: This state should never be reached.
1164     !!!cp (81);
1165     }
1166     } else {
1167     die "$0: $self->{ct}->{type}: Unknown token type";
1168     }
1169 wakaba 1.5 $self->{s_kwd} = '';
1170 wakaba 1.1 $self->{state} = DATA_STATE;
1171     # reconsume
1172    
1173     !!!emit ($self->{ct}); # start tag or end tag
1174    
1175     redo A;
1176     } else {
1177 wakaba 1.11 if ($self->{is_xml}) {
1178     !!!cp (78.1);
1179     ## XML5: Not a parse error.
1180     !!!parse-error (type => 'no attr value'); ## TODO: type
1181     } else {
1182     !!!cp (78.2);
1183     }
1184    
1185 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1186     $self->{nc} == 0x0027) { # '
1187     !!!cp (78);
1188 wakaba 1.11 ## XML5: Not a parse error.
1189 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1190     } else {
1191     !!!cp (82);
1192     }
1193     $self->{ca}
1194     = {name => chr ($self->{nc}),
1195     value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198     !!!next-input-character;
1199     redo A;
1200     }
1201     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 wakaba 1.11 ## XML5: "Tag attribute value before state".
1203    
1204 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1205     !!!cp (83);
1206     ## Stay in the state
1207     !!!next-input-character;
1208     redo A;
1209     } elsif ($self->{nc} == 0x0022) { # "
1210     !!!cp (84);
1211     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212     !!!next-input-character;
1213     redo A;
1214     } elsif ($self->{nc} == 0x0026) { # &
1215     !!!cp (85);
1216     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217     ## reconsume
1218     redo A;
1219     } elsif ($self->{nc} == 0x0027) { # '
1220     !!!cp (86);
1221     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222     !!!next-input-character;
1223     redo A;
1224     } elsif ($self->{nc} == 0x003E) { # >
1225     !!!parse-error (type => 'empty unquoted attribute value');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227     !!!cp (87);
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232     !!!cp (88);
1233     !!!parse-error (type => 'end tag attribute');
1234     } else {
1235     ## NOTE: This state should never be reached.
1236     !!!cp (89);
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 !!!next-input-character;
1244    
1245     !!!emit ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } elsif ($self->{nc} == -1) {
1249     !!!parse-error (type => 'unclosed tag');
1250     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251     !!!cp (90);
1252     $self->{last_stag_name} = $self->{ct}->{tag_name};
1253     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255     if ($self->{ct}->{attributes}) {
1256     !!!cp (91);
1257     !!!parse-error (type => 'end tag attribute');
1258     } else {
1259     ## NOTE: This state should never be reached.
1260     !!!cp (92);
1261     }
1262     } else {
1263     die "$0: $self->{ct}->{type}: Unknown token type";
1264     }
1265     $self->{state} = DATA_STATE;
1266 wakaba 1.5 $self->{s_kwd} = '';
1267 wakaba 1.1 ## reconsume
1268    
1269     !!!emit ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } else {
1273     if ($self->{nc} == 0x003D) { # =
1274     !!!cp (93);
1275 wakaba 1.11 ## XML5: Not a parse error.
1276 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1277 wakaba 1.11 } elsif ($self->{is_xml}) {
1278     !!!cp (93.1);
1279     ## XML5: No parse error.
1280     !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 wakaba 1.1 } else {
1282     !!!cp (94);
1283     }
1284     $self->{ca}->{value} .= chr ($self->{nc});
1285     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286     !!!next-input-character;
1287     redo A;
1288     }
1289     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291     ## ATTLIST attribute value double quoted state".
1292 wakaba 1.11
1293 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1294 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295     !!!cp (95.1);
1296     ## XML5: "DOCTYPE ATTLIST name after state".
1297     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299     } else {
1300     !!!cp (95);
1301     ## XML5: "Tag attribute name before state".
1302     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303     }
1304 wakaba 1.1 !!!next-input-character;
1305     redo A;
1306     } elsif ($self->{nc} == 0x0026) { # &
1307     !!!cp (96);
1308 wakaba 1.11 ## XML5: Not defined yet.
1309    
1310 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1311     ## "entity in attribute value state". In this implementation, the
1312     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313     ## implementation of the "consume a character reference" algorithm.
1314     $self->{prev_state} = $self->{state};
1315     $self->{entity_add} = 0x0022; # "
1316     $self->{state} = ENTITY_STATE;
1317     !!!next-input-character;
1318     redo A;
1319     } elsif ($self->{nc} == -1) {
1320     !!!parse-error (type => 'unclosed attribute value');
1321     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322     !!!cp (97);
1323     $self->{last_stag_name} = $self->{ct}->{tag_name};
1324 wakaba 1.15
1325     $self->{state} = DATA_STATE;
1326     $self->{s_kwd} = '';
1327     ## reconsume
1328     !!!emit ($self->{ct}); # start tag
1329     redo A;
1330 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1331     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1332     if ($self->{ct}->{attributes}) {
1333     !!!cp (98);
1334     !!!parse-error (type => 'end tag attribute');
1335     } else {
1336     ## NOTE: This state should never be reached.
1337     !!!cp (99);
1338     }
1339 wakaba 1.15
1340     $self->{state} = DATA_STATE;
1341     $self->{s_kwd} = '';
1342     ## reconsume
1343     !!!emit ($self->{ct}); # end tag
1344     redo A;
1345     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1346     ## XML5: No parse error above; not defined yet.
1347     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1348     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1349     ## Reconsume.
1350     !!!emit ($self->{ct}); # ATTLIST
1351     redo A;
1352 wakaba 1.1 } else {
1353     die "$0: $self->{ct}->{type}: Unknown token type";
1354     }
1355     } else {
1356 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1357 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1358     !!!cp (100);
1359     ## XML5: Not a parse error.
1360     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1361     } else {
1362     !!!cp (100.1);
1363     }
1364 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1365     $self->{read_until}->($self->{ca}->{value},
1366 wakaba 1.11 q["&<],
1367 wakaba 1.1 length $self->{ca}->{value});
1368    
1369     ## Stay in the state
1370     !!!next-input-character;
1371     redo A;
1372     }
1373     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1374 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1375     ## ATTLIST attribute value single quoted state".
1376 wakaba 1.11
1377 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1378 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1379     !!!cp (101.1);
1380     ## XML5: "DOCTYPE ATTLIST name after state".
1381     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1382     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1383     } else {
1384     !!!cp (101);
1385     ## XML5: "Before attribute name state" (sic).
1386     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1387     }
1388 wakaba 1.1 !!!next-input-character;
1389     redo A;
1390     } elsif ($self->{nc} == 0x0026) { # &
1391     !!!cp (102);
1392 wakaba 1.11 ## XML5: Not defined yet.
1393    
1394 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1395     ## "entity in attribute value state". In this implementation, the
1396     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1397     ## implementation of the "consume a character reference" algorithm.
1398     $self->{entity_add} = 0x0027; # '
1399     $self->{prev_state} = $self->{state};
1400     $self->{state} = ENTITY_STATE;
1401     !!!next-input-character;
1402     redo A;
1403     } elsif ($self->{nc} == -1) {
1404     !!!parse-error (type => 'unclosed attribute value');
1405     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1406     !!!cp (103);
1407     $self->{last_stag_name} = $self->{ct}->{tag_name};
1408 wakaba 1.15
1409     $self->{state} = DATA_STATE;
1410     $self->{s_kwd} = '';
1411     ## reconsume
1412     !!!emit ($self->{ct}); # start tag
1413     redo A;
1414 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1415     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1416     if ($self->{ct}->{attributes}) {
1417     !!!cp (104);
1418     !!!parse-error (type => 'end tag attribute');
1419     } else {
1420     ## NOTE: This state should never be reached.
1421     !!!cp (105);
1422     }
1423 wakaba 1.15
1424     $self->{state} = DATA_STATE;
1425     $self->{s_kwd} = '';
1426     ## reconsume
1427     !!!emit ($self->{ct}); # end tag
1428     redo A;
1429     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1430     ## XML5: No parse error above; not defined yet.
1431     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1432     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1433     ## Reconsume.
1434     !!!emit ($self->{ct}); # ATTLIST
1435     redo A;
1436 wakaba 1.1 } else {
1437     die "$0: $self->{ct}->{type}: Unknown token type";
1438     }
1439     } else {
1440 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1441 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1442     !!!cp (106);
1443     ## XML5: Not a parse error.
1444     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1445     } else {
1446     !!!cp (106.1);
1447     }
1448 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1449     $self->{read_until}->($self->{ca}->{value},
1450 wakaba 1.11 q['&<],
1451 wakaba 1.1 length $self->{ca}->{value});
1452    
1453     ## Stay in the state
1454     !!!next-input-character;
1455     redo A;
1456     }
1457     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1458 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1459    
1460 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1461 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1462     !!!cp (107.1);
1463     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1464     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1465     } else {
1466     !!!cp (107);
1467     ## XML5: "Tag attribute name before state".
1468     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1469     }
1470 wakaba 1.1 !!!next-input-character;
1471     redo A;
1472     } elsif ($self->{nc} == 0x0026) { # &
1473     !!!cp (108);
1474 wakaba 1.11
1475     ## XML5: Not defined yet.
1476    
1477 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1478     ## "entity in attribute value state". In this implementation, the
1479     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1480     ## implementation of the "consume a character reference" algorithm.
1481     $self->{entity_add} = -1;
1482     $self->{prev_state} = $self->{state};
1483     $self->{state} = ENTITY_STATE;
1484     !!!next-input-character;
1485     redo A;
1486     } elsif ($self->{nc} == 0x003E) { # >
1487     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1488     !!!cp (109);
1489     $self->{last_stag_name} = $self->{ct}->{tag_name};
1490 wakaba 1.15
1491     $self->{state} = DATA_STATE;
1492     $self->{s_kwd} = '';
1493     !!!next-input-character;
1494     !!!emit ($self->{ct}); # start tag
1495     redo A;
1496 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1497     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1498     if ($self->{ct}->{attributes}) {
1499     !!!cp (110);
1500     !!!parse-error (type => 'end tag attribute');
1501     } else {
1502     ## NOTE: This state should never be reached.
1503     !!!cp (111);
1504     }
1505 wakaba 1.15
1506     $self->{state} = DATA_STATE;
1507     $self->{s_kwd} = '';
1508     !!!next-input-character;
1509     !!!emit ($self->{ct}); # end tag
1510     redo A;
1511     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1512     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1513     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1514     !!!next-input-character;
1515     !!!emit ($self->{ct}); # ATTLIST
1516     redo A;
1517 wakaba 1.1 } else {
1518     die "$0: $self->{ct}->{type}: Unknown token type";
1519     }
1520     } elsif ($self->{nc} == -1) {
1521     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1522     !!!cp (112);
1523 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1524 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1525 wakaba 1.15
1526     $self->{state} = DATA_STATE;
1527     $self->{s_kwd} = '';
1528     ## reconsume
1529     !!!emit ($self->{ct}); # start tag
1530     redo A;
1531 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1532 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1533 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1534     if ($self->{ct}->{attributes}) {
1535     !!!cp (113);
1536     !!!parse-error (type => 'end tag attribute');
1537     } else {
1538     ## NOTE: This state should never be reached.
1539     !!!cp (114);
1540     }
1541 wakaba 1.15
1542     $self->{state} = DATA_STATE;
1543     $self->{s_kwd} = '';
1544     ## reconsume
1545     !!!emit ($self->{ct}); # end tag
1546     redo A;
1547     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1548     !!!parse-error (type => 'unclosed md'); ## TODO: type
1549     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1550     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1551     ## Reconsume.
1552     !!!emit ($self->{ct}); # ATTLIST
1553     redo A;
1554 wakaba 1.1 } else {
1555     die "$0: $self->{ct}->{type}: Unknown token type";
1556     }
1557     } else {
1558     if ({
1559     0x0022 => 1, # "
1560     0x0027 => 1, # '
1561     0x003D => 1, # =
1562     }->{$self->{nc}}) {
1563     !!!cp (115);
1564 wakaba 1.11 ## XML5: Not a parse error.
1565 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1566     } else {
1567     !!!cp (116);
1568     }
1569     $self->{ca}->{value} .= chr ($self->{nc});
1570     $self->{read_until}->($self->{ca}->{value},
1571     q["'=& >],
1572     length $self->{ca}->{value});
1573    
1574     ## Stay in the state
1575     !!!next-input-character;
1576     redo A;
1577     }
1578     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1579     if ($is_space->{$self->{nc}}) {
1580     !!!cp (118);
1581     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1582     !!!next-input-character;
1583     redo A;
1584     } elsif ($self->{nc} == 0x003E) { # >
1585     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1586     !!!cp (119);
1587     $self->{last_stag_name} = $self->{ct}->{tag_name};
1588     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1589     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1590     if ($self->{ct}->{attributes}) {
1591     !!!cp (120);
1592     !!!parse-error (type => 'end tag attribute');
1593     } else {
1594     ## NOTE: This state should never be reached.
1595     !!!cp (121);
1596     }
1597     } else {
1598     die "$0: $self->{ct}->{type}: Unknown token type";
1599     }
1600     $self->{state} = DATA_STATE;
1601 wakaba 1.5 $self->{s_kwd} = '';
1602 wakaba 1.1 !!!next-input-character;
1603    
1604     !!!emit ($self->{ct}); # start tag or end tag
1605    
1606     redo A;
1607     } elsif ($self->{nc} == 0x002F) { # /
1608     !!!cp (122);
1609     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1610     !!!next-input-character;
1611     redo A;
1612     } elsif ($self->{nc} == -1) {
1613     !!!parse-error (type => 'unclosed tag');
1614     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1615     !!!cp (122.3);
1616     $self->{last_stag_name} = $self->{ct}->{tag_name};
1617     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1618     if ($self->{ct}->{attributes}) {
1619     !!!cp (122.1);
1620     !!!parse-error (type => 'end tag attribute');
1621     } else {
1622     ## NOTE: This state should never be reached.
1623     !!!cp (122.2);
1624     }
1625     } else {
1626     die "$0: $self->{ct}->{type}: Unknown token type";
1627     }
1628     $self->{state} = DATA_STATE;
1629 wakaba 1.5 $self->{s_kwd} = '';
1630 wakaba 1.1 ## Reconsume.
1631     !!!emit ($self->{ct}); # start tag or end tag
1632     redo A;
1633     } else {
1634     !!!cp ('124.1');
1635     !!!parse-error (type => 'no space between attributes');
1636     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1637     ## reconsume
1638     redo A;
1639     }
1640     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1641 wakaba 1.11 ## XML5: "Empty tag state".
1642    
1643 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1644     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1645     !!!cp ('124.2');
1646     !!!parse-error (type => 'nestc', token => $self->{ct});
1647     ## TODO: Different type than slash in start tag
1648     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1649     if ($self->{ct}->{attributes}) {
1650     !!!cp ('124.4');
1651     !!!parse-error (type => 'end tag attribute');
1652     } else {
1653     !!!cp ('124.5');
1654     }
1655     ## TODO: Test |<title></title/>|
1656     } else {
1657     !!!cp ('124.3');
1658     $self->{self_closing} = 1;
1659     }
1660    
1661     $self->{state} = DATA_STATE;
1662 wakaba 1.5 $self->{s_kwd} = '';
1663 wakaba 1.1 !!!next-input-character;
1664    
1665     !!!emit ($self->{ct}); # start tag or end tag
1666    
1667     redo A;
1668     } elsif ($self->{nc} == -1) {
1669     !!!parse-error (type => 'unclosed tag');
1670     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1671     !!!cp (124.7);
1672     $self->{last_stag_name} = $self->{ct}->{tag_name};
1673     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1674     if ($self->{ct}->{attributes}) {
1675     !!!cp (124.5);
1676     !!!parse-error (type => 'end tag attribute');
1677     } else {
1678     ## NOTE: This state should never be reached.
1679     !!!cp (124.6);
1680     }
1681     } else {
1682     die "$0: $self->{ct}->{type}: Unknown token type";
1683     }
1684 wakaba 1.11 ## XML5: "Tag attribute name before state".
1685 wakaba 1.1 $self->{state} = DATA_STATE;
1686 wakaba 1.5 $self->{s_kwd} = '';
1687 wakaba 1.1 ## Reconsume.
1688     !!!emit ($self->{ct}); # start tag or end tag
1689     redo A;
1690     } else {
1691     !!!cp ('124.4');
1692     !!!parse-error (type => 'nestc');
1693     ## TODO: This error type is wrong.
1694     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1695     ## Reconsume.
1696     redo A;
1697     }
1698     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1699 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1700    
1701 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1702     ## consumes characters on a one-by-one basis.
1703    
1704     if ($self->{nc} == 0x003E) { # >
1705 wakaba 1.13 if ($self->{in_subset}) {
1706     !!!cp (123);
1707     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1708     } else {
1709     !!!cp (124);
1710     $self->{state} = DATA_STATE;
1711     $self->{s_kwd} = '';
1712     }
1713 wakaba 1.1 !!!next-input-character;
1714    
1715     !!!emit ($self->{ct}); # comment
1716     redo A;
1717     } elsif ($self->{nc} == -1) {
1718 wakaba 1.13 if ($self->{in_subset}) {
1719     !!!cp (125.1);
1720     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1721     } else {
1722     !!!cp (125);
1723     $self->{state} = DATA_STATE;
1724     $self->{s_kwd} = '';
1725     }
1726 wakaba 1.1 ## reconsume
1727    
1728     !!!emit ($self->{ct}); # comment
1729     redo A;
1730     } else {
1731     !!!cp (126);
1732     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1733     $self->{read_until}->($self->{ct}->{data},
1734     q[>],
1735     length $self->{ct}->{data});
1736    
1737     ## Stay in the state.
1738     !!!next-input-character;
1739     redo A;
1740     }
1741     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1742 wakaba 1.14 ## XML5: "Markup declaration state".
1743 wakaba 1.1
1744     if ($self->{nc} == 0x002D) { # -
1745     !!!cp (133);
1746     $self->{state} = MD_HYPHEN_STATE;
1747     !!!next-input-character;
1748     redo A;
1749     } elsif ($self->{nc} == 0x0044 or # D
1750     $self->{nc} == 0x0064) { # d
1751     ## ASCII case-insensitive.
1752     !!!cp (130);
1753     $self->{state} = MD_DOCTYPE_STATE;
1754 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1755 wakaba 1.1 !!!next-input-character;
1756     redo A;
1757 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1758     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1759     $self->{is_xml}) and
1760 wakaba 1.1 $self->{nc} == 0x005B) { # [
1761     !!!cp (135.4);
1762     $self->{state} = MD_CDATA_STATE;
1763 wakaba 1.12 $self->{kwd} = '[';
1764 wakaba 1.1 !!!next-input-character;
1765     redo A;
1766     } else {
1767     !!!cp (136);
1768     }
1769    
1770     !!!parse-error (type => 'bogus comment',
1771     line => $self->{line_prev},
1772     column => $self->{column_prev} - 1);
1773     ## Reconsume.
1774     $self->{state} = BOGUS_COMMENT_STATE;
1775     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1776     line => $self->{line_prev},
1777     column => $self->{column_prev} - 1,
1778     };
1779     redo A;
1780     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1781     if ($self->{nc} == 0x002D) { # -
1782     !!!cp (127);
1783     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1784     line => $self->{line_prev},
1785     column => $self->{column_prev} - 2,
1786     };
1787 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1788 wakaba 1.1 !!!next-input-character;
1789     redo A;
1790     } else {
1791     !!!cp (128);
1792     !!!parse-error (type => 'bogus comment',
1793     line => $self->{line_prev},
1794     column => $self->{column_prev} - 2);
1795     $self->{state} = BOGUS_COMMENT_STATE;
1796     ## Reconsume.
1797     $self->{ct} = {type => COMMENT_TOKEN,
1798     data => '-',
1799     line => $self->{line_prev},
1800     column => $self->{column_prev} - 2,
1801     };
1802     redo A;
1803     }
1804     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1805     ## ASCII case-insensitive.
1806     if ($self->{nc} == [
1807     undef,
1808     0x004F, # O
1809     0x0043, # C
1810     0x0054, # T
1811     0x0059, # Y
1812     0x0050, # P
1813 wakaba 1.12 ]->[length $self->{kwd}] or
1814 wakaba 1.1 $self->{nc} == [
1815     undef,
1816     0x006F, # o
1817     0x0063, # c
1818     0x0074, # t
1819     0x0079, # y
1820     0x0070, # p
1821 wakaba 1.12 ]->[length $self->{kwd}]) {
1822 wakaba 1.1 !!!cp (131);
1823     ## Stay in the state.
1824 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1825 wakaba 1.1 !!!next-input-character;
1826     redo A;
1827 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1828 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1829     $self->{nc} == 0x0065)) { # e
1830 wakaba 1.12 if ($self->{is_xml} and
1831     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1832 wakaba 1.10 !!!cp (129);
1833     ## XML5: case-sensitive.
1834     !!!parse-error (type => 'lowercase keyword', ## TODO
1835     text => 'DOCTYPE',
1836     line => $self->{line_prev},
1837     column => $self->{column_prev} - 5);
1838     } else {
1839     !!!cp (129.1);
1840     }
1841 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1842     $self->{ct} = {type => DOCTYPE_TOKEN,
1843     quirks => 1,
1844     line => $self->{line_prev},
1845     column => $self->{column_prev} - 7,
1846     };
1847     !!!next-input-character;
1848     redo A;
1849     } else {
1850     !!!cp (132);
1851     !!!parse-error (type => 'bogus comment',
1852     line => $self->{line_prev},
1853 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1854 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1855     ## Reconsume.
1856     $self->{ct} = {type => COMMENT_TOKEN,
1857 wakaba 1.12 data => $self->{kwd},
1858 wakaba 1.1 line => $self->{line_prev},
1859 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1860 wakaba 1.1 };
1861     redo A;
1862     }
1863     } elsif ($self->{state} == MD_CDATA_STATE) {
1864     if ($self->{nc} == {
1865     '[' => 0x0043, # C
1866     '[C' => 0x0044, # D
1867     '[CD' => 0x0041, # A
1868     '[CDA' => 0x0054, # T
1869     '[CDAT' => 0x0041, # A
1870 wakaba 1.12 }->{$self->{kwd}}) {
1871 wakaba 1.1 !!!cp (135.1);
1872     ## Stay in the state.
1873 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1874 wakaba 1.1 !!!next-input-character;
1875     redo A;
1876 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1877 wakaba 1.1 $self->{nc} == 0x005B) { # [
1878 wakaba 1.6 if ($self->{is_xml} and
1879     not $self->{tainted} and
1880     @{$self->{open_elements} or []} == 0) {
1881 wakaba 1.8 !!!cp (135.2);
1882 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1883     line => $self->{line_prev},
1884     column => $self->{column_prev} - 7);
1885     $self->{tainted} = 1;
1886 wakaba 1.8 } else {
1887     !!!cp (135.21);
1888 wakaba 1.6 }
1889    
1890 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1891     data => '',
1892     line => $self->{line_prev},
1893     column => $self->{column_prev} - 7};
1894     $self->{state} = CDATA_SECTION_STATE;
1895     !!!next-input-character;
1896     redo A;
1897     } else {
1898     !!!cp (135.3);
1899     !!!parse-error (type => 'bogus comment',
1900     line => $self->{line_prev},
1901 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1902 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1903     ## Reconsume.
1904     $self->{ct} = {type => COMMENT_TOKEN,
1905 wakaba 1.12 data => $self->{kwd},
1906 wakaba 1.1 line => $self->{line_prev},
1907 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1908 wakaba 1.1 };
1909     redo A;
1910     }
1911     } elsif ($self->{state} == COMMENT_START_STATE) {
1912     if ($self->{nc} == 0x002D) { # -
1913     !!!cp (137);
1914     $self->{state} = COMMENT_START_DASH_STATE;
1915     !!!next-input-character;
1916     redo A;
1917     } elsif ($self->{nc} == 0x003E) { # >
1918     !!!parse-error (type => 'bogus comment');
1919 wakaba 1.13 if ($self->{in_subset}) {
1920     !!!cp (138.1);
1921     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1922     } else {
1923     !!!cp (138);
1924     $self->{state} = DATA_STATE;
1925     $self->{s_kwd} = '';
1926     }
1927 wakaba 1.1 !!!next-input-character;
1928    
1929     !!!emit ($self->{ct}); # comment
1930    
1931     redo A;
1932     } elsif ($self->{nc} == -1) {
1933     !!!parse-error (type => 'unclosed comment');
1934 wakaba 1.13 if ($self->{in_subset}) {
1935     !!!cp (139.1);
1936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937     } else {
1938     !!!cp (139);
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     }
1942 wakaba 1.1 ## reconsume
1943    
1944     !!!emit ($self->{ct}); # comment
1945    
1946     redo A;
1947     } else {
1948     !!!cp (140);
1949     $self->{ct}->{data} # comment
1950     .= chr ($self->{nc});
1951     $self->{state} = COMMENT_STATE;
1952     !!!next-input-character;
1953     redo A;
1954     }
1955     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1956     if ($self->{nc} == 0x002D) { # -
1957     !!!cp (141);
1958     $self->{state} = COMMENT_END_STATE;
1959     !!!next-input-character;
1960     redo A;
1961     } elsif ($self->{nc} == 0x003E) { # >
1962     !!!parse-error (type => 'bogus comment');
1963 wakaba 1.13 if ($self->{in_subset}) {
1964     !!!cp (142.1);
1965     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1966     } else {
1967     !!!cp (142);
1968     $self->{state} = DATA_STATE;
1969     $self->{s_kwd} = '';
1970     }
1971 wakaba 1.1 !!!next-input-character;
1972    
1973     !!!emit ($self->{ct}); # comment
1974    
1975     redo A;
1976     } elsif ($self->{nc} == -1) {
1977     !!!parse-error (type => 'unclosed comment');
1978 wakaba 1.13 if ($self->{in_subset}) {
1979     !!!cp (143.1);
1980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981     } else {
1982     !!!cp (143);
1983     $self->{state} = DATA_STATE;
1984     $self->{s_kwd} = '';
1985     }
1986 wakaba 1.1 ## reconsume
1987    
1988     !!!emit ($self->{ct}); # comment
1989    
1990     redo A;
1991     } else {
1992     !!!cp (144);
1993     $self->{ct}->{data} # comment
1994     .= '-' . chr ($self->{nc});
1995     $self->{state} = COMMENT_STATE;
1996     !!!next-input-character;
1997     redo A;
1998     }
1999     } elsif ($self->{state} == COMMENT_STATE) {
2000 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2001    
2002 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2003     !!!cp (145);
2004     $self->{state} = COMMENT_END_DASH_STATE;
2005     !!!next-input-character;
2006     redo A;
2007     } elsif ($self->{nc} == -1) {
2008     !!!parse-error (type => 'unclosed comment');
2009 wakaba 1.13 if ($self->{in_subset}) {
2010     !!!cp (146.1);
2011     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2012     } else {
2013     !!!cp (146);
2014     $self->{state} = DATA_STATE;
2015     $self->{s_kwd} = '';
2016     }
2017 wakaba 1.1 ## reconsume
2018    
2019     !!!emit ($self->{ct}); # comment
2020    
2021     redo A;
2022     } else {
2023     !!!cp (147);
2024     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2025     $self->{read_until}->($self->{ct}->{data},
2026     q[-],
2027     length $self->{ct}->{data});
2028    
2029     ## Stay in the state
2030     !!!next-input-character;
2031     redo A;
2032     }
2033     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2034 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2035 wakaba 1.10
2036 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2037     !!!cp (148);
2038     $self->{state} = COMMENT_END_STATE;
2039     !!!next-input-character;
2040     redo A;
2041     } elsif ($self->{nc} == -1) {
2042     !!!parse-error (type => 'unclosed comment');
2043 wakaba 1.13 if ($self->{in_subset}) {
2044     !!!cp (149.1);
2045     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2046     } else {
2047     !!!cp (149);
2048     $self->{state} = DATA_STATE;
2049     $self->{s_kwd} = '';
2050     }
2051 wakaba 1.1 ## reconsume
2052    
2053     !!!emit ($self->{ct}); # comment
2054    
2055     redo A;
2056     } else {
2057     !!!cp (150);
2058     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2059     $self->{state} = COMMENT_STATE;
2060     !!!next-input-character;
2061     redo A;
2062     }
2063     } elsif ($self->{state} == COMMENT_END_STATE) {
2064 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2065    
2066 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2067 wakaba 1.13 if ($self->{in_subset}) {
2068     !!!cp (151.1);
2069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2070     } else {
2071     !!!cp (151);
2072     $self->{state} = DATA_STATE;
2073     $self->{s_kwd} = '';
2074     }
2075 wakaba 1.1 !!!next-input-character;
2076    
2077     !!!emit ($self->{ct}); # comment
2078    
2079     redo A;
2080     } elsif ($self->{nc} == 0x002D) { # -
2081     !!!cp (152);
2082 wakaba 1.10 ## XML5: Not a parse error.
2083 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2084     line => $self->{line_prev},
2085     column => $self->{column_prev});
2086     $self->{ct}->{data} .= '-'; # comment
2087     ## Stay in the state
2088     !!!next-input-character;
2089     redo A;
2090     } elsif ($self->{nc} == -1) {
2091     !!!parse-error (type => 'unclosed comment');
2092 wakaba 1.13 if ($self->{in_subset}) {
2093     !!!cp (153.1);
2094     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2095     } else {
2096     !!!cp (153);
2097     $self->{state} = DATA_STATE;
2098     $self->{s_kwd} = '';
2099     }
2100 wakaba 1.1 ## reconsume
2101    
2102     !!!emit ($self->{ct}); # comment
2103    
2104     redo A;
2105     } else {
2106     !!!cp (154);
2107 wakaba 1.10 ## XML5: Not a parse error.
2108 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2109     line => $self->{line_prev},
2110     column => $self->{column_prev});
2111     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2112     $self->{state} = COMMENT_STATE;
2113     !!!next-input-character;
2114     redo A;
2115     }
2116     } elsif ($self->{state} == DOCTYPE_STATE) {
2117     if ($is_space->{$self->{nc}}) {
2118     !!!cp (155);
2119     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2120     !!!next-input-character;
2121     redo A;
2122     } else {
2123     !!!cp (156);
2124 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2125 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2126     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2127     ## reconsume
2128     redo A;
2129     }
2130     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2131 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2132    
2133 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2134     !!!cp (157);
2135     ## Stay in the state
2136     !!!next-input-character;
2137     redo A;
2138     } elsif ($self->{nc} == 0x003E) { # >
2139     !!!cp (158);
2140 wakaba 1.12 ## XML5: No parse error.
2141 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2142     $self->{state} = DATA_STATE;
2143 wakaba 1.5 $self->{s_kwd} = '';
2144 wakaba 1.1 !!!next-input-character;
2145    
2146     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2147    
2148     redo A;
2149     } elsif ($self->{nc} == -1) {
2150     !!!cp (159);
2151     !!!parse-error (type => 'no DOCTYPE name');
2152     $self->{state} = DATA_STATE;
2153 wakaba 1.5 $self->{s_kwd} = '';
2154 wakaba 1.1 ## reconsume
2155    
2156     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2157    
2158     redo A;
2159 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2160     !!!cp (159.1);
2161     !!!parse-error (type => 'no DOCTYPE name');
2162     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2163 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2164     $self->{in_subset} = 1;
2165 wakaba 1.12 !!!next-input-character;
2166 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2167 wakaba 1.12 redo A;
2168 wakaba 1.1 } else {
2169     !!!cp (160);
2170     $self->{ct}->{name} = chr $self->{nc};
2171     delete $self->{ct}->{quirks};
2172     $self->{state} = DOCTYPE_NAME_STATE;
2173     !!!next-input-character;
2174     redo A;
2175     }
2176     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2177 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2178    
2179     ## ISSUE: Redundant "First," in the spec.
2180    
2181 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2182     !!!cp (161);
2183     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2184     !!!next-input-character;
2185     redo A;
2186     } elsif ($self->{nc} == 0x003E) { # >
2187     !!!cp (162);
2188     $self->{state} = DATA_STATE;
2189 wakaba 1.5 $self->{s_kwd} = '';
2190 wakaba 1.1 !!!next-input-character;
2191    
2192     !!!emit ($self->{ct}); # DOCTYPE
2193    
2194     redo A;
2195     } elsif ($self->{nc} == -1) {
2196     !!!cp (163);
2197     !!!parse-error (type => 'unclosed DOCTYPE');
2198     $self->{state} = DATA_STATE;
2199 wakaba 1.5 $self->{s_kwd} = '';
2200 wakaba 1.1 ## reconsume
2201    
2202     $self->{ct}->{quirks} = 1;
2203     !!!emit ($self->{ct}); # DOCTYPE
2204    
2205     redo A;
2206 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2207     !!!cp (163.1);
2208     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2209 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2210     $self->{in_subset} = 1;
2211 wakaba 1.12 !!!next-input-character;
2212 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2213 wakaba 1.12 redo A;
2214 wakaba 1.1 } else {
2215     !!!cp (164);
2216     $self->{ct}->{name}
2217     .= chr ($self->{nc}); # DOCTYPE
2218     ## Stay in the state
2219     !!!next-input-character;
2220     redo A;
2221     }
2222     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2223 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2224     ## state", but implemented differently.
2225    
2226 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2227     !!!cp (165);
2228     ## Stay in the state
2229     !!!next-input-character;
2230     redo A;
2231     } elsif ($self->{nc} == 0x003E) { # >
2232 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2233     !!!cp (166);
2234     $self->{state} = DATA_STATE;
2235     $self->{s_kwd} = '';
2236     } else {
2237     !!!cp (166.1);
2238     !!!parse-error (type => 'no md def'); ## TODO: type
2239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2240     }
2241    
2242 wakaba 1.1 !!!next-input-character;
2243 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2244 wakaba 1.1 redo A;
2245     } elsif ($self->{nc} == -1) {
2246 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2247     !!!cp (167);
2248     !!!parse-error (type => 'unclosed DOCTYPE');
2249     $self->{state} = DATA_STATE;
2250     $self->{s_kwd} = '';
2251     $self->{ct}->{quirks} = 1;
2252     } else {
2253     !!!cp (167.12);
2254     !!!parse-error (type => 'unclosed md'); ## TODO: type
2255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2256     }
2257    
2258     ## Reconsume.
2259     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2260 wakaba 1.1 redo A;
2261     } elsif ($self->{nc} == 0x0050 or # P
2262     $self->{nc} == 0x0070) { # p
2263 wakaba 1.12 !!!cp (167.1);
2264 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2265 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2266 wakaba 1.1 !!!next-input-character;
2267     redo A;
2268     } elsif ($self->{nc} == 0x0053 or # S
2269     $self->{nc} == 0x0073) { # s
2270 wakaba 1.12 !!!cp (167.2);
2271 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2272 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2273     !!!next-input-character;
2274     redo A;
2275 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2276     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2277     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2278     !!!cp (167.21);
2279     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2280     $self->{ct}->{value} = ''; # ENTITY
2281     !!!next-input-character;
2282     redo A;
2283     } elsif ($self->{nc} == 0x0027 and # '
2284     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2285     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2286     !!!cp (167.22);
2287     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2288     $self->{ct}->{value} = ''; # ENTITY
2289     !!!next-input-character;
2290     redo A;
2291 wakaba 1.16 } elsif ($self->{is_xml} and
2292     $self->{ct}->{type} == DOCTYPE_TOKEN and
2293     $self->{nc} == 0x005B) { # [
2294 wakaba 1.12 !!!cp (167.3);
2295     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2296     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2297 wakaba 1.13 $self->{in_subset} = 1;
2298 wakaba 1.1 !!!next-input-character;
2299 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2300 wakaba 1.1 redo A;
2301     } else {
2302 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2303    
2304     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2305     !!!cp (180);
2306     $self->{ct}->{quirks} = 1;
2307     $self->{state} = BOGUS_DOCTYPE_STATE;
2308     } else {
2309     !!!cp (180.1);
2310     $self->{state} = BOGUS_MD_STATE;
2311     }
2312 wakaba 1.1
2313     !!!next-input-character;
2314     redo A;
2315     }
2316     } elsif ($self->{state} == PUBLIC_STATE) {
2317     ## ASCII case-insensitive
2318     if ($self->{nc} == [
2319     undef,
2320     0x0055, # U
2321     0x0042, # B
2322     0x004C, # L
2323     0x0049, # I
2324 wakaba 1.12 ]->[length $self->{kwd}] or
2325 wakaba 1.1 $self->{nc} == [
2326     undef,
2327     0x0075, # u
2328     0x0062, # b
2329     0x006C, # l
2330     0x0069, # i
2331 wakaba 1.12 ]->[length $self->{kwd}]) {
2332 wakaba 1.1 !!!cp (175);
2333     ## Stay in the state.
2334 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2335 wakaba 1.1 !!!next-input-character;
2336     redo A;
2337 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2338 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2339     $self->{nc} == 0x0063)) { # c
2340 wakaba 1.12 if ($self->{is_xml} and
2341     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2342     !!!cp (168.1);
2343     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2344     text => 'PUBLIC',
2345     line => $self->{line_prev},
2346     column => $self->{column_prev} - 4);
2347     } else {
2348     !!!cp (168);
2349     }
2350 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2351     !!!next-input-character;
2352     redo A;
2353     } else {
2354 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2355 wakaba 1.1 line => $self->{line_prev},
2356 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2357 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2358     !!!cp (169);
2359     $self->{ct}->{quirks} = 1;
2360     $self->{state} = BOGUS_DOCTYPE_STATE;
2361     } else {
2362     !!!cp (169.1);
2363     $self->{state} = BOGUS_MD_STATE;
2364     }
2365 wakaba 1.1 ## Reconsume.
2366     redo A;
2367     }
2368     } elsif ($self->{state} == SYSTEM_STATE) {
2369     ## ASCII case-insensitive
2370     if ($self->{nc} == [
2371     undef,
2372     0x0059, # Y
2373     0x0053, # S
2374     0x0054, # T
2375     0x0045, # E
2376 wakaba 1.12 ]->[length $self->{kwd}] or
2377 wakaba 1.1 $self->{nc} == [
2378     undef,
2379     0x0079, # y
2380     0x0073, # s
2381     0x0074, # t
2382     0x0065, # e
2383 wakaba 1.12 ]->[length $self->{kwd}]) {
2384 wakaba 1.1 !!!cp (170);
2385     ## Stay in the state.
2386 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2387 wakaba 1.1 !!!next-input-character;
2388     redo A;
2389 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2390 wakaba 1.1 ($self->{nc} == 0x004D or # M
2391     $self->{nc} == 0x006D)) { # m
2392 wakaba 1.12 if ($self->{is_xml} and
2393     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2394     !!!cp (171.1);
2395     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2396     text => 'SYSTEM',
2397     line => $self->{line_prev},
2398     column => $self->{column_prev} - 4);
2399     } else {
2400     !!!cp (171);
2401     }
2402 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2403     !!!next-input-character;
2404     redo A;
2405     } else {
2406 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2407 wakaba 1.1 line => $self->{line_prev},
2408 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2409 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2410     !!!cp (172);
2411     $self->{ct}->{quirks} = 1;
2412     $self->{state} = BOGUS_DOCTYPE_STATE;
2413     } else {
2414     !!!cp (172.1);
2415     $self->{state} = BOGUS_MD_STATE;
2416     }
2417 wakaba 1.1 ## Reconsume.
2418     redo A;
2419     }
2420     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2421     if ($is_space->{$self->{nc}}) {
2422     !!!cp (181);
2423     ## Stay in the state
2424     !!!next-input-character;
2425     redo A;
2426     } elsif ($self->{nc} eq 0x0022) { # "
2427     !!!cp (182);
2428     $self->{ct}->{pubid} = ''; # DOCTYPE
2429     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2430     !!!next-input-character;
2431     redo A;
2432     } elsif ($self->{nc} eq 0x0027) { # '
2433     !!!cp (183);
2434     $self->{ct}->{pubid} = ''; # DOCTYPE
2435     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2436     !!!next-input-character;
2437     redo A;
2438     } elsif ($self->{nc} eq 0x003E) { # >
2439     !!!parse-error (type => 'no PUBLIC literal');
2440 wakaba 1.16
2441     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2442     !!!cp (184);
2443     $self->{state} = DATA_STATE;
2444     $self->{s_kwd} = '';
2445     $self->{ct}->{quirks} = 1;
2446     } else {
2447     !!!cp (184.1);
2448     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2449     }
2450    
2451 wakaba 1.1 !!!next-input-character;
2452 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2453 wakaba 1.1 redo A;
2454     } elsif ($self->{nc} == -1) {
2455 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2456     !!!cp (185);
2457     !!!parse-error (type => 'unclosed DOCTYPE');
2458     $self->{state} = DATA_STATE;
2459     $self->{s_kwd} = '';
2460     $self->{ct}->{quirks} = 1;
2461     } else {
2462     !!!cp (185.1);
2463     !!!parse-error (type => 'unclosed md'); ## TODO: type
2464     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2465     }
2466    
2467 wakaba 1.1 ## reconsume
2468     !!!emit ($self->{ct}); # DOCTYPE
2469     redo A;
2470 wakaba 1.16 } elsif ($self->{is_xml} and
2471     $self->{ct}->{type} == DOCTYPE_TOKEN and
2472     $self->{nc} == 0x005B) { # [
2473 wakaba 1.12 !!!cp (186.1);
2474     !!!parse-error (type => 'no PUBLIC literal');
2475     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2476     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2477 wakaba 1.13 $self->{in_subset} = 1;
2478 wakaba 1.12 !!!next-input-character;
2479 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2480 wakaba 1.12 redo A;
2481 wakaba 1.1 } else {
2482     !!!parse-error (type => 'string after PUBLIC');
2483    
2484 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2485     !!!cp (186);
2486     $self->{ct}->{quirks} = 1;
2487     $self->{state} = BOGUS_DOCTYPE_STATE;
2488     } else {
2489     !!!cp (186.2);
2490     $self->{state} = BOGUS_MD_STATE;
2491     }
2492    
2493 wakaba 1.1 !!!next-input-character;
2494     redo A;
2495     }
2496     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2497     if ($self->{nc} == 0x0022) { # "
2498     !!!cp (187);
2499     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2500     !!!next-input-character;
2501     redo A;
2502     } elsif ($self->{nc} == 0x003E) { # >
2503     !!!parse-error (type => 'unclosed PUBLIC literal');
2504    
2505 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2506     !!!cp (188);
2507     $self->{state} = DATA_STATE;
2508     $self->{s_kwd} = '';
2509     $self->{ct}->{quirks} = 1;
2510     } else {
2511     !!!cp (188.1);
2512     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2513     }
2514    
2515 wakaba 1.1 !!!next-input-character;
2516 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2517 wakaba 1.1 redo A;
2518     } elsif ($self->{nc} == -1) {
2519     !!!parse-error (type => 'unclosed PUBLIC literal');
2520    
2521 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2522     !!!cp (189);
2523     $self->{state} = DATA_STATE;
2524     $self->{s_kwd} = '';
2525     $self->{ct}->{quirks} = 1;
2526     } else {
2527     !!!cp (189.1);
2528     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2529     }
2530    
2531     ## Reconsume.
2532 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2533     redo A;
2534     } else {
2535     !!!cp (190);
2536 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2537 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2538     length $self->{ct}->{pubid});
2539    
2540     ## Stay in the state
2541     !!!next-input-character;
2542     redo A;
2543     }
2544     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2545     if ($self->{nc} == 0x0027) { # '
2546     !!!cp (191);
2547     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2548     !!!next-input-character;
2549     redo A;
2550     } elsif ($self->{nc} == 0x003E) { # >
2551     !!!parse-error (type => 'unclosed PUBLIC literal');
2552    
2553 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2554     !!!cp (192);
2555     $self->{state} = DATA_STATE;
2556     $self->{s_kwd} = '';
2557     $self->{ct}->{quirks} = 1;
2558     } else {
2559     !!!cp (192.1);
2560     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2561     }
2562    
2563 wakaba 1.1 !!!next-input-character;
2564 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2565 wakaba 1.1 redo A;
2566     } elsif ($self->{nc} == -1) {
2567     !!!parse-error (type => 'unclosed PUBLIC literal');
2568    
2569 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2570     !!!cp (193);
2571     $self->{state} = DATA_STATE;
2572     $self->{s_kwd} = '';
2573     $self->{ct}->{quirks} = 1;
2574     } else {
2575     !!!cp (193.1);
2576     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2577     }
2578    
2579 wakaba 1.1 ## reconsume
2580 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2581 wakaba 1.1 redo A;
2582     } else {
2583     !!!cp (194);
2584 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2585 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2586     length $self->{ct}->{pubid});
2587    
2588     ## Stay in the state
2589     !!!next-input-character;
2590     redo A;
2591     }
2592     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2593     if ($is_space->{$self->{nc}}) {
2594     !!!cp (195);
2595     ## Stay in the state
2596     !!!next-input-character;
2597     redo A;
2598     } elsif ($self->{nc} == 0x0022) { # "
2599     !!!cp (196);
2600 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2601 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2602     !!!next-input-character;
2603     redo A;
2604     } elsif ($self->{nc} == 0x0027) { # '
2605     !!!cp (197);
2606 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2607 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2608     !!!next-input-character;
2609     redo A;
2610     } elsif ($self->{nc} == 0x003E) { # >
2611 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2612     if ($self->{is_xml}) {
2613     !!!cp (198.1);
2614     !!!parse-error (type => 'no SYSTEM literal');
2615     } else {
2616     !!!cp (198);
2617     }
2618     $self->{state} = DATA_STATE;
2619     $self->{s_kwd} = '';
2620 wakaba 1.12 } else {
2621 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2622     !!!cp (198.2);
2623     } else {
2624     !!!cp (198.3);
2625     !!!parse-error (type => 'no SYSTEM literal');
2626     }
2627     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2628 wakaba 1.12 }
2629 wakaba 1.16
2630 wakaba 1.1 !!!next-input-character;
2631 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2632 wakaba 1.1 redo A;
2633     } elsif ($self->{nc} == -1) {
2634 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2635     !!!cp (199);
2636     !!!parse-error (type => 'unclosed DOCTYPE');
2637    
2638     $self->{state} = DATA_STATE;
2639     $self->{s_kwd} = '';
2640     $self->{ct}->{quirks} = 1;
2641     } else {
2642     !!!parse-error (type => 'unclosed md'); ## TODO: type
2643     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2644     }
2645    
2646 wakaba 1.1 ## reconsume
2647 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2648 wakaba 1.1 redo A;
2649 wakaba 1.16 } elsif ($self->{is_xml} and
2650     $self->{ct}->{type} == DOCTYPE_TOKEN and
2651     $self->{nc} == 0x005B) { # [
2652 wakaba 1.12 !!!cp (200.1);
2653     !!!parse-error (type => 'no SYSTEM literal');
2654     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2655     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2656 wakaba 1.13 $self->{in_subset} = 1;
2657 wakaba 1.12 !!!next-input-character;
2658 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2659 wakaba 1.12 redo A;
2660 wakaba 1.1 } else {
2661     !!!parse-error (type => 'string after PUBLIC literal');
2662    
2663 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2664     !!!cp (200);
2665     $self->{ct}->{quirks} = 1;
2666     $self->{state} = BOGUS_DOCTYPE_STATE;
2667     } else {
2668     !!!cp (200.2);
2669     $self->{state} = BOGUS_MD_STATE;
2670     }
2671    
2672 wakaba 1.1 !!!next-input-character;
2673     redo A;
2674     }
2675     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2676     if ($is_space->{$self->{nc}}) {
2677     !!!cp (201);
2678     ## Stay in the state
2679     !!!next-input-character;
2680     redo A;
2681     } elsif ($self->{nc} == 0x0022) { # "
2682     !!!cp (202);
2683     $self->{ct}->{sysid} = ''; # DOCTYPE
2684     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2685     !!!next-input-character;
2686     redo A;
2687     } elsif ($self->{nc} == 0x0027) { # '
2688     !!!cp (203);
2689     $self->{ct}->{sysid} = ''; # DOCTYPE
2690     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2691     !!!next-input-character;
2692     redo A;
2693     } elsif ($self->{nc} == 0x003E) { # >
2694     !!!parse-error (type => 'no SYSTEM literal');
2695     !!!next-input-character;
2696    
2697 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2698     !!!cp (204);
2699     $self->{state} = DATA_STATE;
2700     $self->{s_kwd} = '';
2701     $self->{ct}->{quirks} = 1;
2702     } else {
2703     !!!cp (204.1);
2704     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2705     }
2706 wakaba 1.1
2707 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2708 wakaba 1.1 redo A;
2709     } elsif ($self->{nc} == -1) {
2710 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2711     !!!cp (205);
2712     !!!parse-error (type => 'unclosed DOCTYPE');
2713     $self->{state} = DATA_STATE;
2714     $self->{s_kwd} = '';
2715     $self->{ct}->{quirks} = 1;
2716     } else {
2717     !!!cp (205.1);
2718     !!!parse-error (type => 'unclosed md'); ## TODO: type
2719     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2720     }
2721    
2722 wakaba 1.1 ## reconsume
2723 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2724 wakaba 1.1 redo A;
2725 wakaba 1.16 } elsif ($self->{is_xml} and
2726     $self->{ct}->{type} == DOCTYPE_TOKEN and
2727     $self->{nc} == 0x005B) { # [
2728 wakaba 1.12 !!!cp (206.1);
2729     !!!parse-error (type => 'no SYSTEM literal');
2730    
2731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2733 wakaba 1.13 $self->{in_subset} = 1;
2734 wakaba 1.12 !!!next-input-character;
2735 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2736 wakaba 1.12 redo A;
2737 wakaba 1.1 } else {
2738     !!!parse-error (type => 'string after SYSTEM');
2739    
2740 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2741     !!!cp (206);
2742     $self->{ct}->{quirks} = 1;
2743     $self->{state} = BOGUS_DOCTYPE_STATE;
2744     } else {
2745     !!!cp (206.2);
2746     $self->{state} = BOGUS_MD_STATE;
2747     }
2748    
2749 wakaba 1.1 !!!next-input-character;
2750     redo A;
2751     }
2752     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2753     if ($self->{nc} == 0x0022) { # "
2754     !!!cp (207);
2755     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2756     !!!next-input-character;
2757     redo A;
2758 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2759 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2760    
2761 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2762     !!!cp (208);
2763     $self->{state} = DATA_STATE;
2764     $self->{s_kwd} = '';
2765     $self->{ct}->{quirks} = 1;
2766     } else {
2767     !!!cp (208.1);
2768     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2769     }
2770    
2771 wakaba 1.1 !!!next-input-character;
2772 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2773 wakaba 1.1 redo A;
2774     } elsif ($self->{nc} == -1) {
2775     !!!parse-error (type => 'unclosed SYSTEM literal');
2776    
2777 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2778     !!!cp (209);
2779     $self->{state} = DATA_STATE;
2780     $self->{s_kwd} = '';
2781     $self->{ct}->{quirks} = 1;
2782     } else {
2783     !!!cp (209.1);
2784     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2785     }
2786    
2787 wakaba 1.1 ## reconsume
2788 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2789 wakaba 1.1 redo A;
2790     } else {
2791     !!!cp (210);
2792 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2793 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2794     length $self->{ct}->{sysid});
2795    
2796     ## Stay in the state
2797     !!!next-input-character;
2798     redo A;
2799     }
2800     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2801     if ($self->{nc} == 0x0027) { # '
2802     !!!cp (211);
2803     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2804     !!!next-input-character;
2805     redo A;
2806 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2807 wakaba 1.1 !!!cp (212);
2808     !!!parse-error (type => 'unclosed SYSTEM literal');
2809    
2810     $self->{state} = DATA_STATE;
2811 wakaba 1.5 $self->{s_kwd} = '';
2812 wakaba 1.1 !!!next-input-character;
2813    
2814     $self->{ct}->{quirks} = 1;
2815     !!!emit ($self->{ct}); # DOCTYPE
2816    
2817     redo A;
2818     } elsif ($self->{nc} == -1) {
2819     !!!parse-error (type => 'unclosed SYSTEM literal');
2820    
2821 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2822     !!!cp (213);
2823     $self->{state} = DATA_STATE;
2824     $self->{s_kwd} = '';
2825     $self->{ct}->{quirks} = 1;
2826     } else {
2827     !!!cp (213.1);
2828     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2829     }
2830    
2831 wakaba 1.1 ## reconsume
2832 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2833 wakaba 1.1 redo A;
2834     } else {
2835     !!!cp (214);
2836 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2837 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2838     length $self->{ct}->{sysid});
2839    
2840     ## Stay in the state
2841     !!!next-input-character;
2842     redo A;
2843     }
2844     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2845     if ($is_space->{$self->{nc}}) {
2846 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2847     !!!cp (215.1);
2848     $self->{state} = BEFORE_NDATA_STATE;
2849     } else {
2850     !!!cp (215);
2851     ## Stay in the state
2852     }
2853 wakaba 1.1 !!!next-input-character;
2854     redo A;
2855     } elsif ($self->{nc} == 0x003E) { # >
2856 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2857     !!!cp (216);
2858     $self->{state} = DATA_STATE;
2859     $self->{s_kwd} = '';
2860     } else {
2861     !!!cp (216.1);
2862     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2863     }
2864    
2865 wakaba 1.1 !!!next-input-character;
2866 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867 wakaba 1.1 redo A;
2868 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2869     ($self->{nc} == 0x004E or # N
2870     $self->{nc} == 0x006E)) { # n
2871     !!!cp (216.2);
2872     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2873     $self->{state} = NDATA_STATE;
2874     $self->{kwd} = chr $self->{nc};
2875     !!!next-input-character;
2876     redo A;
2877 wakaba 1.1 } elsif ($self->{nc} == -1) {
2878 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2879     !!!cp (217);
2880     !!!parse-error (type => 'unclosed DOCTYPE');
2881     $self->{state} = DATA_STATE;
2882     $self->{s_kwd} = '';
2883     $self->{ct}->{quirks} = 1;
2884     } else {
2885     !!!cp (217.1);
2886     !!!parse-error (type => 'unclosed md'); ## TODO: type
2887     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2888     }
2889    
2890 wakaba 1.1 ## reconsume
2891 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2892 wakaba 1.1 redo A;
2893 wakaba 1.16 } elsif ($self->{is_xml} and
2894     $self->{ct}->{type} == DOCTYPE_TOKEN and
2895     $self->{nc} == 0x005B) { # [
2896 wakaba 1.12 !!!cp (218.1);
2897     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2898     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2899 wakaba 1.13 $self->{in_subset} = 1;
2900 wakaba 1.12 !!!next-input-character;
2901 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2902 wakaba 1.12 redo A;
2903 wakaba 1.1 } else {
2904     !!!parse-error (type => 'string after SYSTEM literal');
2905    
2906 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2907     !!!cp (218);
2908     #$self->{ct}->{quirks} = 1;
2909     $self->{state} = BOGUS_DOCTYPE_STATE;
2910     } else {
2911     !!!cp (218.2);
2912     $self->{state} = BOGUS_MD_STATE;
2913     }
2914    
2915 wakaba 1.1 !!!next-input-character;
2916     redo A;
2917     }
2918 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2919     if ($is_space->{$self->{nc}}) {
2920     !!!cp (218.3);
2921     ## Stay in the state.
2922     !!!next-input-character;
2923     redo A;
2924     } elsif ($self->{nc} == 0x003E) { # >
2925     !!!cp (218.4);
2926     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2927     !!!next-input-character;
2928     !!!emit ($self->{ct}); # ENTITY
2929     redo A;
2930     } elsif ($self->{nc} == 0x004E or # N
2931     $self->{nc} == 0x006E) { # n
2932     !!!cp (218.5);
2933     $self->{state} = NDATA_STATE;
2934     $self->{kwd} = chr $self->{nc};
2935     !!!next-input-character;
2936     redo A;
2937     } elsif ($self->{nc} == -1) {
2938     !!!cp (218.6);
2939     !!!parse-error (type => 'unclosed md'); ## TODO: type
2940     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941     ## reconsume
2942     !!!emit ($self->{ct}); # ENTITY
2943     redo A;
2944     } else {
2945     !!!cp (218.7);
2946     !!!parse-error (type => 'string after SYSTEM literal');
2947     $self->{state} = BOGUS_MD_STATE;
2948     !!!next-input-character;
2949     redo A;
2950     }
2951 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2952     if ($self->{nc} == 0x003E) { # >
2953     !!!cp (219);
2954     $self->{state} = DATA_STATE;
2955 wakaba 1.5 $self->{s_kwd} = '';
2956 wakaba 1.1 !!!next-input-character;
2957    
2958     !!!emit ($self->{ct}); # DOCTYPE
2959    
2960     redo A;
2961 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2962 wakaba 1.13 !!!cp (220.1);
2963     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2964     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2965     $self->{in_subset} = 1;
2966     !!!next-input-character;
2967     !!!emit ($self->{ct}); # DOCTYPE
2968     redo A;
2969 wakaba 1.1 } elsif ($self->{nc} == -1) {
2970     !!!cp (220);
2971     $self->{state} = DATA_STATE;
2972 wakaba 1.5 $self->{s_kwd} = '';
2973 wakaba 1.1 ## reconsume
2974    
2975     !!!emit ($self->{ct}); # DOCTYPE
2976    
2977     redo A;
2978     } else {
2979     !!!cp (221);
2980     my $s = '';
2981 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2982 wakaba 1.1
2983     ## Stay in the state
2984     !!!next-input-character;
2985     redo A;
2986     }
2987     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2988     ## NOTE: "CDATA section state" in the spec is jointly implemented
2989     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2990     ## and |CDATA_SECTION_MSE2_STATE|.
2991 wakaba 1.10
2992     ## XML5: "CDATA state".
2993 wakaba 1.1
2994     if ($self->{nc} == 0x005D) { # ]
2995     !!!cp (221.1);
2996     $self->{state} = CDATA_SECTION_MSE1_STATE;
2997     !!!next-input-character;
2998     redo A;
2999     } elsif ($self->{nc} == -1) {
3000 wakaba 1.6 if ($self->{is_xml}) {
3001 wakaba 1.8 !!!cp (221.11);
3002 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3003 wakaba 1.8 } else {
3004     !!!cp (221.12);
3005 wakaba 1.6 }
3006    
3007 wakaba 1.1 $self->{state} = DATA_STATE;
3008 wakaba 1.5 $self->{s_kwd} = '';
3009 wakaba 1.10 ## Reconsume.
3010 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3011     !!!cp (221.2);
3012     !!!emit ($self->{ct}); # character
3013     } else {
3014     !!!cp (221.3);
3015     ## No token to emit. $self->{ct} is discarded.
3016     }
3017     redo A;
3018     } else {
3019     !!!cp (221.4);
3020     $self->{ct}->{data} .= chr $self->{nc};
3021     $self->{read_until}->($self->{ct}->{data},
3022     q<]>,
3023     length $self->{ct}->{data});
3024    
3025     ## Stay in the state.
3026     !!!next-input-character;
3027     redo A;
3028     }
3029    
3030     ## ISSUE: "text tokens" in spec.
3031     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3032 wakaba 1.10 ## XML5: "CDATA bracket state".
3033    
3034 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3035     !!!cp (221.5);
3036     $self->{state} = CDATA_SECTION_MSE2_STATE;
3037     !!!next-input-character;
3038     redo A;
3039     } else {
3040     !!!cp (221.6);
3041 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3042 wakaba 1.1 $self->{ct}->{data} .= ']';
3043 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3044 wakaba 1.1 ## Reconsume.
3045     redo A;
3046     }
3047     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3048 wakaba 1.10 ## XML5: "CDATA end state".
3049    
3050 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3051     $self->{state} = DATA_STATE;
3052 wakaba 1.5 $self->{s_kwd} = '';
3053 wakaba 1.1 !!!next-input-character;
3054     if (length $self->{ct}->{data}) { # character
3055     !!!cp (221.7);
3056     !!!emit ($self->{ct}); # character
3057     } else {
3058     !!!cp (221.8);
3059     ## No token to emit. $self->{ct} is discarded.
3060     }
3061     redo A;
3062     } elsif ($self->{nc} == 0x005D) { # ]
3063     !!!cp (221.9); # character
3064     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3065     ## Stay in the state.
3066     !!!next-input-character;
3067     redo A;
3068     } else {
3069     !!!cp (221.11);
3070     $self->{ct}->{data} .= ']]'; # character
3071     $self->{state} = CDATA_SECTION_STATE;
3072 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3073 wakaba 1.1 redo A;
3074     }
3075     } elsif ($self->{state} == ENTITY_STATE) {
3076     if ($is_space->{$self->{nc}} or
3077     {
3078     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3079     $self->{entity_add} => 1,
3080     }->{$self->{nc}}) {
3081 wakaba 1.22 if ($self->{is_xml}) {
3082     !!!cp (1001.1);
3083     !!!parse-error (type => 'bare ero',
3084     line => $self->{line_prev},
3085     column => $self->{column_prev}
3086     + ($self->{nc} == -1 ? 1 : 0));
3087     } else {
3088     !!!cp (1001);
3089     ## No error
3090     }
3091 wakaba 1.1 ## Don't consume
3092     ## Return nothing.
3093     #
3094     } elsif ($self->{nc} == 0x0023) { # #
3095     !!!cp (999);
3096     $self->{state} = ENTITY_HASH_STATE;
3097 wakaba 1.12 $self->{kwd} = '#';
3098 wakaba 1.1 !!!next-input-character;
3099     redo A;
3100 wakaba 1.22 } elsif ($self->{is_xml} or
3101     (0x0041 <= $self->{nc} and
3102 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3103     (0x0061 <= $self->{nc} and
3104     $self->{nc} <= 0x007A)) { # a..z
3105     !!!cp (998);
3106     require Whatpm::_NamedEntityList;
3107     $self->{state} = ENTITY_NAME_STATE;
3108 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3109     $self->{entity__value} = $self->{kwd};
3110 wakaba 1.1 $self->{entity__match} = 0;
3111     !!!next-input-character;
3112     redo A;
3113     } else {
3114     !!!cp (1027);
3115     !!!parse-error (type => 'bare ero');
3116     ## Return nothing.
3117     #
3118     }
3119    
3120     ## NOTE: No character is consumed by the "consume a character
3121     ## reference" algorithm. In other words, there is an "&" character
3122     ## that does not introduce a character reference, which would be
3123     ## appended to the parent element or the attribute value in the
3124     ## later processing of the tokenizer.
3125    
3126     if ($self->{prev_state} == DATA_STATE) {
3127     !!!cp (997);
3128     $self->{state} = $self->{prev_state};
3129 wakaba 1.5 $self->{s_kwd} = '';
3130 wakaba 1.1 ## Reconsume.
3131     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3132     line => $self->{line_prev},
3133     column => $self->{column_prev},
3134     });
3135     redo A;
3136     } else {
3137     !!!cp (996);
3138     $self->{ca}->{value} .= '&';
3139     $self->{state} = $self->{prev_state};
3140 wakaba 1.5 $self->{s_kwd} = '';
3141 wakaba 1.1 ## Reconsume.
3142     redo A;
3143     }
3144     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3145 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3146 wakaba 1.1 !!!cp (995);
3147     $self->{state} = HEXREF_X_STATE;
3148 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3149 wakaba 1.1 !!!next-input-character;
3150     redo A;
3151 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3152     !!!cp (995.1);
3153     if ($self->{is_xml}) {
3154     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3155     }
3156     $self->{state} = HEXREF_X_STATE;
3157     $self->{kwd} .= chr $self->{nc};
3158     !!!next-input-character;
3159     redo A;
3160 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3161     $self->{nc} <= 0x0039) { # 0..9
3162     !!!cp (994);
3163     $self->{state} = NCR_NUM_STATE;
3164 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3165 wakaba 1.1 !!!next-input-character;
3166     redo A;
3167     } else {
3168     !!!parse-error (type => 'bare nero',
3169     line => $self->{line_prev},
3170     column => $self->{column_prev} - 1);
3171    
3172     ## NOTE: According to the spec algorithm, nothing is returned,
3173     ## and then "&#" is appended to the parent element or the attribute
3174     ## value in the later processing.
3175    
3176     if ($self->{prev_state} == DATA_STATE) {
3177     !!!cp (1019);
3178     $self->{state} = $self->{prev_state};
3179 wakaba 1.5 $self->{s_kwd} = '';
3180 wakaba 1.1 ## Reconsume.
3181     !!!emit ({type => CHARACTER_TOKEN,
3182     data => '&#',
3183     line => $self->{line_prev},
3184     column => $self->{column_prev} - 1,
3185     });
3186     redo A;
3187     } else {
3188     !!!cp (993);
3189     $self->{ca}->{value} .= '&#';
3190     $self->{state} = $self->{prev_state};
3191 wakaba 1.5 $self->{s_kwd} = '';
3192 wakaba 1.1 ## Reconsume.
3193     redo A;
3194     }
3195     }
3196     } elsif ($self->{state} == NCR_NUM_STATE) {
3197     if (0x0030 <= $self->{nc} and
3198     $self->{nc} <= 0x0039) { # 0..9
3199     !!!cp (1012);
3200 wakaba 1.12 $self->{kwd} *= 10;
3201     $self->{kwd} += $self->{nc} - 0x0030;
3202 wakaba 1.1
3203     ## Stay in the state.
3204     !!!next-input-character;
3205     redo A;
3206     } elsif ($self->{nc} == 0x003B) { # ;
3207     !!!cp (1013);
3208     !!!next-input-character;
3209     #
3210     } else {
3211     !!!cp (1014);
3212     !!!parse-error (type => 'no refc');
3213     ## Reconsume.
3214     #
3215     }
3216    
3217 wakaba 1.12 my $code = $self->{kwd};
3218 wakaba 1.1 my $l = $self->{line_prev};
3219     my $c = $self->{column_prev};
3220     if ($charref_map->{$code}) {
3221     !!!cp (1015);
3222     !!!parse-error (type => 'invalid character reference',
3223     text => (sprintf 'U+%04X', $code),
3224     line => $l, column => $c);
3225     $code = $charref_map->{$code};
3226     } elsif ($code > 0x10FFFF) {
3227     !!!cp (1016);
3228     !!!parse-error (type => 'invalid character reference',
3229     text => (sprintf 'U-%08X', $code),
3230     line => $l, column => $c);
3231     $code = 0xFFFD;
3232     }
3233    
3234     if ($self->{prev_state} == DATA_STATE) {
3235     !!!cp (992);
3236     $self->{state} = $self->{prev_state};
3237 wakaba 1.5 $self->{s_kwd} = '';
3238 wakaba 1.1 ## Reconsume.
3239     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3240 wakaba 1.7 has_reference => 1,
3241 wakaba 1.1 line => $l, column => $c,
3242     });
3243     redo A;
3244     } else {
3245     !!!cp (991);
3246     $self->{ca}->{value} .= chr $code;
3247     $self->{ca}->{has_reference} = 1;
3248     $self->{state} = $self->{prev_state};
3249 wakaba 1.5 $self->{s_kwd} = '';
3250 wakaba 1.1 ## Reconsume.
3251     redo A;
3252     }
3253     } elsif ($self->{state} == HEXREF_X_STATE) {
3254     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3255     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3256     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3257     # 0..9, A..F, a..f
3258     !!!cp (990);
3259     $self->{state} = HEXREF_HEX_STATE;
3260 wakaba 1.12 $self->{kwd} = 0;
3261 wakaba 1.1 ## Reconsume.
3262     redo A;
3263     } else {
3264     !!!parse-error (type => 'bare hcro',
3265     line => $self->{line_prev},
3266     column => $self->{column_prev} - 2);
3267    
3268     ## NOTE: According to the spec algorithm, nothing is returned,
3269     ## and then "&#" followed by "X" or "x" is appended to the parent
3270     ## element or the attribute value in the later processing.
3271    
3272     if ($self->{prev_state} == DATA_STATE) {
3273     !!!cp (1005);
3274     $self->{state} = $self->{prev_state};
3275 wakaba 1.5 $self->{s_kwd} = '';
3276 wakaba 1.1 ## Reconsume.
3277     !!!emit ({type => CHARACTER_TOKEN,
3278 wakaba 1.12 data => '&' . $self->{kwd},
3279 wakaba 1.1 line => $self->{line_prev},
3280 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3281 wakaba 1.1 });
3282     redo A;
3283     } else {
3284     !!!cp (989);
3285 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3286 wakaba 1.1 $self->{state} = $self->{prev_state};
3287 wakaba 1.5 $self->{s_kwd} = '';
3288 wakaba 1.1 ## Reconsume.
3289     redo A;
3290     }
3291     }
3292     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3293     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3294     # 0..9
3295     !!!cp (1002);
3296 wakaba 1.12 $self->{kwd} *= 0x10;
3297     $self->{kwd} += $self->{nc} - 0x0030;
3298 wakaba 1.1 ## Stay in the state.
3299     !!!next-input-character;
3300     redo A;
3301     } elsif (0x0061 <= $self->{nc} and
3302     $self->{nc} <= 0x0066) { # a..f
3303     !!!cp (1003);
3304 wakaba 1.12 $self->{kwd} *= 0x10;
3305     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3306 wakaba 1.1 ## Stay in the state.
3307     !!!next-input-character;
3308     redo A;
3309     } elsif (0x0041 <= $self->{nc} and
3310     $self->{nc} <= 0x0046) { # A..F
3311     !!!cp (1004);
3312 wakaba 1.12 $self->{kwd} *= 0x10;
3313     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3314 wakaba 1.1 ## Stay in the state.
3315     !!!next-input-character;
3316     redo A;
3317     } elsif ($self->{nc} == 0x003B) { # ;
3318     !!!cp (1006);
3319     !!!next-input-character;
3320     #
3321     } else {
3322     !!!cp (1007);
3323     !!!parse-error (type => 'no refc',
3324     line => $self->{line},
3325     column => $self->{column});
3326     ## Reconsume.
3327     #
3328     }
3329    
3330 wakaba 1.12 my $code = $self->{kwd};
3331 wakaba 1.1 my $l = $self->{line_prev};
3332     my $c = $self->{column_prev};
3333     if ($charref_map->{$code}) {
3334     !!!cp (1008);
3335     !!!parse-error (type => 'invalid character reference',
3336     text => (sprintf 'U+%04X', $code),
3337     line => $l, column => $c);
3338     $code = $charref_map->{$code};
3339     } elsif ($code > 0x10FFFF) {
3340     !!!cp (1009);
3341     !!!parse-error (type => 'invalid character reference',
3342     text => (sprintf 'U-%08X', $code),
3343     line => $l, column => $c);
3344     $code = 0xFFFD;
3345     }
3346    
3347     if ($self->{prev_state} == DATA_STATE) {
3348     !!!cp (988);
3349     $self->{state} = $self->{prev_state};
3350 wakaba 1.5 $self->{s_kwd} = '';
3351 wakaba 1.1 ## Reconsume.
3352     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3353 wakaba 1.7 has_reference => 1,
3354 wakaba 1.1 line => $l, column => $c,
3355     });
3356     redo A;
3357     } else {
3358     !!!cp (987);
3359     $self->{ca}->{value} .= chr $code;
3360     $self->{ca}->{has_reference} = 1;
3361     $self->{state} = $self->{prev_state};
3362 wakaba 1.5 $self->{s_kwd} = '';
3363 wakaba 1.1 ## Reconsume.
3364     redo A;
3365     }
3366     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3367 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
3368     $self->{nc} <= 0x005A) or # x
3369     (0x0061 <= $self->{nc} and # a
3370     $self->{nc} <= 0x007A) or # z
3371     (0x0030 <= $self->{nc} and # 0
3372     $self->{nc} <= 0x0039) or # 9
3373 wakaba 1.22 $self->{nc} == 0x003B or # ;
3374     ($self->{is_xml} and
3375     not ($is_space->{$self->{nc}} or
3376     {
3377     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3378     $self->{entity_add} => 1,
3379     }->{$self->{nc}}))) {
3380 wakaba 1.1 our $EntityChar;
3381 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3382 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3383     $self->{ge}->{$self->{kwd}}) {
3384 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3385 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3386     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3387     !!!cp (1020.1);
3388     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3389     } else {
3390     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3391     !!!cp (1020.2);
3392     !!!parse-error (type => 'unparsed entity', ## TODO: type
3393     value => $self->{kwd});
3394     } else {
3395     !!!cp (1020.3);
3396     }
3397     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3398     }
3399     } else {
3400     if ($self->{is_xml}) {
3401     !!!cp (1020.4);
3402     !!!parse-error (type => 'entity not declared', ## TODO: type
3403     value => $self->{kwd},
3404     level => {
3405     'amp;' => $self->{level}->{warn},
3406     'quot;' => $self->{level}->{warn},
3407     'lt;' => $self->{level}->{warn},
3408     'gt;' => $self->{level}->{warn},
3409     'apos;' => $self->{level}->{warn},
3410     }->{$self->{kwd}} ||
3411     $self->{level}->{must});
3412     } else {
3413     !!!cp (1020);
3414     }
3415     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3416     }
3417 wakaba 1.1 $self->{entity__match} = 1;
3418     !!!next-input-character;
3419     #
3420     } else {
3421     !!!cp (1021);
3422 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3423 wakaba 1.1 $self->{entity__match} = -1;
3424     ## Stay in the state.
3425     !!!next-input-character;
3426     redo A;
3427     }
3428     } else {
3429     !!!cp (1022);
3430     $self->{entity__value} .= chr $self->{nc};
3431     $self->{entity__match} *= 2;
3432     ## Stay in the state.
3433     !!!next-input-character;
3434     redo A;
3435     }
3436     }
3437    
3438     my $data;
3439     my $has_ref;
3440     if ($self->{entity__match} > 0) {
3441     !!!cp (1023);
3442     $data = $self->{entity__value};
3443     $has_ref = 1;
3444     #
3445     } elsif ($self->{entity__match} < 0) {
3446     !!!parse-error (type => 'no refc');
3447     if ($self->{prev_state} != DATA_STATE and # in attribute
3448     $self->{entity__match} < -1) {
3449     !!!cp (1024);
3450 wakaba 1.12 $data = '&' . $self->{kwd};
3451 wakaba 1.1 #
3452     } else {
3453     !!!cp (1025);
3454     $data = $self->{entity__value};
3455     $has_ref = 1;
3456     #
3457     }
3458     } else {
3459     !!!cp (1026);
3460     !!!parse-error (type => 'bare ero',
3461     line => $self->{line_prev},
3462 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3463     $data = '&' . $self->{kwd};
3464 wakaba 1.1 #
3465     }
3466    
3467     ## NOTE: In these cases, when a character reference is found,
3468     ## it is consumed and a character token is returned, or, otherwise,
3469     ## nothing is consumed and returned, according to the spec algorithm.
3470     ## In this implementation, anything that has been examined by the
3471     ## tokenizer is appended to the parent element or the attribute value
3472     ## as string, either literal string when no character reference or
3473     ## entity-replaced string otherwise, in this stage, since any characters
3474     ## that would not be consumed are appended in the data state or in an
3475     ## appropriate attribute value state anyway.
3476    
3477     if ($self->{prev_state} == DATA_STATE) {
3478     !!!cp (986);
3479     $self->{state} = $self->{prev_state};
3480 wakaba 1.5 $self->{s_kwd} = '';
3481 wakaba 1.1 ## Reconsume.
3482     !!!emit ({type => CHARACTER_TOKEN,
3483     data => $data,
3484 wakaba 1.7 has_reference => $has_ref,
3485 wakaba 1.1 line => $self->{line_prev},
3486 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3487 wakaba 1.1 });
3488     redo A;
3489     } else {
3490     !!!cp (985);
3491     $self->{ca}->{value} .= $data;
3492     $self->{ca}->{has_reference} = 1 if $has_ref;
3493     $self->{state} = $self->{prev_state};
3494 wakaba 1.5 $self->{s_kwd} = '';
3495 wakaba 1.1 ## Reconsume.
3496     redo A;
3497     }
3498 wakaba 1.8
3499     ## XML-only states
3500    
3501     } elsif ($self->{state} == PI_STATE) {
3502 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3503    
3504 wakaba 1.8 if ($is_space->{$self->{nc}} or
3505 wakaba 1.14 $self->{nc} == 0x003F or # ?
3506 wakaba 1.8 $self->{nc} == -1) {
3507 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3508     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3509     ## "DOCTYPE pi state": Parse error, switch to the "data
3510     ## state".
3511 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3512     line => $self->{line_prev},
3513     column => $self->{column_prev}
3514     - 1 * ($self->{nc} != -1));
3515     $self->{state} = BOGUS_COMMENT_STATE;
3516     ## Reconsume.
3517     $self->{ct} = {type => COMMENT_TOKEN,
3518     data => '?',
3519     line => $self->{line_prev},
3520     column => $self->{column_prev}
3521     - 1 * ($self->{nc} != -1),
3522     };
3523     redo A;
3524     } else {
3525 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3526 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3527     target => chr $self->{nc},
3528     data => '',
3529     line => $self->{line_prev},
3530     column => $self->{column_prev} - 1,
3531     };
3532     $self->{state} = PI_TARGET_STATE;
3533     !!!next-input-character;
3534     redo A;
3535     }
3536     } elsif ($self->{state} == PI_TARGET_STATE) {
3537     if ($is_space->{$self->{nc}}) {
3538     $self->{state} = PI_TARGET_AFTER_STATE;
3539     !!!next-input-character;
3540     redo A;
3541     } elsif ($self->{nc} == -1) {
3542     !!!parse-error (type => 'no pic'); ## TODO: type
3543 wakaba 1.13 if ($self->{in_subset}) {
3544     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3545     } else {
3546     $self->{state} = DATA_STATE;
3547     $self->{s_kwd} = '';
3548     }
3549 wakaba 1.8 ## Reconsume.
3550     !!!emit ($self->{ct}); # pi
3551     redo A;
3552     } elsif ($self->{nc} == 0x003F) { # ?
3553     $self->{state} = PI_AFTER_STATE;
3554     !!!next-input-character;
3555     redo A;
3556     } else {
3557     ## XML5: typo ("tag name" -> "target")
3558     $self->{ct}->{target} .= chr $self->{nc}; # pi
3559     !!!next-input-character;
3560     redo A;
3561     }
3562     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3563     if ($is_space->{$self->{nc}}) {
3564     ## Stay in the state.
3565     !!!next-input-character;
3566     redo A;
3567     } else {
3568     $self->{state} = PI_DATA_STATE;
3569     ## Reprocess.
3570     redo A;
3571     }
3572     } elsif ($self->{state} == PI_DATA_STATE) {
3573     if ($self->{nc} == 0x003F) { # ?
3574     $self->{state} = PI_DATA_AFTER_STATE;
3575     !!!next-input-character;
3576     redo A;
3577     } elsif ($self->{nc} == -1) {
3578     !!!parse-error (type => 'no pic'); ## TODO: type
3579 wakaba 1.13 if ($self->{in_subset}) {
3580 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3581 wakaba 1.13 } else {
3582     $self->{state} = DATA_STATE;
3583     $self->{s_kwd} = '';
3584     }
3585 wakaba 1.8 ## Reprocess.
3586     !!!emit ($self->{ct}); # pi
3587     redo A;
3588     } else {
3589     $self->{ct}->{data} .= chr $self->{nc}; # pi
3590     $self->{read_until}->($self->{ct}->{data}, q[?],
3591     length $self->{ct}->{data});
3592     ## Stay in the state.
3593     !!!next-input-character;
3594     ## Reprocess.
3595     redo A;
3596     }
3597     } elsif ($self->{state} == PI_AFTER_STATE) {
3598 wakaba 1.14 ## XML5: Part of "Pi after state".
3599    
3600 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3601 wakaba 1.13 if ($self->{in_subset}) {
3602     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3603     } else {
3604     $self->{state} = DATA_STATE;
3605     $self->{s_kwd} = '';
3606     }
3607 wakaba 1.8 !!!next-input-character;
3608     !!!emit ($self->{ct}); # pi
3609     redo A;
3610     } elsif ($self->{nc} == 0x003F) { # ?
3611     !!!parse-error (type => 'no s after target', ## TODO: type
3612     line => $self->{line_prev},
3613     column => $self->{column_prev}); ## XML5: no error
3614     $self->{ct}->{data} .= '?';
3615     $self->{state} = PI_DATA_AFTER_STATE;
3616     !!!next-input-character;
3617     redo A;
3618     } else {
3619     !!!parse-error (type => 'no s after target', ## TODO: type
3620     line => $self->{line_prev},
3621     column => $self->{column_prev}
3622     + 1 * ($self->{nc} == -1)); ## XML5: no error
3623     $self->{ct}->{data} .= '?'; ## XML5: not appended
3624     $self->{state} = PI_DATA_STATE;
3625     ## Reprocess.
3626     redo A;
3627     }
3628     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3629 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3630    
3631 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3632 wakaba 1.13 if ($self->{in_subset}) {
3633     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3634     } else {
3635     $self->{state} = DATA_STATE;
3636     $self->{s_kwd} = '';
3637     }
3638 wakaba 1.8 !!!next-input-character;
3639     !!!emit ($self->{ct}); # pi
3640     redo A;
3641     } elsif ($self->{nc} == 0x003F) { # ?
3642     $self->{ct}->{data} .= '?';
3643     ## Stay in the state.
3644     !!!next-input-character;
3645     redo A;
3646     } else {
3647     $self->{ct}->{data} .= '?'; ## XML5: not appended
3648     $self->{state} = PI_DATA_STATE;
3649     ## Reprocess.
3650     redo A;
3651     }
3652 wakaba 1.12
3653     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3654     if ($self->{nc} == 0x003C) { # <
3655 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3656 wakaba 1.12 !!!next-input-character;
3657     redo A;
3658     } elsif ($self->{nc} == 0x0025) { # %
3659     ## XML5: Not defined yet.
3660    
3661     ## TODO:
3662     !!!next-input-character;
3663     redo A;
3664     } elsif ($self->{nc} == 0x005D) { # ]
3665 wakaba 1.13 delete $self->{in_subset};
3666 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3667     !!!next-input-character;
3668     redo A;
3669     } elsif ($is_space->{$self->{nc}}) {
3670     ## Stay in the state.
3671     !!!next-input-character;
3672     redo A;
3673     } elsif ($self->{nc} == -1) {
3674     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3675 wakaba 1.13 delete $self->{in_subset};
3676 wakaba 1.12 $self->{state} = DATA_STATE;
3677     $self->{s_kwd} = '';
3678     ## Reconsume.
3679 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3680 wakaba 1.12 redo A;
3681     } else {
3682     unless ($self->{internal_subset_tainted}) {
3683     ## XML5: No parse error.
3684     !!!parse-error (type => 'string in internal subset');
3685     $self->{internal_subset_tainted} = 1;
3686     }
3687     ## Stay in the state.
3688     !!!next-input-character;
3689     redo A;
3690     }
3691     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3692     if ($self->{nc} == 0x003E) { # >
3693     $self->{state} = DATA_STATE;
3694     $self->{s_kwd} = '';
3695     !!!next-input-character;
3696 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3697 wakaba 1.12 redo A;
3698     } elsif ($self->{nc} == -1) {
3699     !!!parse-error (type => 'unclosed DOCTYPE');
3700     $self->{state} = DATA_STATE;
3701     $self->{s_kwd} = '';
3702     ## Reconsume.
3703 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3704 wakaba 1.12 redo A;
3705     } else {
3706     ## XML5: No parse error and stay in the state.
3707     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3708    
3709 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3710     !!!next-input-character;
3711     redo A;
3712     }
3713     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3714     if ($self->{nc} == 0x003E) { # >
3715     $self->{state} = DATA_STATE;
3716     $self->{s_kwd} = '';
3717     !!!next-input-character;
3718     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3719     redo A;
3720     } elsif ($self->{nc} == -1) {
3721     $self->{state} = DATA_STATE;
3722     $self->{s_kwd} = '';
3723     ## Reconsume.
3724     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3725     redo A;
3726     } else {
3727     ## Stay in the state.
3728     !!!next-input-character;
3729     redo A;
3730     }
3731     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3732     if ($self->{nc} == 0x0021) { # !
3733 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3734 wakaba 1.13 !!!next-input-character;
3735     redo A;
3736     } elsif ($self->{nc} == 0x003F) { # ?
3737     $self->{state} = PI_STATE;
3738     !!!next-input-character;
3739     redo A;
3740     } elsif ($self->{nc} == -1) {
3741     !!!parse-error (type => 'bare stago');
3742     $self->{state} = DATA_STATE;
3743     $self->{s_kwd} = '';
3744     ## Reconsume.
3745     redo A;
3746     } else {
3747     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3748     line => $self->{line_prev},
3749     column => $self->{column_prev});
3750     $self->{state} = BOGUS_COMMENT_STATE;
3751     $self->{ct} = {type => COMMENT_TOKEN,
3752     data => '',
3753     }; ## NOTE: Will be discarded.
3754 wakaba 1.12 !!!next-input-character;
3755     redo A;
3756     }
3757 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3758     ## XML5: "DOCTYPE markup declaration state".
3759    
3760     if ($self->{nc} == 0x002D) { # -
3761     $self->{state} = MD_HYPHEN_STATE;
3762     !!!next-input-character;
3763     redo A;
3764 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3765     $self->{nc} == 0x0065) { # e
3766 wakaba 1.14 $self->{state} = MD_E_STATE;
3767     $self->{kwd} = chr $self->{nc};
3768     !!!next-input-character;
3769     redo A;
3770 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3771     $self->{nc} == 0x0061) { # a
3772 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3773     $self->{kwd} = chr $self->{nc};
3774     !!!next-input-character;
3775     redo A;
3776 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3777     $self->{nc} == 0x006E) { # n
3778 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3779     $self->{kwd} = chr $self->{nc};
3780     !!!next-input-character;
3781     redo A;
3782     } else {
3783     #
3784     }
3785    
3786     ## XML5: No parse error.
3787     !!!parse-error (type => 'bogus comment',
3788     line => $self->{line_prev},
3789     column => $self->{column_prev} - 1);
3790     ## Reconsume.
3791     $self->{state} = BOGUS_COMMENT_STATE;
3792     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3793     redo A;
3794     } elsif ($self->{state} == MD_E_STATE) {
3795 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3796     $self->{nc} == 0x006E) { # n
3797 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3798     $self->{kwd} .= chr $self->{nc};
3799     !!!next-input-character;
3800     redo A;
3801 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3802     $self->{nc} == 0x006C) { # l
3803 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3804     $self->{state} = MD_ELEMENT_STATE;
3805     $self->{kwd} .= chr $self->{nc};
3806     !!!next-input-character;
3807     redo A;
3808     } else {
3809     ## XML5: No parse error.
3810     !!!parse-error (type => 'bogus comment',
3811     line => $self->{line_prev},
3812     column => $self->{column_prev} - 2
3813     + 1 * ($self->{nc} == -1));
3814     ## Reconsume.
3815     $self->{state} = BOGUS_COMMENT_STATE;
3816     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3817     redo A;
3818     }
3819     } elsif ($self->{state} == MD_ENTITY_STATE) {
3820 wakaba 1.17 if ($self->{nc} == [
3821     undef,
3822     undef,
3823     0x0054, # T
3824     0x0049, # I
3825     0x0054, # T
3826     ]->[length $self->{kwd}] or
3827     $self->{nc} == [
3828     undef,
3829     undef,
3830     0x0074, # t
3831     0x0069, # i
3832     0x0074, # t
3833     ]->[length $self->{kwd}]) {
3834 wakaba 1.14 ## Stay in the state.
3835     $self->{kwd} .= chr $self->{nc};
3836     !!!next-input-character;
3837     redo A;
3838 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3839     ($self->{nc} == 0x0059 or # Y
3840     $self->{nc} == 0x0079)) { # y
3841     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3842     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3843     text => 'ENTITY',
3844     line => $self->{line_prev},
3845     column => $self->{column_prev} - 4);
3846     }
3847     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3848 wakaba 1.14 line => $self->{line_prev},
3849     column => $self->{column_prev} - 6};
3850     $self->{state} = DOCTYPE_MD_STATE;
3851     !!!next-input-character;
3852     redo A;
3853     } else {
3854     !!!parse-error (type => 'bogus comment',
3855     line => $self->{line_prev},
3856     column => $self->{column_prev} - 1
3857     - (length $self->{kwd})
3858     + 1 * ($self->{nc} == -1));
3859     $self->{state} = BOGUS_COMMENT_STATE;
3860     ## Reconsume.
3861     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3862     redo A;
3863     }
3864     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3865 wakaba 1.17 if ($self->{nc} == [
3866     undef,
3867     undef,
3868     0x0045, # E
3869     0x004D, # M
3870     0x0045, # E
3871     0x004E, # N
3872     ]->[length $self->{kwd}] or
3873     $self->{nc} == [
3874     undef,
3875     undef,
3876     0x0065, # e
3877     0x006D, # m
3878     0x0065, # e
3879     0x006E, # n
3880     ]->[length $self->{kwd}]) {
3881 wakaba 1.14 ## Stay in the state.
3882     $self->{kwd} .= chr $self->{nc};
3883     !!!next-input-character;
3884     redo A;
3885 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and ## Six keyword letters buffered; accept the final "T"/"t" of "ELEMENT".
3886     ($self->{nc} == 0x0054 or # T
3887     $self->{nc} == 0x0074)) { # t
3888     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) { ## Any lowercase letter in the keyword is reported.
3889     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3890     text => 'ELEMENT',
3891     line => $self->{line_prev},
3892     column => $self->{column_prev} - 5); ## First letter of the keyword.
3893     }
3894 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3895     line => $self->{line_prev},
3896     column => $self->{column_prev} - 7}; ## "<" of "<!ELEMENT": two columns before the keyword's first letter (column_prev - 5 above), matching the bogus-comment branch's column_prev - 1 - length kwd. Was "- 6", the offset for the shorter "ENTITY".
3897     $self->{state} = DOCTYPE_MD_STATE;
3898     !!!next-input-character;
3899     redo A;
3900     } else {
3901     !!!parse-error (type => 'bogus comment',
3902     line => $self->{line_prev},
3903     column => $self->{column_prev} - 1
3904     - (length $self->{kwd})
3905     + 1 * ($self->{nc} == -1));
3906     $self->{state} = BOGUS_COMMENT_STATE;
3907     ## Reconsume.
3908     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3909     redo A;
3910     }
3911     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3912 wakaba 1.17 if ($self->{nc} == [
3913     undef,
3914     0x0054, # T
3915     0x0054, # T
3916     0x004C, # L
3917     0x0049, # I
3918     0x0053, # S
3919     ]->[length $self->{kwd}] or
3920     $self->{nc} == [
3921     undef,
3922     0x0074, # t
3923     0x0074, # t
3924     0x006C, # l
3925     0x0069, # i
3926     0x0073, # s
3927     ]->[length $self->{kwd}]) {
3928 wakaba 1.14 ## Stay in the state.
3929     $self->{kwd} .= chr $self->{nc};
3930     !!!next-input-character;
3931     redo A;
3932 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and ## Six keyword letters buffered; accept the final "T"/"t" of "ATTLIST".
3933     ($self->{nc} == 0x0054 or # T
3934     $self->{nc} == 0x0074)) { # t
3935     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) { ## Any lowercase letter in the keyword is reported.
3936     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3937     text => 'ATTLIST',
3938     line => $self->{line_prev},
3939     column => $self->{column_prev} - 5); ## First letter of the keyword.
3940     }
3941 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3942 wakaba 1.15 attrdefs => [],
3943 wakaba 1.14 line => $self->{line_prev},
3944     column => $self->{column_prev} - 7}; ## "<" of "<!ATTLIST": two columns before the keyword's first letter (column_prev - 5 above), matching the bogus-comment branch's column_prev - 1 - length kwd. Was "- 6", the offset for the shorter "ENTITY".
3945     $self->{state} = DOCTYPE_MD_STATE;
3946     !!!next-input-character;
3947     redo A;
3948     } else {
3949     !!!parse-error (type => 'bogus comment',
3950     line => $self->{line_prev},
3951     column => $self->{column_prev} - 1
3952     - (length $self->{kwd})
3953     + 1 * ($self->{nc} == -1));
3954     $self->{state} = BOGUS_COMMENT_STATE;
3955     ## Reconsume.
3956     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3957     redo A;
3958     }
3959     } elsif ($self->{state} == MD_NOTATION_STATE) {
3960 wakaba 1.17 if ($self->{nc} == [
3961     undef,
3962     0x004F, # O
3963     0x0054, # T
3964     0x0041, # A
3965     0x0054, # T
3966     0x0049, # I
3967     0x004F, # O
3968     ]->[length $self->{kwd}] or
3969     $self->{nc} == [
3970     undef,
3971     0x006F, # o
3972     0x0074, # t
3973     0x0061, # a
3974     0x0074, # t
3975     0x0069, # i
3976     0x006F, # o
3977     ]->[length $self->{kwd}]) {
3978 wakaba 1.14 ## Stay in the state.
3979     $self->{kwd} .= chr $self->{nc};
3980     !!!next-input-character;
3981     redo A;
3982 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and ## Seven keyword letters buffered; accept the final "N"/"n" of "NOTATION".
3983     ($self->{nc} == 0x004E or # N
3984     $self->{nc} == 0x006E)) { # n
3985     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) { ## Any lowercase letter in the keyword is reported.
3986     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3987     text => 'NOTATION',
3988     line => $self->{line_prev},
3989     column => $self->{column_prev} - 6); ## First letter of the keyword.
3990     }
3991 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3992     line => $self->{line_prev},
3993     column => $self->{column_prev} - 8}; ## "<" of "<!NOTATION": two columns before the keyword's first letter (column_prev - 6 above), matching the bogus-comment branch's column_prev - 1 - length kwd. Was "- 6", the offset for the shorter "ENTITY".
3994     $self->{state} = DOCTYPE_MD_STATE;
3995     !!!next-input-character;
3996     redo A;
3997     } else {
3998     !!!parse-error (type => 'bogus comment',
3999     line => $self->{line_prev},
4000     column => $self->{column_prev} - 1
4001     - (length $self->{kwd})
4002     + 1 * ($self->{nc} == -1));
4003     $self->{state} = BOGUS_COMMENT_STATE;
4004     ## Reconsume.
4005     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4006     redo A;
4007     }
4008     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4009     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4010     ## "DOCTYPE NOTATION state".
4011    
4012     if ($is_space->{$self->{nc}}) {
4013     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4014     $self->{state} = BEFORE_MD_NAME_STATE;
4015     !!!next-input-character;
4016     redo A;
4017     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4018     $self->{nc} == 0x0025) { # %
4019     ## XML5: Switch to the "DOCTYPE bogus comment state".
4020     !!!parse-error (type => 'no space before md name'); ## TODO: type
4021     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4022     !!!next-input-character;
4023     redo A;
4024     } elsif ($self->{nc} == -1) {
4025     !!!parse-error (type => 'unclosed md'); ## TODO: type
4026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4027     ## Reconsume.
4028     redo A;
4029     } elsif ($self->{nc} == 0x003E) { # >
4030     ## XML5: Switch to the "DOCTYPE bogus comment state".
4031     !!!parse-error (type => 'no md name'); ## TODO: type
4032     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4033     !!!next-input-character;
4034     redo A;
4035     } else {
4036     ## XML5: Switch to the "DOCTYPE bogus comment state".
4037     !!!parse-error (type => 'no space before md name'); ## TODO: type
4038     $self->{state} = BEFORE_MD_NAME_STATE;
4039     redo A;
4040     }
4041     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4042     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4043     ## before state", "DOCTYPE ATTLIST name before state".
4044    
4045     if ($is_space->{$self->{nc}}) {
4046     ## Stay in the state.
4047     !!!next-input-character;
4048     redo A;
4049     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4050     $self->{nc} == 0x0025) { # %
4051     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4052     !!!next-input-character;
4053     redo A;
4054     } elsif ($self->{nc} == 0x003E) { # >
4055     ## XML5: Same as "Anything else".
4056     !!!parse-error (type => 'no md name'); ## TODO: type
4057     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4058     !!!next-input-character;
4059     redo A;
4060     } elsif ($self->{nc} == -1) {
4061     !!!parse-error (type => 'unclosed md'); ## TODO: type
4062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4063     ## Reconsume.
4064     redo A;
4065     } else {
4066     ## XML5: [ATTLIST] Not defined yet.
4067     $self->{ct}->{name} .= chr $self->{nc};
4068     $self->{state} = MD_NAME_STATE;
4069     !!!next-input-character;
4070     redo A;
4071     }
4072     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4073     if ($is_space->{$self->{nc}}) {
4074     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4075     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4076     $self->{state} = BEFORE_MD_NAME_STATE;
4077     !!!next-input-character;
4078     redo A;
4079     } elsif ($self->{nc} == 0x003E) { # >
4080     ## XML5: Same as "Anything else".
4081     !!!parse-error (type => 'no md name'); ## TODO: type
4082     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4083     !!!next-input-character;
4084     redo A;
4085     } elsif ($self->{nc} == -1) {
4086     !!!parse-error (type => 'unclosed md');
4087     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4088     ## Reconsume.
4089     redo A;
4090     } else {
4091     ## XML5: No parse error.
4092     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4093     $self->{state} = BOGUS_COMMENT_STATE;
4094     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4095     ## Reconsume.
4096     redo A;
4097     }
4098     } elsif ($self->{state} == MD_NAME_STATE) {
4099     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4100    
4101     if ($is_space->{$self->{nc}}) {
4102 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4103     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4104     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4105 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4106 wakaba 1.16 } else { # ENTITY/NOTATION
4107     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4108     }
4109 wakaba 1.14 !!!next-input-character;
4110     redo A;
4111     } elsif ($self->{nc} == 0x003E) { # >
4112     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4113     #
4114     } else {
4115 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4116 wakaba 1.14 }
4117     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4118     !!!next-input-character;
4119     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4120     redo A;
4121     } elsif ($self->{nc} == -1) {
4122     ## XML5: [ATTLIST] No parse error.
4123     !!!parse-error (type => 'unclosed md');
4124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4125     ## Reconsume.
4126     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4127     redo A;
4128     } else {
4129     ## XML5: [ATTLIST] Not defined yet.
4130     $self->{ct}->{name} .= chr $self->{nc};
4131     ## Stay in the state.
4132     !!!next-input-character;
4133     redo A;
4134     }
4135     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4136     if ($is_space->{$self->{nc}}) {
4137     ## Stay in the state.
4138     !!!next-input-character;
4139     redo A;
4140     } elsif ($self->{nc} == 0x003E) { # >
4141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4142     !!!next-input-character;
4143     !!!emit ($self->{ct}); # ATTLIST
4144     redo A;
4145     } elsif ($self->{nc} == -1) {
4146     ## XML5: No parse error.
4147     !!!parse-error (type => 'unclosed md'); ## TODO: type
4148     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4149 wakaba 1.15 !!!emit ($self->{ct});
4150     redo A;
4151     } else {
4152     ## XML5: Not defined yet.
4153     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4154     tokens => [],
4155     line => $self->{line}, column => $self->{column}};
4156     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4157     !!!next-input-character;
4158     redo A;
4159     }
4160     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4161     if ($is_space->{$self->{nc}}) {
4162     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4163     !!!next-input-character;
4164     redo A;
4165     } elsif ($self->{nc} == 0x003E) { # >
4166     ## XML5: Same as "anything else".
4167     !!!parse-error (type => 'no attr type'); ## TODO: type
4168     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4169     !!!next-input-character;
4170     !!!emit ($self->{ct}); # ATTLIST
4171     redo A;
4172     } elsif ($self->{nc} == 0x0028) { # (
4173     ## XML5: Same as "anything else".
4174     !!!parse-error (type => 'no space before paren'); ## TODO: type
4175     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4176     !!!next-input-character;
4177     redo A;
4178     } elsif ($self->{nc} == -1) {
4179     ## XML5: No parse error.
4180     !!!parse-error (type => 'unclosed md'); ## TODO: type
4181     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4182     !!!next-input-character;
4183     !!!emit ($self->{ct}); # ATTLIST
4184     redo A;
4185     } else {
4186     ## XML5: Not defined yet.
4187     $self->{ca}->{name} .= chr $self->{nc};
4188     ## Stay in the state.
4189     !!!next-input-character;
4190     redo A;
4191     }
4192     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4193     if ($is_space->{$self->{nc}}) {
4194     ## Stay in the state.
4195     !!!next-input-character;
4196     redo A;
4197     } elsif ($self->{nc} == 0x003E) { # >
4198     ## XML5: Same as "anything else".
4199     !!!parse-error (type => 'no attr type'); ## TODO: type
4200     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4201     !!!next-input-character;
4202     !!!emit ($self->{ct}); # ATTLIST
4203     redo A;
4204     } elsif ($self->{nc} == 0x0028) { # (
4205     ## XML5: Same as "anything else".
4206     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4207     !!!next-input-character;
4208     redo A;
4209     } elsif ($self->{nc} == -1) {
4210     ## XML5: No parse error.
4211     !!!parse-error (type => 'unclosed md'); ## TODO: type
4212     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4213     !!!next-input-character;
4214     !!!emit ($self->{ct});
4215 wakaba 1.14 redo A;
4216     } else {
4217     ## XML5: Not defined yet.
4218 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4219     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4220     !!!next-input-character;
4221     redo A;
4222     }
4223     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4224     if ($is_space->{$self->{nc}}) {
4225     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4226     !!!next-input-character;
4227     redo A;
4228     } elsif ($self->{nc} == 0x0023) { # #
4229     ## XML5: Same as "anything else".
4230     !!!parse-error (type => 'no space before default value'); ## TODO: type
4231     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4232     !!!next-input-character;
4233     redo A;
4234     } elsif ($self->{nc} == 0x0022) { # "
4235     ## XML5: Same as "anything else".
4236     !!!parse-error (type => 'no space before default value'); ## TODO: type
4237     $self->{ca}->{value} = '';
4238     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4239     !!!next-input-character;
4240     redo A;
4241     } elsif ($self->{nc} == 0x0027) { # '
4242     ## XML5: Same as "anything else".
4243     !!!parse-error (type => 'no space before default value'); ## TODO: type
4244     $self->{ca}->{value} = '';
4245     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4246     !!!next-input-character;
4247     redo A;
4248     } elsif ($self->{nc} == 0x003E) { # >
4249     ## XML5: Same as "anything else".
4250     !!!parse-error (type => 'no attr default'); ## TODO: type
4251     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4252     !!!next-input-character;
4253     !!!emit ($self->{ct}); # ATTLIST
4254     redo A;
4255     } elsif ($self->{nc} == 0x0028) { # (
4256     ## XML5: Same as "anything else".
4257     !!!parse-error (type => 'no space before paren'); ## TODO: type
4258     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4259     !!!next-input-character;
4260     redo A;
4261     } elsif ($self->{nc} == -1) {
4262     ## XML5: No parse error.
4263     !!!parse-error (type => 'unclosed md'); ## TODO: type
4264     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4265     !!!next-input-character;
4266     !!!emit ($self->{ct});
4267     redo A;
4268     } else {
4269     ## XML5: Not defined yet.
4270     $self->{ca}->{type} .= chr $self->{nc};
4271     ## Stay in the state.
4272     !!!next-input-character;
4273     redo A;
4274     }
4275     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4276     if ($is_space->{$self->{nc}}) {
4277     ## Stay in the state.
4278     !!!next-input-character;
4279     redo A;
4280     } elsif ($self->{nc} == 0x0028) { # (
4281     ## XML5: Same as "anything else".
4282     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4283     !!!next-input-character;
4284     redo A;
4285     } elsif ($self->{nc} == 0x0023) { # #
4286     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4287     !!!next-input-character;
4288     redo A;
4289     } elsif ($self->{nc} == 0x0022) { # "
4290     ## XML5: Same as "anything else".
4291     $self->{ca}->{value} = '';
4292     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4293     !!!next-input-character;
4294     redo A;
4295     } elsif ($self->{nc} == 0x0027) { # '
4296     ## XML5: Same as "anything else".
4297     $self->{ca}->{value} = '';
4298     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4299     !!!next-input-character;
4300     redo A;
4301     } elsif ($self->{nc} == 0x003E) { # >
4302     ## XML5: Same as "anything else".
4303     !!!parse-error (type => 'no attr default'); ## TODO: type
4304     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4305     !!!next-input-character;
4306     !!!emit ($self->{ct}); # ATTLIST
4307     redo A;
4308     } elsif ($self->{nc} == -1) {
4309     ## XML5: No parse error.
4310     !!!parse-error (type => 'unclosed md'); ## TODO: type
4311     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4312     !!!next-input-character;
4313     !!!emit ($self->{ct});
4314     redo A;
4315     } else {
4316     ## XML5: Switch to the "DOCTYPE bogus comment state".
4317     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4318     $self->{ca}->{value} = '';
4319     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4320     ## Reconsume.
4321     redo A;
4322     }
4323     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4324     if ($is_space->{$self->{nc}}) {
4325     ## Stay in the state.
4326     !!!next-input-character;
4327     redo A;
4328     } elsif ($self->{nc} == 0x007C) { # |
4329     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4330     ## Stay in the state.
4331     !!!next-input-character;
4332     redo A;
4333     } elsif ($self->{nc} == 0x0029) { # )
4334     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4335     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4336     !!!next-input-character;
4337     redo A;
4338     } elsif ($self->{nc} == 0x003E) { # >
4339     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4340     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4341     !!!next-input-character;
4342     !!!emit ($self->{ct}); # ATTLIST
4343     redo A;
4344     } elsif ($self->{nc} == -1) {
4345     ## XML5: No parse error.
4346     !!!parse-error (type => 'unclosed md'); ## TODO: type
4347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4348     !!!next-input-character;
4349     !!!emit ($self->{ct});
4350     redo A;
4351     } else {
4352     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4353     $self->{state} = ALLOWED_TOKEN_STATE;
4354     !!!next-input-character;
4355     redo A;
4356     }
4357     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4358     if ($is_space->{$self->{nc}}) {
4359     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4360     !!!next-input-character;
4361     redo A;
4362     } elsif ($self->{nc} == 0x007C) { # |
4363     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4364     !!!next-input-character;
4365     redo A;
4366     } elsif ($self->{nc} == 0x0029) { # )
4367     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4368     !!!next-input-character;
4369     redo A;
4370     } elsif ($self->{nc} == 0x003E) { # >
4371     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4372     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4373     !!!next-input-character;
4374     !!!emit ($self->{ct}); # ATTLIST
4375     redo A;
4376     } elsif ($self->{nc} == -1) {
4377     ## XML5: No parse error.
4378     !!!parse-error (type => 'unclosed md'); ## TODO: type
4379     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4380     !!!next-input-character;
4381     !!!emit ($self->{ct});
4382     redo A;
4383     } else {
4384     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4385     ## Stay in the state.
4386     !!!next-input-character;
4387     redo A;
4388     }
4389     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4390     if ($is_space->{$self->{nc}}) {
4391     ## Stay in the state.
4392     !!!next-input-character;
4393     redo A;
4394     } elsif ($self->{nc} == 0x007C) { # |
4395     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4396     !!!next-input-character;
4397     redo A;
4398     } elsif ($self->{nc} == 0x0029) { # )
4399     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4400     !!!next-input-character;
4401     redo A;
4402     } elsif ($self->{nc} == 0x003E) { # >
4403     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4404     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4405     !!!next-input-character;
4406     !!!emit ($self->{ct}); # ATTLIST
4407     redo A;
4408     } elsif ($self->{nc} == -1) {
4409     ## XML5: No parse error.
4410     !!!parse-error (type => 'unclosed md'); ## TODO: type
4411     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4412     !!!next-input-character;
4413     !!!emit ($self->{ct});
4414     redo A;
4415     } else {
4416     !!!parse-error (type => 'space in allowed token', ## TODO: type
4417     line => $self->{line_prev},
4418     column => $self->{column_prev});
4419     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4420     $self->{state} = ALLOWED_TOKEN_STATE;
4421     !!!next-input-character;
4422     redo A;
4423     }
4424     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4425     if ($is_space->{$self->{nc}}) {
4426     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4427     !!!next-input-character;
4428     redo A;
4429     } elsif ($self->{nc} == 0x0023) { # #
4430     !!!parse-error (type => 'no space before default value'); ## TODO: type
4431     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4432     !!!next-input-character;
4433     redo A;
4434     } elsif ($self->{nc} == 0x0022) { # "
4435     !!!parse-error (type => 'no space before default value'); ## TODO: type
4436     $self->{ca}->{value} = '';
4437     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4438     !!!next-input-character;
4439     redo A;
4440     } elsif ($self->{nc} == 0x0027) { # '
4441     !!!parse-error (type => 'no space before default value'); ## TODO: type
4442     $self->{ca}->{value} = '';
4443     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4444     !!!next-input-character;
4445     redo A;
4446     } elsif ($self->{nc} == 0x003E) { # >
4447     !!!parse-error (type => 'no attr default'); ## TODO: type
4448     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4449     !!!next-input-character;
4450     !!!emit ($self->{ct}); # ATTLIST
4451     redo A;
4452     } elsif ($self->{nc} == -1) {
4453     !!!parse-error (type => 'unclosed md'); ## TODO: type
4454     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4455     !!!next-input-character;
4456     !!!emit ($self->{ct});
4457     redo A;
4458     } else {
4459     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4460     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4461     ## Reconsume.
4462     redo A;
4463     }
4464     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4465     if ($is_space->{$self->{nc}}) {
4466     ## Stay in the state.
4467     !!!next-input-character;
4468     redo A;
4469     } elsif ($self->{nc} == 0x0023) { # #
4470     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4471     !!!next-input-character;
4472     redo A;
4473     } elsif ($self->{nc} == 0x0022) { # "
4474     $self->{ca}->{value} = '';
4475     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4476     !!!next-input-character;
4477     redo A;
4478     } elsif ($self->{nc} == 0x0027) { # '
4479     $self->{ca}->{value} = '';
4480     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4481     !!!next-input-character;
4482     redo A;
4483     } elsif ($self->{nc} == 0x003E) { # >
4484     !!!parse-error (type => 'no attr default'); ## TODO: type
4485     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4486     !!!next-input-character;
4487     !!!emit ($self->{ct}); # ATTLIST
4488     redo A;
4489     } elsif ($self->{nc} == -1) {
4490     !!!parse-error (type => 'unclosed md'); ## TODO: type
4491     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4492     !!!next-input-character;
4493     !!!emit ($self->{ct});
4494     redo A;
4495     } else {
4496     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4497     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4498     ## Reconsume.
4499     redo A;
4500     }
4501     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4502     if ($is_space->{$self->{nc}}) {
4503     ## XML5: No parse error.
4504     !!!parse-error (type => 'no default type'); ## TODO: type
4505 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4506 wakaba 1.14 ## Reconsume.
4507     redo A;
4508 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4509     ## XML5: Same as "anything else".
4510     $self->{ca}->{value} = '';
4511     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4512     !!!next-input-character;
4513     redo A;
4514     } elsif ($self->{nc} == 0x0027) { # '
4515     ## XML5: Same as "anything else".
4516     $self->{ca}->{value} = '';
4517     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4518     !!!next-input-character;
4519     redo A;
4520     } elsif ($self->{nc} == 0x003E) { # >
4521     ## XML5: Same as "anything else".
4522     !!!parse-error (type => 'no attr default'); ## TODO: type
4523     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4524     !!!next-input-character;
4525     !!!emit ($self->{ct}); # ATTLIST
4526     redo A;
4527     } elsif ($self->{nc} == -1) {
4528     ## XML5: No parse error.
4529     !!!parse-error (type => 'unclosed md'); ## TODO: type
4530     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4531     !!!next-input-character;
4532     !!!emit ($self->{ct});
4533     redo A;
4534     } else {
4535     $self->{ca}->{default} = chr $self->{nc};
4536     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4537     !!!next-input-character;
4538     redo A;
4539 wakaba 1.14 }
4540 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4541     if ($is_space->{$self->{nc}}) {
4542     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4543     !!!next-input-character;
4544     redo A;
4545     } elsif ($self->{nc} == 0x0022) { # "
4546     ## XML5: Same as "anything else".
4547     !!!parse-error (type => 'no space before default value'); ## TODO: type
4548     $self->{ca}->{value} = '';
4549     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4550     !!!next-input-character;
4551     redo A;
4552     } elsif ($self->{nc} == 0x0027) { # '
4553     ## XML5: Same as "anything else".
4554     !!!parse-error (type => 'no space before default value'); ## TODO: type
4555     $self->{ca}->{value} = '';
4556     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4557     !!!next-input-character;
4558     redo A;
4559     } elsif ($self->{nc} == 0x003E) { # >
4560     ## XML5: Same as "anything else".
4561     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4562     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4563     !!!next-input-character;
4564     !!!emit ($self->{ct}); # ATTLIST
4565     redo A;
4566     } elsif ($self->{nc} == -1) {
4567     ## XML5: No parse error.
4568     !!!parse-error (type => 'unclosed md'); ## TODO: type
4569     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4570     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4571     !!!next-input-character;
4572     !!!emit ($self->{ct});
4573     redo A;
4574     } else {
4575     $self->{ca}->{default} .= chr $self->{nc};
4576     ## Stay in the state.
4577     !!!next-input-character;
4578     redo A;
4579     }
4580     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4581     if ($is_space->{$self->{nc}}) {
4582     ## Stay in the state.
4583     !!!next-input-character;
4584     redo A;
4585     } elsif ($self->{nc} == 0x0022) { # "
4586     $self->{ca}->{value} = '';
4587     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4588     !!!next-input-character;
4589     redo A;
4590     } elsif ($self->{nc} == 0x0027) { # '
4591     $self->{ca}->{value} = '';
4592     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4593     !!!next-input-character;
4594     redo A;
4595     } elsif ($self->{nc} == 0x003E) { # >
4596     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4597     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4598     !!!next-input-character;
4599     !!!emit ($self->{ct}); # ATTLIST
4600     redo A;
4601     } elsif ($self->{nc} == -1) {
4602     ## XML5: No parse error.
4603     !!!parse-error (type => 'unclosed md'); ## TODO: type
4604     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4605     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4606     !!!next-input-character;
4607     !!!emit ($self->{ct});
4608     redo A;
4609     } else {
4610     ## XML5: Not defined yet.
4611     if ($self->{ca}->{default} eq 'FIXED') {
4612     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4613     } else {
4614     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4615     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4616     }
4617     ## Reconsume.
4618     redo A;
4619     }
4620     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4621     if ($is_space->{$self->{nc}} or
4622     $self->{nc} == -1 or
4623     $self->{nc} == 0x003E) { # >
4624     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4625     ## Reconsume.
4626     redo A;
4627     } else {
4628     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4629     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4630     ## Reconsume.
4631     redo A;
4632 wakaba 1.16 }
4633 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4634     ## ASCII case-insensitive
4635     if ($self->{nc} == [
4636     undef,
4637     0x0044, # D
4638     0x0041, # A
4639     0x0054, # T
4640     ]->[length $self->{kwd}] or
4641     $self->{nc} == [
4642     undef,
4643     0x0064, # d
4644     0x0061, # a
4645     0x0074, # t
4646     ]->[length $self->{kwd}]) {
4647     !!!cp (172.2);
4648     ## Stay in the state.
4649     $self->{kwd} .= chr $self->{nc};
4650     !!!next-input-character;
4651     redo A;
4652     } elsif ((length $self->{kwd}) == 4 and
4653     ($self->{nc} == 0x0041 or # A
4654     $self->{nc} == 0x0061)) { # a
4655     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4656     !!!cp (172.3);
4657     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4658     text => 'NDATA',
4659     line => $self->{line_prev},
4660     column => $self->{column_prev} - 4);
4661     } else {
4662     !!!cp (172.4);
4663     }
4664     $self->{state} = AFTER_NDATA_STATE;
4665     !!!next-input-character;
4666     redo A;
4667     } else {
4668     !!!parse-error (type => 'string after literal', ## TODO: type
4669     line => $self->{line_prev},
4670     column => $self->{column_prev} + 1
4671     - length $self->{kwd});
4672     !!!cp (172.5);
4673     $self->{state} = BOGUS_MD_STATE;
4674     ## Reconsume.
4675     redo A;
4676     }
4677     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4678     if ($is_space->{$self->{nc}}) {
4679     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4680     !!!next-input-character;
4681     redo A;
4682     } elsif ($self->{nc} == 0x003E) { # >
4683     !!!parse-error (type => 'no notation name'); ## TODO: type
4684     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4685     !!!next-input-character;
4686     !!!emit ($self->{ct}); # ENTITY
4687     redo A;
4688     } elsif ($self->{nc} == -1) {
4689     !!!parse-error (type => 'unclosed md'); ## TODO: type
4690     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4691     !!!next-input-character;
4692     !!!emit ($self->{ct}); # ENTITY
4693     redo A;
4694     } else {
4695     !!!parse-error (type => 'string after literal', ## TODO: type
4696     line => $self->{line_prev},
4697     column => $self->{column_prev} + 1
4698     - length $self->{kwd});
4699     $self->{state} = BOGUS_MD_STATE;
4700     ## Reconsume.
4701     redo A;
4702     }
4703     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4704     if ($is_space->{$self->{nc}}) {
4705     ## Stay in the state.
4706     !!!next-input-character;
4707     redo A;
4708     } elsif ($self->{nc} == 0x003E) { # >
4709     !!!parse-error (type => 'no notation name'); ## TODO: type
4710     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4711     !!!next-input-character;
4712     !!!emit ($self->{ct}); # ENTITY
4713     redo A;
4714     } elsif ($self->{nc} == -1) {
4715     !!!parse-error (type => 'unclosed md'); ## TODO: type
4716     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4717     !!!next-input-character;
4718     !!!emit ($self->{ct}); # ENTITY
4719     redo A;
4720     } else {
4721     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4722     $self->{state} = NOTATION_NAME_STATE;
4723     !!!next-input-character;
4724     redo A;
4725     }
4726     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4727     if ($is_space->{$self->{nc}}) {
4728 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4729 wakaba 1.18 !!!next-input-character;
4730     redo A;
4731     } elsif ($self->{nc} == 0x003E) { # >
4732     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4733     !!!next-input-character;
4734     !!!emit ($self->{ct}); # ENTITY
4735     redo A;
4736     } elsif ($self->{nc} == -1) {
4737     !!!parse-error (type => 'unclosed md'); ## TODO: type
4738     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4739     !!!next-input-character;
4740     !!!emit ($self->{ct}); # ENTITY
4741     redo A;
4742     } else {
4743     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4744     ## Stay in the state.
4745     !!!next-input-character;
4746     redo A;
4747     }
4748 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4749     if ($self->{nc} == 0x0022) { # "
4750 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4751 wakaba 1.19 !!!next-input-character;
4752     redo A;
4753     } elsif ($self->{nc} == 0x0026) { # &
4754     $self->{prev_state} = $self->{state};
4755     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4756     $self->{entity_add} = 0x0022; # "
4757     !!!next-input-character;
4758     redo A;
4759     ## TODO: %
4760     } elsif ($self->{nc} == -1) {
4761     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4762     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4763     ## Reconsume.
4764     !!!emit ($self->{ct}); # ENTITY
4765     redo A;
4766     } else {
4767     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4768     !!!next-input-character;
4769     redo A;
4770     }
4771     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4772     if ($self->{nc} == 0x0027) { # '
4773 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4774 wakaba 1.19 !!!next-input-character;
4775     redo A;
4776     } elsif ($self->{nc} == 0x0026) { # &
4777     $self->{prev_state} = $self->{state};
4778     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4779     $self->{entity_add} = 0x0027; # '
4780     !!!next-input-character;
4781     redo A;
4782     ## TODO: %
4783     } elsif ($self->{nc} == -1) {
4784     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4785     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4786     ## Reconsume.
4787     !!!emit ($self->{ct}); # ENTITY
4788     redo A;
4789     } else {
4790     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4791     !!!next-input-character;
4792     redo A;
4793     }
4794     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4795     if ($is_space->{$self->{nc}} or
4796     {
4797     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4798     $self->{entity_add} => 1,
4799     }->{$self->{nc}}) {
4800 wakaba 1.22 !!!parse-error (type => 'bare ero',
4801     line => $self->{line_prev},
4802     column => $self->{column_prev}
4803     + ($self->{nc} == -1 ? 1 : 0));
4804 wakaba 1.19 ## Don't consume
4805     ## Return nothing.
4806     #
4807     } elsif ($self->{nc} == 0x0023) { # #
4808     $self->{ca} = $self->{ct};
4809     $self->{state} = ENTITY_HASH_STATE;
4810     $self->{kwd} = '#';
4811     !!!next-input-character;
4812     redo A;
4813     } else {
4814     #
4815     }
4816    
4817     $self->{ct}->{value} .= '&';
4818     $self->{state} = $self->{prev_state};
4819     ## Reconsume.
4820     redo A;
4821 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4822     if ($is_space->{$self->{nc}}) {
4823     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4824     !!!next-input-character;
4825     redo A;
4826     } elsif ($self->{nc} == 0x0028) { # (
4827     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4828     $self->{ct}->{content} = ['('];
4829     $self->{group_depth} = 1;
4830     !!!next-input-character;
4831     redo A;
4832     } elsif ($self->{nc} == 0x003E) { # >
4833     !!!parse-error (type => 'no md def'); ## TODO: type
4834     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4835     !!!next-input-character;
4836     !!!emit ($self->{ct}); # ELEMENT
4837     redo A;
4838     } elsif ($self->{nc} == -1) {
4839     !!!parse-error (type => 'unclosed md'); ## TODO: type
4840     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4841     !!!next-input-character;
4842     !!!emit ($self->{ct}); # ELEMENT
4843     redo A;
4844     } else {
4845     $self->{ct}->{content} = [chr $self->{nc}];
4846     $self->{state} = CONTENT_KEYWORD_STATE;
4847     !!!next-input-character;
4848     redo A;
4849     }
4850     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4851     if ($is_space->{$self->{nc}}) {
4852     $self->{state} = AFTER_MD_DEF_STATE;
4853     !!!next-input-character;
4854     redo A;
4855     } elsif ($self->{nc} == 0x003E) { # >
4856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4857     !!!next-input-character;
4858     !!!emit ($self->{ct}); # ELEMENT
4859     redo A;
4860     } elsif ($self->{nc} == -1) {
4861     !!!parse-error (type => 'unclosed md'); ## TODO: type
4862     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4863     !!!next-input-character;
4864     !!!emit ($self->{ct}); # ELEMENT
4865     redo A;
4866     } else {
4867     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4868     ## Stay in the state.
4869     !!!next-input-character;
4870     redo A;
4871     }
4872     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4873     if ($is_space->{$self->{nc}}) {
4874     ## Stay in the state.
4875     !!!next-input-character;
4876     redo A;
4877     } elsif ($self->{nc} == 0x0028) { # (
4878     $self->{group_depth}++;
4879     push @{$self->{ct}->{content}}, chr $self->{nc};
4880     ## Stay in the state.
4881     !!!next-input-character;
4882     redo A;
4883     } elsif ($self->{nc} == 0x007C or # |
4884     $self->{nc} == 0x002C) { # ,
4885     !!!parse-error (type => 'empty element name'); ## TODO: type
4886     ## Stay in the state.
4887     !!!next-input-character;
4888     redo A;
4889     } elsif ($self->{nc} == 0x0029) { # )
4890     !!!parse-error (type => 'empty element name'); ## TODO: type
4891     push @{$self->{ct}->{content}}, chr $self->{nc};
4892     $self->{group_depth}--;
4893     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4894     !!!next-input-character;
4895     redo A;
4896     } elsif ($self->{nc} == 0x003E) { # >
4897     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4898     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4899     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4900     !!!next-input-character;
4901     !!!emit ($self->{ct}); # ELEMENT
4902     redo A;
4903     } elsif ($self->{nc} == -1) {
4904     !!!parse-error (type => 'unclosed md'); ## TODO: type
4905     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4906     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4907     !!!next-input-character;
4908     !!!emit ($self->{ct}); # ELEMENT
4909     redo A;
4910     } else {
4911     push @{$self->{ct}->{content}}, chr $self->{nc};
4912     $self->{state} = CM_ELEMENT_NAME_STATE;
4913     !!!next-input-character;
4914     redo A;
4915     }
4916     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4917     if ($is_space->{$self->{nc}}) {
4918     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4919     !!!next-input-character;
4920     redo A;
4921     } elsif ($self->{nc} == 0x002A or # *
4922     $self->{nc} == 0x002B or # +
4923     $self->{nc} == 0x003F) { # ?
4924     push @{$self->{ct}->{content}}, chr $self->{nc};
4925     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4926     !!!next-input-character;
4927     redo A;
4928     } elsif ($self->{nc} == 0x007C or # |
4929     $self->{nc} == 0x002C) { # ,
4930     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4931     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4932     !!!next-input-character;
4933     redo A;
4934     } elsif ($self->{nc} == 0x0029) { # )
4935     $self->{group_depth}--;
4936     push @{$self->{ct}->{content}}, chr $self->{nc};
4937     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4938     !!!next-input-character;
4939     redo A;
4940     } elsif ($self->{nc} == 0x003E) { # >
4941     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4942     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4943     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4944     !!!next-input-character;
4945     !!!emit ($self->{ct}); # ELEMENT
4946     redo A;
4947     } elsif ($self->{nc} == -1) {
4948     !!!parse-error (type => 'unclosed md'); ## TODO: type
4949     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4950     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4951     !!!next-input-character;
4952     !!!emit ($self->{ct}); # ELEMENT
4953     redo A;
4954     } else {
4955     $self->{ct}->{content}->[-1] .= chr $self->{nc};
4956     ## Stay in the state.
4957     !!!next-input-character;
4958     redo A;
4959     }
4960     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4961     if ($is_space->{$self->{nc}}) {
4962     ## Stay in the state.
4963     !!!next-input-character;
4964     redo A;
4965     } elsif ($self->{nc} == 0x007C or # |
4966     $self->{nc} == 0x002C) { # ,
4967     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4968     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4969     !!!next-input-character;
4970     redo A;
4971     } elsif ($self->{nc} == 0x0029) { # )
4972     $self->{group_depth}--;
4973     push @{$self->{ct}->{content}}, chr $self->{nc};
4974     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4975     !!!next-input-character;
4976     redo A;
4977     } elsif ($self->{nc} == 0x003E) { # >
4978     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4979     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4981     !!!next-input-character;
4982     !!!emit ($self->{ct}); # ELEMENT
4983     redo A;
4984     } elsif ($self->{nc} == -1) {
4985     !!!parse-error (type => 'unclosed md'); ## TODO: type
4986     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4987     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4988     !!!next-input-character;
4989     !!!emit ($self->{ct}); # ELEMENT
4990     redo A;
4991     } else {
4992     !!!parse-error (type => 'after element name'); ## TODO: type
4993     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4994     $self->{state} = BOGUS_MD_STATE;
4995     !!!next-input-character;
4996     redo A;
4997     }
4998     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
4999     if ($is_space->{$self->{nc}}) {
5000     if ($self->{group_depth}) {
5001     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5002     } else {
5003     $self->{state} = AFTER_MD_DEF_STATE;
5004     }
5005     !!!next-input-character;
5006     redo A;
5007     } elsif ($self->{nc} == 0x002A or # *
5008     $self->{nc} == 0x002B or # +
5009     $self->{nc} == 0x003F) { # ?
5010     push @{$self->{ct}->{content}}, chr $self->{nc};
5011     if ($self->{group_depth}) {
5012     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5013     } else {
5014     $self->{state} = AFTER_MD_DEF_STATE;
5015     }
5016     !!!next-input-character;
5017     redo A;
5018     } elsif ($self->{nc} == 0x0029) { # )
5019     if ($self->{group_depth}) {
5020     $self->{group_depth}--;
5021     push @{$self->{ct}->{content}}, chr $self->{nc};
5022     ## Stay in the state.
5023     !!!next-input-character;
5024     redo A;
5025     } else {
5026     !!!parse-error (type => 'string after md def'); ## TODO: type
5027     $self->{state} = BOGUS_MD_STATE;
5028     ## Reconsume.
5029     redo A;
5030     }
5031     } elsif ($self->{nc} == 0x003E) { # >
5032     if ($self->{group_depth}) {
5033     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5034     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5035     }
5036     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5037     !!!next-input-character;
5038     !!!emit ($self->{ct}); # ELEMENT
5039     redo A;
5040     } elsif ($self->{nc} == -1) {
5041     !!!parse-error (type => 'unclosed md'); ## TODO: type
5042     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5043     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5044     !!!next-input-character;
5045     !!!emit ($self->{ct}); # ELEMENT
5046     redo A;
5047     } else {
5048     if ($self->{group_depth}) {
5049     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5050     } else {
5051     !!!parse-error (type => 'string after md def'); ## TODO: type
5052     $self->{state} = BOGUS_MD_STATE;
5053     }
5054     ## Reconsume.
5055     redo A;
5056     }
5057     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5058 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5059     ## Stay in the state.
5060     !!!next-input-character;
5061     redo A;
5062     } elsif ($self->{nc} == 0x003E) { # >
5063     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5064     !!!next-input-character;
5065 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5066 wakaba 1.18 redo A;
5067     } elsif ($self->{nc} == -1) {
5068     !!!parse-error (type => 'unclosed md'); ## TODO: type
5069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5070     !!!next-input-character;
5071 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5072 wakaba 1.18 redo A;
5073     } else {
5074 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5075 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5076     ## Reconsume.
5077     redo A;
5078     }
5079 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5080     if ($self->{nc} == 0x003E) { # >
5081     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5082     !!!next-input-character;
5083     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5084     redo A;
5085     } elsif ($self->{nc} == -1) {
5086     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5087     ## Reconsume.
5088     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5089     redo A;
5090     } else {
5091     ## Stay in the state.
5092     !!!next-input-character;
5093     redo A;
5094     }
5095 wakaba 1.1 } else {
5096     die "$0: $self->{state}: Unknown state";
5097     }
5098     } # A
5099    
5100     die "$0: _get_next_token: unexpected case";
5101     } # _get_next_token
5102    
5103     1;
5104 wakaba 1.22 ## $Date: 2008/10/19 09:25:21 $
5105 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24