/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.31 - (hide annotations) (download) (as text)
Sat Sep 5 09:26:55 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.30: +31 -12 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 09:26:39 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Added test cases for "comment end bang
	state" (HTML5 revision 3191).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:26:12 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	bang state" (HTML5 revision 3191).

package Whatpm::HTML::Tokenizer;
use strict;
## NOTE: Kept on a single line; the version is derived from the CVS
## |Revision| keyword.
our $VERSION=do{my @r=(q$Revision: 1.30 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export set-up.  Every token type constant can be imported by name,
## or all of them at once by the |:token| tag.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Single list of exportable names, shared by |@EXPORT_OK| and the
  ## |:token| tag so that the two can never disagree.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (
    token => [@token_type_names],
  );
}

## NOTE: Differences from the XML5 draft are marked as "XML5:".

## Token types: the value of |$token->{type}|.  Each constant is a sub
## with an empty prototype.

sub DOCTYPE_TOKEN          () {  1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN          () {  2 }
sub START_TAG_TOKEN        () {  3 }
sub END_TAG_TOKEN          () {  4 }
sub END_OF_FILE_TOKEN      () {  5 }
sub CHARACTER_TOKEN        () {  6 }
sub PI_TOKEN               () {  7 } ## NOTE: XML only.
sub ABORT_TOKEN            () {  8 } ## NOTE: For internal processing.
sub END_OF_DOCTYPE_TOKEN   () {  9 } ## NOTE: XML only.
sub ATTLIST_TOKEN          () { 10 } ## NOTE: XML only.
sub ELEMENT_TOKEN          () { 11 } ## NOTE: XML only.
sub GENERAL_ENTITY_TOKEN   () { 12 } ## NOTE: XML only.
sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
sub NOTATION_TOKEN         () { 14 } ## NOTE: XML only.

## XML5: XML5 has an "empty tag token".  This implementation has no
## such token type; it emits a start tag token and sets
## |$self->{self_closing}| to true instead.

## XML5: XML5 has a "short end tag token".  This implementation has no
## such token type; it emits an end tag token whose
## |$token->{tag_name}| is the empty string instead.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags: bit flags, tested against |$self->{content_model}| with |&|.
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
87    
88     ## Tokenizer states: values of |$self->{state}|, compared with |==| in |_get_next_token|.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108 wakaba 1.31 sub COMMENT_END_BANG_STATE () { 102 } ## LAST: highest state number in use (102); listed out of numeric order
109 wakaba 1.1 sub COMMENT_END_DASH_STATE () { 18 }
110     sub BOGUS_COMMENT_STATE () { 19 }
111     sub DOCTYPE_STATE () { 20 }
112     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
113     sub DOCTYPE_NAME_STATE () { 22 }
114     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
115     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
117     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
118     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
119     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
121     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
122     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
123     sub BOGUS_DOCTYPE_STATE () { 32 }
124     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
125     sub SELF_CLOSING_START_TAG_STATE () { 34 }
126     sub CDATA_SECTION_STATE () { 35 }
127     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
128     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
129     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
130     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
131     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
132     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
133     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
134     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
135     ## NOTE: "Entity data state", "entity in attribute value state", and
136     ## "consume a character reference" algorithm are jointly implemented
137     ## using the following six states:
138     sub ENTITY_STATE () { 44 }
139     sub ENTITY_HASH_STATE () { 45 }
140     sub NCR_NUM_STATE () { 46 }
141     sub HEXREF_X_STATE () { 47 }
142     sub HEXREF_HEX_STATE () { 48 }
143     sub ENTITY_NAME_STATE () { 49 }
144     sub PCDATA_STATE () { 50 } # "data state" in the spec
145    
146 wakaba 1.12 ## XML-only states
147 wakaba 1.8 sub PI_STATE () { 51 }
148     sub PI_TARGET_STATE () { 52 }
149     sub PI_TARGET_AFTER_STATE () { 53 }
150     sub PI_DATA_STATE () { 54 }
151     sub PI_AFTER_STATE () { 55 }
152     sub PI_DATA_AFTER_STATE () { 56 }
153 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
154     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
155 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
156     sub DOCTYPE_TAG_STATE () { 60 }
157     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
158     sub MD_ATTLIST_STATE () { 62 }
159     sub MD_E_STATE () { 63 }
160     sub MD_ELEMENT_STATE () { 64 }
161     sub MD_ENTITY_STATE () { 65 }
162     sub MD_NOTATION_STATE () { 66 }
163     sub DOCTYPE_MD_STATE () { 67 }
164     sub BEFORE_MD_NAME_STATE () { 68 }
165     sub MD_NAME_STATE () { 69 }
166     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
167     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
168 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
171     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
172     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
173     sub ALLOWED_TOKEN_STATE () { 77 }
174     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
175     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
176     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
179     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
180     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
181 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
182     sub NDATA_STATE () { 86 }
183     sub AFTER_NDATA_STATE () { 87 }
184     sub BEFORE_NOTATION_NAME_STATE () { 88 }
185     sub NOTATION_NAME_STATE () { 89 }
186 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
187     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
188     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
189     sub AFTER_ELEMENT_NAME_STATE () { 93 }
190     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
191     sub CONTENT_KEYWORD_STATE () { 95 }
192     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
193     sub CM_ELEMENT_NAME_STATE () { 97 }
194     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
195     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
196     sub AFTER_MD_DEF_STATE () { 100 }
197     sub BOGUS_MD_STATE () { 101 }
198 wakaba 1.8
199 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
200     ## list and descriptions)
201    
202     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # 1 << 11 (0x800)
203     sub FOREIGN_EL () { 0b1_00000000000 } # also 1 << 11 (0x800): same numeric value as IN_FOREIGN_CONTENT_IM
204    
## Character reference mappings
##
## |$charref_map| maps the code point written in a numeric character
## reference to the code point that replaces it.  A code point that is
## not a key of the hash has no replacement.
my $charref_map = do {
  ## Replacements for 0x80..0x9F, in code point order (eight per row).
  my @c1_replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );

  my %replacement_of = (0x0D => 0x000A);
  @replacement_of{0x80 .. 0x9F} = @c1_replacement;

  ## Code points replaced by U+FFFD.
  my @to_fffd = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF,
    0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
  );
  ## The last two code points of each of the planes 0x00..0x10
  ## (0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF).
  for my $plane (0x00 .. 0x10) {
    push @to_fffd, ($plane << 16) | 0xFFFE, ($plane << 16) | 0xFFFF;
  }
  for my $code (@to_fffd) {
    $replacement_of{$code} = 0xFFFD;
  }

  \%replacement_of;
}; # $charref_map
250    
251     ## Implementations MUST act as if they used the state machine in the spec.
252    
## Resets the tokenizer fields of |$self| to their initial values:
## state, keyword, content model, current token and attribute, input
## buffer, next input character and the token queue.  Takes the
## tokenizer object as its only argument.
253     sub _initialize_tokenizer ($) {
254     my $self = shift;
255    
256     ## NOTE: Fields set by |new| constructor:
257     #$self->{level}
258     #$self->{set_nc}
259     #$self->{parse_error}
260 wakaba 1.3 #$self->{is_xml} (if XML)
261 wakaba 1.1
262     $self->{state} = DATA_STATE; # MUST
263 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
264     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
265 wakaba 1.1 #$self->{entity__value}; # initialized when used
266     #$self->{entity__match}; # initialized when used
267     $self->{content_model} = PCDATA_CONTENT_MODEL; # start in the PCDATA content model
268     undef $self->{ct}; # current token
269     undef $self->{ca}; # current attribute
270     undef $self->{last_stag_name}; # last emitted start tag name
271     #$self->{prev_state}; # initialized when used
272     delete $self->{self_closing}; # no self-closing flag is pending
273     $self->{char_buffer} = ''; # NOTE(review): presumably the input buffer read by |!!!next-input-character| - confirm
274     $self->{char_buffer_pos} = 0;
275     $self->{nc} = -1; # next input character; -1 is handled as EOF by |_get_next_token|
276     #$self->{next_nc}
277     !!!next-input-character; # NOTE(review): not Perl; presumably a macro expanded when the .pm is generated - confirm
278     $self->{token} = []; # queue of tokens that |_get_next_token| returns before reading input
279     # $self->{escape}: set and deleted in |DATA_STATE| handling; not reset here
280     } # _initialize_tokenizer
281    
282     ## A token has:
283     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
284 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
285 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
286     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
287 wakaba 1.11 ## ->{target} (PI_TOKEN)
288 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
289     ## ->{sysid} (DOCTYPE_TOKEN)
290     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
291     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
292     ## ->{name}
293     ## ->{value}
294     ## ->{has_reference} == 1 or 0
295 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
296     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
297 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
298 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
299 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
300    
301 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
302     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
303     ## while the token is pushed back to the stack.
304    
305     ## Emitted token MUST immediately be handled by the tree construction state.
306    
307     ## Before each step, UA MAY check to see if either one of the scripts in
308     ## "list of scripts that will execute as soon as possible" or the first
309     ## script in the "list of scripts that will execute asynchronously",
310     ## has completed loading. If one has, then it MUST be executed
311     ## and removed from the list.
312    
313     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
314     ## (This requirement was dropped from HTML5 spec, unfortunately.)
315    
## Set of code points that the tokenizer treats as space characters:
## |$is_space->{$code}| is true for the members listed below and
## undefined for anything else.  LINE TABULATION (VT, 0x000B) and
## CARRIAGE RETURN (CR, 0x000D) are not members.
my $is_space = {
  map { ($_ => 1) } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
  )
};
324    
325     sub _get_next_token ($) {
326     my $self = shift;
327    
328     if ($self->{self_closing}) {
329     !!!parse-error (type => 'nestc', token => $self->{ct});
330     ## NOTE: The |self_closing| flag is only set by start tag token.
331     ## In addition, when a start tag token is emitted, it is always set to
332     ## |ct|.
333     delete $self->{self_closing};
334     }
335    
336     if (@{$self->{token}}) {
337     $self->{self_closing} = $self->{token}->[0]->{self_closing};
338     return shift @{$self->{token}};
339     }
340    
341     A: {
342     if ($self->{state} == PCDATA_STATE) {
343     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
344    
345     if ($self->{nc} == 0x0026) { # &
346     !!!cp (0.1);
347     ## NOTE: In the spec, the tokenizer is switched to the
348     ## "entity data state". In this implementation, the tokenizer
349     ## is switched to the |ENTITY_STATE|, which is an implementation
350     ## of the "consume a character reference" algorithm.
351     $self->{entity_add} = -1;
352     $self->{prev_state} = DATA_STATE;
353     $self->{state} = ENTITY_STATE;
354     !!!next-input-character;
355     redo A;
356     } elsif ($self->{nc} == 0x003C) { # <
357     !!!cp (0.2);
358     $self->{state} = TAG_OPEN_STATE;
359     !!!next-input-character;
360     redo A;
361     } elsif ($self->{nc} == -1) {
362     !!!cp (0.3);
363     !!!emit ({type => END_OF_FILE_TOKEN,
364     line => $self->{line}, column => $self->{column}});
365     last A; ## TODO: ok?
366     } else {
367     !!!cp (0.4);
368     #
369     }
370    
371     # Anything else
372     my $token = {type => CHARACTER_TOKEN,
373     data => chr $self->{nc},
374     line => $self->{line}, column => $self->{column},
375     };
376     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
377    
378     ## Stay in the state.
379     !!!next-input-character;
380     !!!emit ($token);
381     redo A;
382     } elsif ($self->{state} == DATA_STATE) {
383     $self->{s_kwd} = '' unless defined $self->{s_kwd};
384     if ($self->{nc} == 0x0026) { # &
385     $self->{s_kwd} = '';
386     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
387     not $self->{escape}) {
388     !!!cp (1);
389     ## NOTE: In the spec, the tokenizer is switched to the
390     ## "entity data state". In this implementation, the tokenizer
391     ## is switched to the |ENTITY_STATE|, which is an implementation
392     ## of the "consume a character reference" algorithm.
393     $self->{entity_add} = -1;
394     $self->{prev_state} = DATA_STATE;
395     $self->{state} = ENTITY_STATE;
396     !!!next-input-character;
397     redo A;
398     } else {
399     !!!cp (2);
400     #
401     }
402     } elsif ($self->{nc} == 0x002D) { # -
403     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
404 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
405 wakaba 1.1 !!!cp (3);
406     $self->{escape} = 1; # unless $self->{escape};
407     $self->{s_kwd} = '--';
408     #
409 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
410 wakaba 1.1 !!!cp (4);
411     $self->{s_kwd} = '--';
412     #
413 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
414     !!!cp (4.1);
415     $self->{s_kwd} .= '-';
416     #
417 wakaba 1.1 } else {
418     !!!cp (5);
419 wakaba 1.5 $self->{s_kwd} = '-';
420 wakaba 1.1 #
421     }
422     }
423    
424     #
425     } elsif ($self->{nc} == 0x0021) { # !
426     if (length $self->{s_kwd}) {
427     !!!cp (5.1);
428     $self->{s_kwd} .= '!';
429     #
430     } else {
431     !!!cp (5.2);
432     #$self->{s_kwd} = '';
433     #
434     }
435     #
436     } elsif ($self->{nc} == 0x003C) { # <
437     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
438     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
439     not $self->{escape})) {
440     !!!cp (6);
441     $self->{state} = TAG_OPEN_STATE;
442     !!!next-input-character;
443     redo A;
444     } else {
445     !!!cp (7);
446     $self->{s_kwd} = '';
447     #
448     }
449     } elsif ($self->{nc} == 0x003E) { # >
450     if ($self->{escape} and
451     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
452     if ($self->{s_kwd} eq '--') {
453     !!!cp (8);
454     delete $self->{escape};
455 wakaba 1.5 #
456 wakaba 1.1 } else {
457     !!!cp (9);
458 wakaba 1.5 #
459 wakaba 1.1 }
460 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
461     !!!cp (9.1);
462     !!!parse-error (type => 'unmatched mse', ## TODO: type
463     line => $self->{line_prev},
464     column => $self->{column_prev} - 1);
465     #
466 wakaba 1.1 } else {
467     !!!cp (10);
468 wakaba 1.5 #
469 wakaba 1.1 }
470    
471     $self->{s_kwd} = '';
472     #
473 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
474     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
475     !!!cp (10.1);
476     $self->{s_kwd} .= ']';
477     } elsif ($self->{s_kwd} eq ']]') {
478     !!!cp (10.2);
479     #
480     } else {
481     !!!cp (10.3);
482     $self->{s_kwd} = '';
483     }
484     #
485 wakaba 1.1 } elsif ($self->{nc} == -1) {
486     !!!cp (11);
487     $self->{s_kwd} = '';
488     !!!emit ({type => END_OF_FILE_TOKEN,
489     line => $self->{line}, column => $self->{column}});
490     last A; ## TODO: ok?
491     } else {
492     !!!cp (12);
493     $self->{s_kwd} = '';
494     #
495     }
496    
497     # Anything else
498     my $token = {type => CHARACTER_TOKEN,
499     data => chr $self->{nc},
500     line => $self->{line}, column => $self->{column},
501     };
502 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
503 wakaba 1.1 length $token->{data})) {
504     $self->{s_kwd} = '';
505     }
506    
507     ## Stay in the data state.
508 wakaba 1.5 if (not $self->{is_xml} and
509     $self->{content_model} == PCDATA_CONTENT_MODEL) {
510 wakaba 1.1 !!!cp (13);
511     $self->{state} = PCDATA_STATE;
512     } else {
513     !!!cp (14);
514     ## Stay in the state.
515     }
516     !!!next-input-character;
517     !!!emit ($token);
518     redo A;
519     } elsif ($self->{state} == TAG_OPEN_STATE) {
520 wakaba 1.10 ## XML5: "tag state".
521    
522 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
523     if ($self->{nc} == 0x002F) { # /
524     !!!cp (15);
525     !!!next-input-character;
526     $self->{state} = CLOSE_TAG_OPEN_STATE;
527     redo A;
528     } elsif ($self->{nc} == 0x0021) { # !
529     !!!cp (15.1);
530 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
531 wakaba 1.1 #
532     } else {
533     !!!cp (16);
534 wakaba 1.12 $self->{s_kwd} = '';
535 wakaba 1.1 #
536     }
537    
538     ## reconsume
539     $self->{state} = DATA_STATE;
540     !!!emit ({type => CHARACTER_TOKEN, data => '<',
541     line => $self->{line_prev},
542     column => $self->{column_prev},
543     });
544     redo A;
545     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
546     if ($self->{nc} == 0x0021) { # !
547     !!!cp (17);
548     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
549     !!!next-input-character;
550     redo A;
551     } elsif ($self->{nc} == 0x002F) { # /
552     !!!cp (18);
553     $self->{state} = CLOSE_TAG_OPEN_STATE;
554     !!!next-input-character;
555     redo A;
556     } elsif (0x0041 <= $self->{nc} and
557     $self->{nc} <= 0x005A) { # A..Z
558     !!!cp (19);
559     $self->{ct}
560     = {type => START_TAG_TOKEN,
561 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
562 wakaba 1.1 line => $self->{line_prev},
563     column => $self->{column_prev}};
564     $self->{state} = TAG_NAME_STATE;
565     !!!next-input-character;
566     redo A;
567     } elsif (0x0061 <= $self->{nc} and
568     $self->{nc} <= 0x007A) { # a..z
569     !!!cp (20);
570     $self->{ct} = {type => START_TAG_TOKEN,
571     tag_name => chr ($self->{nc}),
572     line => $self->{line_prev},
573     column => $self->{column_prev}};
574     $self->{state} = TAG_NAME_STATE;
575     !!!next-input-character;
576     redo A;
577     } elsif ($self->{nc} == 0x003E) { # >
578     !!!cp (21);
579     !!!parse-error (type => 'empty start tag',
580     line => $self->{line_prev},
581     column => $self->{column_prev});
582     $self->{state} = DATA_STATE;
583 wakaba 1.5 $self->{s_kwd} = '';
584 wakaba 1.1 !!!next-input-character;
585    
586     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
587     line => $self->{line_prev},
588     column => $self->{column_prev},
589     });
590    
591     redo A;
592     } elsif ($self->{nc} == 0x003F) { # ?
593 wakaba 1.8 if ($self->{is_xml}) {
594     !!!cp (22.1);
595     $self->{state} = PI_STATE;
596     !!!next-input-character;
597     redo A;
598     } else {
599     !!!cp (22);
600     !!!parse-error (type => 'pio',
601     line => $self->{line_prev},
602     column => $self->{column_prev});
603     $self->{state} = BOGUS_COMMENT_STATE;
604     $self->{ct} = {type => COMMENT_TOKEN, data => '',
605     line => $self->{line_prev},
606     column => $self->{column_prev},
607     };
608     ## $self->{nc} is intentionally left as is
609     redo A;
610     }
611 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
612 wakaba 1.1 !!!cp (23);
613     !!!parse-error (type => 'bare stago',
614     line => $self->{line_prev},
615     column => $self->{column_prev});
616     $self->{state} = DATA_STATE;
617 wakaba 1.5 $self->{s_kwd} = '';
618 wakaba 1.1 ## reconsume
619    
620     !!!emit ({type => CHARACTER_TOKEN, data => '<',
621     line => $self->{line_prev},
622     column => $self->{column_prev},
623     });
624    
625     redo A;
626 wakaba 1.9 } else {
627     ## XML5: "<:" is a parse error.
628     !!!cp (23.1);
629     $self->{ct} = {type => START_TAG_TOKEN,
630     tag_name => chr ($self->{nc}),
631     line => $self->{line_prev},
632     column => $self->{column_prev}};
633     $self->{state} = TAG_NAME_STATE;
634     !!!next-input-character;
635     redo A;
636 wakaba 1.1 }
637     } else {
638     die "$0: $self->{content_model} in tag open";
639     }
640     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
641     ## NOTE: The "close tag open state" in the spec is implemented as
642     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
643    
644 wakaba 1.10 ## XML5: "end tag state".
645    
646 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
647     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
648     if (defined $self->{last_stag_name}) {
649     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
650 wakaba 1.12 $self->{kwd} = '';
651 wakaba 1.1 ## Reconsume.
652     redo A;
653     } else {
654     ## No start tag token has ever been emitted
655     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
656     !!!cp (28);
657     $self->{state} = DATA_STATE;
658 wakaba 1.5 $self->{s_kwd} = '';
659 wakaba 1.1 ## Reconsume.
660     !!!emit ({type => CHARACTER_TOKEN, data => '</',
661     line => $l, column => $c,
662     });
663     redo A;
664     }
665     }
666    
667     if (0x0041 <= $self->{nc} and
668     $self->{nc} <= 0x005A) { # A..Z
669     !!!cp (29);
670     $self->{ct}
671     = {type => END_TAG_TOKEN,
672 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
673 wakaba 1.1 line => $l, column => $c};
674     $self->{state} = TAG_NAME_STATE;
675     !!!next-input-character;
676     redo A;
677     } elsif (0x0061 <= $self->{nc} and
678     $self->{nc} <= 0x007A) { # a..z
679     !!!cp (30);
680     $self->{ct} = {type => END_TAG_TOKEN,
681     tag_name => chr ($self->{nc}),
682     line => $l, column => $c};
683     $self->{state} = TAG_NAME_STATE;
684     !!!next-input-character;
685     redo A;
686     } elsif ($self->{nc} == 0x003E) { # >
687     !!!parse-error (type => 'empty end tag',
688     line => $self->{line_prev}, ## "<" in "</>"
689     column => $self->{column_prev} - 1);
690     $self->{state} = DATA_STATE;
691 wakaba 1.5 $self->{s_kwd} = '';
692 wakaba 1.10 if ($self->{is_xml}) {
693     !!!cp (31);
694     ## XML5: No parse error.
695    
696     ## NOTE: This parser raises a parse error, since it supports
697     ## XML1, not XML5.
698    
699     ## NOTE: A short end tag token.
700     my $ct = {type => END_TAG_TOKEN,
701     tag_name => '',
702     line => $self->{line_prev},
703     column => $self->{column_prev} - 1,
704     };
705     !!!next-input-character;
706     !!!emit ($ct);
707     } else {
708     !!!cp (31.1);
709     !!!next-input-character;
710     }
711 wakaba 1.1 redo A;
712     } elsif ($self->{nc} == -1) {
713     !!!cp (32);
714     !!!parse-error (type => 'bare etago');
715 wakaba 1.5 $self->{s_kwd} = '';
716 wakaba 1.1 $self->{state} = DATA_STATE;
717     # reconsume
718    
719     !!!emit ({type => CHARACTER_TOKEN, data => '</',
720     line => $l, column => $c,
721     });
722    
723     redo A;
724 wakaba 1.10 } elsif (not $self->{is_xml} or
725     $is_space->{$self->{nc}}) {
726 wakaba 1.1 !!!cp (33);
727 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
728     line => $self->{line_prev}, # "<" of "</"
729     column => $self->{column_prev} - 1);
730 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
731     $self->{ct} = {type => COMMENT_TOKEN, data => '',
732     line => $self->{line_prev}, # "<" of "</"
733     column => $self->{column_prev} - 1,
734     };
735     ## NOTE: $self->{nc} is intentionally left as is.
736     ## Although the "anything else" case of the spec not explicitly
737     ## states that the next input character is to be reconsumed,
738     ## it will be included to the |data| of the comment token
739     ## generated from the bogus end tag, as defined in the
740     ## "bogus comment state" entry.
741     redo A;
742 wakaba 1.10 } else {
743     ## XML5: "</:" is a parse error.
744     !!!cp (30.1);
745     $self->{ct} = {type => END_TAG_TOKEN,
746     tag_name => chr ($self->{nc}),
747     line => $l, column => $c};
748     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
749     !!!next-input-character;
750     redo A;
751 wakaba 1.1 }
752     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
753 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
754 wakaba 1.1 if (length $ch) {
755     my $CH = $ch;
756     $ch =~ tr/a-z/A-Z/;
757     my $nch = chr $self->{nc};
758     if ($nch eq $ch or $nch eq $CH) {
759     !!!cp (24);
760     ## Stay in the state.
761 wakaba 1.12 $self->{kwd} .= $nch;
762 wakaba 1.1 !!!next-input-character;
763     redo A;
764     } else {
765     !!!cp (25);
766     $self->{state} = DATA_STATE;
767 wakaba 1.5 $self->{s_kwd} = '';
768 wakaba 1.1 ## Reconsume.
769     !!!emit ({type => CHARACTER_TOKEN,
770 wakaba 1.12 data => '</' . $self->{kwd},
771 wakaba 1.1 line => $self->{line_prev},
772 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
773 wakaba 1.1 });
774     redo A;
775     }
776     } else { # after "<{tag-name}"
777     unless ($is_space->{$self->{nc}} or
778     {
779     0x003E => 1, # >
780     0x002F => 1, # /
781     -1 => 1, # EOF
782     }->{$self->{nc}}) {
783     !!!cp (26);
784     ## Reconsume.
785     $self->{state} = DATA_STATE;
786 wakaba 1.5 $self->{s_kwd} = '';
787 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
788 wakaba 1.12 data => '</' . $self->{kwd},
789 wakaba 1.1 line => $self->{line_prev},
790 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
791 wakaba 1.1 });
792     redo A;
793     } else {
794     !!!cp (27);
795     $self->{ct}
796     = {type => END_TAG_TOKEN,
797     tag_name => $self->{last_stag_name},
798     line => $self->{line_prev},
799 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
800 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
801     ## Reconsume.
802     redo A;
803     }
804     }
805     } elsif ($self->{state} == TAG_NAME_STATE) {
806     if ($is_space->{$self->{nc}}) {
807     !!!cp (34);
808     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
809     !!!next-input-character;
810     redo A;
811     } elsif ($self->{nc} == 0x003E) { # >
812     if ($self->{ct}->{type} == START_TAG_TOKEN) {
813     !!!cp (35);
814     $self->{last_stag_name} = $self->{ct}->{tag_name};
815     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
816     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
817     #if ($self->{ct}->{attributes}) {
818     # ## NOTE: This should never be reached.
819     # !!! cp (36);
820     # !!! parse-error (type => 'end tag attribute');
821     #} else {
822     !!!cp (37);
823     #}
824     } else {
825     die "$0: $self->{ct}->{type}: Unknown token type";
826     }
827     $self->{state} = DATA_STATE;
828 wakaba 1.5 $self->{s_kwd} = '';
829 wakaba 1.1 !!!next-input-character;
830    
831     !!!emit ($self->{ct}); # start tag or end tag
832    
833     redo A;
834     } elsif (0x0041 <= $self->{nc} and
835     $self->{nc} <= 0x005A) { # A..Z
836     !!!cp (38);
837 wakaba 1.4 $self->{ct}->{tag_name}
838     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
839 wakaba 1.1 # start tag or end tag
840     ## Stay in this state
841     !!!next-input-character;
842     redo A;
843     } elsif ($self->{nc} == -1) {
844     !!!parse-error (type => 'unclosed tag');
845     if ($self->{ct}->{type} == START_TAG_TOKEN) {
846     !!!cp (39);
847     $self->{last_stag_name} = $self->{ct}->{tag_name};
848     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
849     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
850     #if ($self->{ct}->{attributes}) {
851     # ## NOTE: This state should never be reached.
852     # !!! cp (40);
853     # !!! parse-error (type => 'end tag attribute');
854     #} else {
855     !!!cp (41);
856     #}
857     } else {
858     die "$0: $self->{ct}->{type}: Unknown token type";
859     }
860     $self->{state} = DATA_STATE;
861 wakaba 1.5 $self->{s_kwd} = '';
862 wakaba 1.1 # reconsume
863    
864     !!!emit ($self->{ct}); # start tag or end tag
865    
866     redo A;
867     } elsif ($self->{nc} == 0x002F) { # /
868     !!!cp (42);
869     $self->{state} = SELF_CLOSING_START_TAG_STATE;
870     !!!next-input-character;
871     redo A;
872     } else {
873     !!!cp (44);
874     $self->{ct}->{tag_name} .= chr $self->{nc};
875     # start tag or end tag
876     ## Stay in the state
877     !!!next-input-character;
878     redo A;
879     }
880     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
881 wakaba 1.11 ## XML5: "Tag attribute name before state".
882    
883 wakaba 1.1 if ($is_space->{$self->{nc}}) {
884     !!!cp (45);
885     ## Stay in the state
886     !!!next-input-character;
887     redo A;
888     } elsif ($self->{nc} == 0x003E) { # >
889     if ($self->{ct}->{type} == START_TAG_TOKEN) {
890     !!!cp (46);
891     $self->{last_stag_name} = $self->{ct}->{tag_name};
892     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
893     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
894     if ($self->{ct}->{attributes}) {
895     !!!cp (47);
896     !!!parse-error (type => 'end tag attribute');
897     } else {
898     !!!cp (48);
899     }
900     } else {
901     die "$0: $self->{ct}->{type}: Unknown token type";
902     }
903     $self->{state} = DATA_STATE;
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 !!!next-input-character;
906    
907     !!!emit ($self->{ct}); # start tag or end tag
908    
909     redo A;
910     } elsif (0x0041 <= $self->{nc} and
911     $self->{nc} <= 0x005A) { # A..Z
912     !!!cp (49);
913     $self->{ca}
914 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
915 wakaba 1.1 value => '',
916     line => $self->{line}, column => $self->{column}};
917     $self->{state} = ATTRIBUTE_NAME_STATE;
918     !!!next-input-character;
919     redo A;
920     } elsif ($self->{nc} == 0x002F) { # /
921     !!!cp (50);
922     $self->{state} = SELF_CLOSING_START_TAG_STATE;
923     !!!next-input-character;
924     redo A;
925     } elsif ($self->{nc} == -1) {
926     !!!parse-error (type => 'unclosed tag');
927     if ($self->{ct}->{type} == START_TAG_TOKEN) {
928     !!!cp (52);
929     $self->{last_stag_name} = $self->{ct}->{tag_name};
930     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
931     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
932     if ($self->{ct}->{attributes}) {
933     !!!cp (53);
934     !!!parse-error (type => 'end tag attribute');
935     } else {
936     !!!cp (54);
937     }
938     } else {
939     die "$0: $self->{ct}->{type}: Unknown token type";
940     }
941     $self->{state} = DATA_STATE;
942 wakaba 1.5 $self->{s_kwd} = '';
943 wakaba 1.1 # reconsume
944    
945     !!!emit ($self->{ct}); # start tag or end tag
946    
947     redo A;
948     } else {
949     if ({
950     0x0022 => 1, # "
951     0x0027 => 1, # '
952 wakaba 1.30 0x003C => 1, # <
953 wakaba 1.1 0x003D => 1, # =
954     }->{$self->{nc}}) {
955     !!!cp (55);
956 wakaba 1.11 ## XML5: Not a parse error.
957 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
958     } else {
959     !!!cp (56);
960 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
961 wakaba 1.1 }
962     $self->{ca}
963     = {name => chr ($self->{nc}),
964     value => '',
965     line => $self->{line}, column => $self->{column}};
966     $self->{state} = ATTRIBUTE_NAME_STATE;
967     !!!next-input-character;
968     redo A;
969     }
970     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
971 wakaba 1.11 ## XML5: "Tag attribute name state".
972    
973 wakaba 1.1 my $before_leave = sub {
974     if (exists $self->{ct}->{attributes} # start tag or end tag
975     ->{$self->{ca}->{name}}) { # MUST
976     !!!cp (57);
977     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
978     ## Discard $self->{ca} # MUST
979     } else {
980     !!!cp (58);
981     $self->{ct}->{attributes}->{$self->{ca}->{name}}
982     = $self->{ca};
983 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
984 wakaba 1.1 }
985     }; # $before_leave
986    
987     if ($is_space->{$self->{nc}}) {
988     !!!cp (59);
989     $before_leave->();
990     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
991     !!!next-input-character;
992     redo A;
993     } elsif ($self->{nc} == 0x003D) { # =
994     !!!cp (60);
995     $before_leave->();
996     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
997     !!!next-input-character;
998     redo A;
999     } elsif ($self->{nc} == 0x003E) { # >
1000 wakaba 1.11 if ($self->{is_xml}) {
1001     !!!cp (60.1);
1002     ## XML5: Not a parse error.
1003     !!!parse-error (type => 'no attr value'); ## TODO: type
1004     } else {
1005     !!!cp (60.2);
1006     }
1007    
1008 wakaba 1.1 $before_leave->();
1009     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1010     !!!cp (61);
1011     $self->{last_stag_name} = $self->{ct}->{tag_name};
1012     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1013     !!!cp (62);
1014     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1015     if ($self->{ct}->{attributes}) {
1016     !!!parse-error (type => 'end tag attribute');
1017     }
1018     } else {
1019     die "$0: $self->{ct}->{type}: Unknown token type";
1020     }
1021     $self->{state} = DATA_STATE;
1022 wakaba 1.5 $self->{s_kwd} = '';
1023 wakaba 1.1 !!!next-input-character;
1024    
1025     !!!emit ($self->{ct}); # start tag or end tag
1026    
1027     redo A;
1028     } elsif (0x0041 <= $self->{nc} and
1029     $self->{nc} <= 0x005A) { # A..Z
1030     !!!cp (63);
1031 wakaba 1.4 $self->{ca}->{name}
1032     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1033 wakaba 1.1 ## Stay in the state
1034     !!!next-input-character;
1035     redo A;
1036     } elsif ($self->{nc} == 0x002F) { # /
1037 wakaba 1.11 if ($self->{is_xml}) {
1038     !!!cp (64);
1039     ## XML5: Not a parse error.
1040     !!!parse-error (type => 'no attr value'); ## TODO: type
1041     } else {
1042     !!!cp (64.1);
1043     }
1044    
1045 wakaba 1.1 $before_leave->();
1046     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1047     !!!next-input-character;
1048     redo A;
1049     } elsif ($self->{nc} == -1) {
1050     !!!parse-error (type => 'unclosed tag');
1051     $before_leave->();
1052     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1053     !!!cp (66);
1054     $self->{last_stag_name} = $self->{ct}->{tag_name};
1055     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1056     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1057     if ($self->{ct}->{attributes}) {
1058     !!!cp (67);
1059     !!!parse-error (type => 'end tag attribute');
1060     } else {
1061     ## NOTE: This state should never be reached.
1062     !!!cp (68);
1063     }
1064     } else {
1065     die "$0: $self->{ct}->{type}: Unknown token type";
1066     }
1067     $self->{state} = DATA_STATE;
1068 wakaba 1.5 $self->{s_kwd} = '';
1069 wakaba 1.1 # reconsume
1070    
1071     !!!emit ($self->{ct}); # start tag or end tag
1072    
1073     redo A;
1074     } else {
1075 wakaba 1.30 if ({
1076     0x0022 => 1, # "
1077     0x0027 => 1, # '
1078     0x003C => 1, # <
1079     }->{$self->{nc}}) {
1080 wakaba 1.1 !!!cp (69);
1081 wakaba 1.11 ## XML5: Not a parse error.
1082 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1083     } else {
1084     !!!cp (70);
1085     }
1086     $self->{ca}->{name} .= chr ($self->{nc});
1087     ## Stay in the state
1088     !!!next-input-character;
1089     redo A;
1090     }
1091     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1092 wakaba 1.11 ## XML5: "Tag attribute name after state".
1093    
1094 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1095     !!!cp (71);
1096     ## Stay in the state
1097     !!!next-input-character;
1098     redo A;
1099     } elsif ($self->{nc} == 0x003D) { # =
1100     !!!cp (72);
1101     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1102     !!!next-input-character;
1103     redo A;
1104     } elsif ($self->{nc} == 0x003E) { # >
1105 wakaba 1.11 if ($self->{is_xml}) {
1106     !!!cp (72.1);
1107     ## XML5: Not a parse error.
1108     !!!parse-error (type => 'no attr value'); ## TODO: type
1109     } else {
1110     !!!cp (72.2);
1111     }
1112    
1113 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1114     !!!cp (73);
1115     $self->{last_stag_name} = $self->{ct}->{tag_name};
1116     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1117     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1118     if ($self->{ct}->{attributes}) {
1119     !!!cp (74);
1120     !!!parse-error (type => 'end tag attribute');
1121     } else {
1122     ## NOTE: This state should never be reached.
1123     !!!cp (75);
1124     }
1125     } else {
1126     die "$0: $self->{ct}->{type}: Unknown token type";
1127     }
1128     $self->{state} = DATA_STATE;
1129 wakaba 1.5 $self->{s_kwd} = '';
1130 wakaba 1.1 !!!next-input-character;
1131    
1132     !!!emit ($self->{ct}); # start tag or end tag
1133    
1134     redo A;
1135     } elsif (0x0041 <= $self->{nc} and
1136     $self->{nc} <= 0x005A) { # A..Z
1137     !!!cp (76);
1138     $self->{ca}
1139 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1140 wakaba 1.1 value => '',
1141     line => $self->{line}, column => $self->{column}};
1142     $self->{state} = ATTRIBUTE_NAME_STATE;
1143     !!!next-input-character;
1144     redo A;
1145     } elsif ($self->{nc} == 0x002F) { # /
1146 wakaba 1.11 if ($self->{is_xml}) {
1147     !!!cp (77);
1148     ## XML5: Not a parse error.
1149     !!!parse-error (type => 'no attr value'); ## TODO: type
1150     } else {
1151     !!!cp (77.1);
1152     }
1153    
1154 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1155     !!!next-input-character;
1156     redo A;
1157     } elsif ($self->{nc} == -1) {
1158     !!!parse-error (type => 'unclosed tag');
1159     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1160     !!!cp (79);
1161     $self->{last_stag_name} = $self->{ct}->{tag_name};
1162     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1163     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1164     if ($self->{ct}->{attributes}) {
1165     !!!cp (80);
1166     !!!parse-error (type => 'end tag attribute');
1167     } else {
1168     ## NOTE: This state should never be reached.
1169     !!!cp (81);
1170     }
1171     } else {
1172     die "$0: $self->{ct}->{type}: Unknown token type";
1173     }
1174 wakaba 1.5 $self->{s_kwd} = '';
1175 wakaba 1.1 $self->{state} = DATA_STATE;
1176     # reconsume
1177    
1178     !!!emit ($self->{ct}); # start tag or end tag
1179    
1180     redo A;
1181     } else {
1182 wakaba 1.11 if ($self->{is_xml}) {
1183     !!!cp (78.1);
1184     ## XML5: Not a parse error.
1185     !!!parse-error (type => 'no attr value'); ## TODO: type
1186     } else {
1187     !!!cp (78.2);
1188     }
1189    
1190 wakaba 1.30 if ({
1191     0x0022 => 1, # "
1192     0x0027 => 1, # '
1193     0x003C => 1, # <
1194     }->{$self->{nc}}) {
1195 wakaba 1.1 !!!cp (78);
1196 wakaba 1.11 ## XML5: Not a parse error.
1197 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1198     } else {
1199     !!!cp (82);
1200     }
1201     $self->{ca}
1202     = {name => chr ($self->{nc}),
1203     value => '',
1204     line => $self->{line}, column => $self->{column}};
1205     $self->{state} = ATTRIBUTE_NAME_STATE;
1206     !!!next-input-character;
1207     redo A;
1208     }
1209     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1210 wakaba 1.11 ## XML5: "Tag attribute value before state".
1211    
1212 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1213     !!!cp (83);
1214     ## Stay in the state
1215     !!!next-input-character;
1216     redo A;
1217     } elsif ($self->{nc} == 0x0022) { # "
1218     !!!cp (84);
1219     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1220     !!!next-input-character;
1221     redo A;
1222     } elsif ($self->{nc} == 0x0026) { # &
1223     !!!cp (85);
1224     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1225     ## reconsume
1226     redo A;
1227     } elsif ($self->{nc} == 0x0027) { # '
1228     !!!cp (86);
1229     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1230     !!!next-input-character;
1231     redo A;
1232     } elsif ($self->{nc} == 0x003E) { # >
1233     !!!parse-error (type => 'empty unquoted attribute value');
1234     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1235     !!!cp (87);
1236     $self->{last_stag_name} = $self->{ct}->{tag_name};
1237     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1238     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1239     if ($self->{ct}->{attributes}) {
1240     !!!cp (88);
1241     !!!parse-error (type => 'end tag attribute');
1242     } else {
1243     ## NOTE: This state should never be reached.
1244     !!!cp (89);
1245     }
1246     } else {
1247     die "$0: $self->{ct}->{type}: Unknown token type";
1248     }
1249     $self->{state} = DATA_STATE;
1250 wakaba 1.5 $self->{s_kwd} = '';
1251 wakaba 1.1 !!!next-input-character;
1252    
1253     !!!emit ($self->{ct}); # start tag or end tag
1254    
1255     redo A;
1256     } elsif ($self->{nc} == -1) {
1257     !!!parse-error (type => 'unclosed tag');
1258     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1259     !!!cp (90);
1260     $self->{last_stag_name} = $self->{ct}->{tag_name};
1261     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1262     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1263     if ($self->{ct}->{attributes}) {
1264     !!!cp (91);
1265     !!!parse-error (type => 'end tag attribute');
1266     } else {
1267     ## NOTE: This state should never be reached.
1268     !!!cp (92);
1269     }
1270     } else {
1271     die "$0: $self->{ct}->{type}: Unknown token type";
1272     }
1273     $self->{state} = DATA_STATE;
1274 wakaba 1.5 $self->{s_kwd} = '';
1275 wakaba 1.1 ## reconsume
1276    
1277     !!!emit ($self->{ct}); # start tag or end tag
1278    
1279     redo A;
1280     } else {
1281 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1282 wakaba 1.1 !!!cp (93);
1283 wakaba 1.11 ## XML5: Not a parse error.
1284 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1285 wakaba 1.11 } elsif ($self->{is_xml}) {
1286     !!!cp (93.1);
1287     ## XML5: No parse error.
1288     !!!parse-error (type => 'unquoted attr value'); ## TODO
1289 wakaba 1.1 } else {
1290     !!!cp (94);
1291     }
1292     $self->{ca}->{value} .= chr ($self->{nc});
1293     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1294     !!!next-input-character;
1295     redo A;
1296     }
1297     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1298 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1299     ## ATTLIST attribute value double quoted state".
1300 wakaba 1.11
1301 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1302 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1303     !!!cp (95.1);
1304     ## XML5: "DOCTYPE ATTLIST name after state".
1305     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1306     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1307     } else {
1308     !!!cp (95);
1309     ## XML5: "Tag attribute name before state".
1310     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1311     }
1312 wakaba 1.1 !!!next-input-character;
1313     redo A;
1314     } elsif ($self->{nc} == 0x0026) { # &
1315     !!!cp (96);
1316 wakaba 1.11 ## XML5: Not defined yet.
1317    
1318 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1319     ## "entity in attribute value state". In this implementation, the
1320     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1321     ## implementation of the "consume a character reference" algorithm.
1322     $self->{prev_state} = $self->{state};
1323     $self->{entity_add} = 0x0022; # "
1324     $self->{state} = ENTITY_STATE;
1325     !!!next-input-character;
1326     redo A;
1327 wakaba 1.25 } elsif ($self->{is_xml} and
1328     $is_space->{$self->{nc}}) {
1329     !!!cp (97.1);
1330     $self->{ca}->{value} .= ' ';
1331     ## Stay in the state.
1332     !!!next-input-character;
1333     redo A;
1334 wakaba 1.1 } elsif ($self->{nc} == -1) {
1335     !!!parse-error (type => 'unclosed attribute value');
1336     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1337     !!!cp (97);
1338     $self->{last_stag_name} = $self->{ct}->{tag_name};
1339 wakaba 1.15
1340     $self->{state} = DATA_STATE;
1341     $self->{s_kwd} = '';
1342     ## reconsume
1343     !!!emit ($self->{ct}); # start tag
1344     redo A;
1345 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1346     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1347     if ($self->{ct}->{attributes}) {
1348     !!!cp (98);
1349     !!!parse-error (type => 'end tag attribute');
1350     } else {
1351     ## NOTE: This state should never be reached.
1352     !!!cp (99);
1353     }
1354 wakaba 1.15
1355     $self->{state} = DATA_STATE;
1356     $self->{s_kwd} = '';
1357     ## reconsume
1358     !!!emit ($self->{ct}); # end tag
1359     redo A;
1360     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1361     ## XML5: No parse error above; not defined yet.
1362     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1364     ## Reconsume.
1365     !!!emit ($self->{ct}); # ATTLIST
1366     redo A;
1367 wakaba 1.1 } else {
1368     die "$0: $self->{ct}->{type}: Unknown token type";
1369     }
1370     } else {
1371 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1372 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1373     !!!cp (100);
1374     ## XML5: Not a parse error.
1375     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1376     } else {
1377     !!!cp (100.1);
1378     }
1379 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1380     $self->{read_until}->($self->{ca}->{value},
1381 wakaba 1.25 qq["&<\x09\x0C\x20],
1382 wakaba 1.1 length $self->{ca}->{value});
1383    
1384     ## Stay in the state
1385     !!!next-input-character;
1386     redo A;
1387     }
1388     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1389 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1390     ## ATTLIST attribute value single quoted state".
1391 wakaba 1.11
1392 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1393 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1394     !!!cp (101.1);
1395     ## XML5: "DOCTYPE ATTLIST name after state".
1396     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1397     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1398     } else {
1399     !!!cp (101);
1400     ## XML5: "Before attribute name state" (sic).
1401     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1402     }
1403 wakaba 1.1 !!!next-input-character;
1404     redo A;
1405     } elsif ($self->{nc} == 0x0026) { # &
1406     !!!cp (102);
1407 wakaba 1.11 ## XML5: Not defined yet.
1408    
1409 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1410     ## "entity in attribute value state". In this implementation, the
1411     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1412     ## implementation of the "consume a character reference" algorithm.
1413     $self->{entity_add} = 0x0027; # '
1414     $self->{prev_state} = $self->{state};
1415     $self->{state} = ENTITY_STATE;
1416     !!!next-input-character;
1417     redo A;
1418 wakaba 1.25 } elsif ($self->{is_xml} and
1419     $is_space->{$self->{nc}}) {
1420     !!!cp (103.1);
1421     $self->{ca}->{value} .= ' ';
1422     ## Stay in the state.
1423     !!!next-input-character;
1424     redo A;
1425 wakaba 1.1 } elsif ($self->{nc} == -1) {
1426     !!!parse-error (type => 'unclosed attribute value');
1427     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1428     !!!cp (103);
1429     $self->{last_stag_name} = $self->{ct}->{tag_name};
1430 wakaba 1.15
1431     $self->{state} = DATA_STATE;
1432     $self->{s_kwd} = '';
1433     ## reconsume
1434     !!!emit ($self->{ct}); # start tag
1435     redo A;
1436 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1437     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1438     if ($self->{ct}->{attributes}) {
1439     !!!cp (104);
1440     !!!parse-error (type => 'end tag attribute');
1441     } else {
1442     ## NOTE: This state should never be reached.
1443     !!!cp (105);
1444     }
1445 wakaba 1.15
1446     $self->{state} = DATA_STATE;
1447     $self->{s_kwd} = '';
1448     ## reconsume
1449     !!!emit ($self->{ct}); # end tag
1450     redo A;
1451     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1452     ## XML5: No parse error above; not defined yet.
1453     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1454     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1455     ## Reconsume.
1456     !!!emit ($self->{ct}); # ATTLIST
1457     redo A;
1458 wakaba 1.1 } else {
1459     die "$0: $self->{ct}->{type}: Unknown token type";
1460     }
1461     } else {
1462 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1463 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1464     !!!cp (106);
1465     ## XML5: Not a parse error.
1466     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1467     } else {
1468     !!!cp (106.1);
1469     }
1470 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1471     $self->{read_until}->($self->{ca}->{value},
1472 wakaba 1.25 qq['&<\x09\x0C\x20],
1473 wakaba 1.1 length $self->{ca}->{value});
1474    
1475     ## Stay in the state
1476     !!!next-input-character;
1477     redo A;
1478     }
1479     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1480 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1481    
1482 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1483 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1484     !!!cp (107.1);
1485     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1486     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1487     } else {
1488     !!!cp (107);
1489     ## XML5: "Tag attribute name before state".
1490     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1491     }
1492 wakaba 1.1 !!!next-input-character;
1493     redo A;
1494     } elsif ($self->{nc} == 0x0026) { # &
1495     !!!cp (108);
1496 wakaba 1.11
1497     ## XML5: Not defined yet.
1498    
1499 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1500     ## "entity in attribute value state". In this implementation, the
1501     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1502     ## implementation of the "consume a character reference" algorithm.
1503     $self->{entity_add} = -1;
1504     $self->{prev_state} = $self->{state};
1505     $self->{state} = ENTITY_STATE;
1506     !!!next-input-character;
1507     redo A;
1508     } elsif ($self->{nc} == 0x003E) { # >
1509     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1510     !!!cp (109);
1511     $self->{last_stag_name} = $self->{ct}->{tag_name};
1512 wakaba 1.15
1513     $self->{state} = DATA_STATE;
1514     $self->{s_kwd} = '';
1515     !!!next-input-character;
1516     !!!emit ($self->{ct}); # start tag
1517     redo A;
1518 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1519     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1520     if ($self->{ct}->{attributes}) {
1521     !!!cp (110);
1522     !!!parse-error (type => 'end tag attribute');
1523     } else {
1524     ## NOTE: This state should never be reached.
1525     !!!cp (111);
1526     }
1527 wakaba 1.15
1528     $self->{state} = DATA_STATE;
1529     $self->{s_kwd} = '';
1530     !!!next-input-character;
1531     !!!emit ($self->{ct}); # end tag
1532     redo A;
1533     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1534     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1535     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1536     !!!next-input-character;
1537     !!!emit ($self->{ct}); # ATTLIST
1538     redo A;
1539 wakaba 1.1 } else {
1540     die "$0: $self->{ct}->{type}: Unknown token type";
1541     }
1542     } elsif ($self->{nc} == -1) {
1543     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1544     !!!cp (112);
1545 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1546 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1547 wakaba 1.15
1548     $self->{state} = DATA_STATE;
1549     $self->{s_kwd} = '';
1550     ## reconsume
1551     !!!emit ($self->{ct}); # start tag
1552     redo A;
1553 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1554 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1555 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1556     if ($self->{ct}->{attributes}) {
1557     !!!cp (113);
1558     !!!parse-error (type => 'end tag attribute');
1559     } else {
1560     ## NOTE: This state should never be reached.
1561     !!!cp (114);
1562     }
1563 wakaba 1.15
1564     $self->{state} = DATA_STATE;
1565     $self->{s_kwd} = '';
1566     ## reconsume
1567     !!!emit ($self->{ct}); # end tag
1568     redo A;
1569     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1570     !!!parse-error (type => 'unclosed md'); ## TODO: type
1571     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1572     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1573     ## Reconsume.
1574     !!!emit ($self->{ct}); # ATTLIST
1575     redo A;
1576 wakaba 1.1 } else {
1577     die "$0: $self->{ct}->{type}: Unknown token type";
1578     }
1579     } else {
1580     if ({
1581     0x0022 => 1, # "
1582     0x0027 => 1, # '
1583     0x003D => 1, # =
1584 wakaba 1.26 0x003C => 1, # <
1585 wakaba 1.1 }->{$self->{nc}}) {
1586     !!!cp (115);
1587 wakaba 1.11 ## XML5: Not a parse error.
1588 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1589     } else {
1590     !!!cp (116);
1591     }
1592     $self->{ca}->{value} .= chr ($self->{nc});
1593     $self->{read_until}->($self->{ca}->{value},
1594 wakaba 1.25 qq["'=& \x09\x0C>],
1595 wakaba 1.1 length $self->{ca}->{value});
1596    
1597     ## Stay in the state
1598     !!!next-input-character;
1599     redo A;
1600     }
1601     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1602     if ($is_space->{$self->{nc}}) {
1603     !!!cp (118);
1604     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1605     !!!next-input-character;
1606     redo A;
1607     } elsif ($self->{nc} == 0x003E) { # >
1608     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1609     !!!cp (119);
1610     $self->{last_stag_name} = $self->{ct}->{tag_name};
1611     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1612     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1613     if ($self->{ct}->{attributes}) {
1614     !!!cp (120);
1615     !!!parse-error (type => 'end tag attribute');
1616     } else {
1617     ## NOTE: This state should never be reached.
1618     !!!cp (121);
1619     }
1620     } else {
1621     die "$0: $self->{ct}->{type}: Unknown token type";
1622     }
1623     $self->{state} = DATA_STATE;
1624 wakaba 1.5 $self->{s_kwd} = '';
1625 wakaba 1.1 !!!next-input-character;
1626    
1627     !!!emit ($self->{ct}); # start tag or end tag
1628    
1629     redo A;
1630     } elsif ($self->{nc} == 0x002F) { # /
1631     !!!cp (122);
1632     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1633     !!!next-input-character;
1634     redo A;
1635     } elsif ($self->{nc} == -1) {
1636     !!!parse-error (type => 'unclosed tag');
1637     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1638     !!!cp (122.3);
1639     $self->{last_stag_name} = $self->{ct}->{tag_name};
1640     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1641     if ($self->{ct}->{attributes}) {
1642     !!!cp (122.1);
1643     !!!parse-error (type => 'end tag attribute');
1644     } else {
1645     ## NOTE: This state should never be reached.
1646     !!!cp (122.2);
1647     }
1648     } else {
1649     die "$0: $self->{ct}->{type}: Unknown token type";
1650     }
1651     $self->{state} = DATA_STATE;
1652 wakaba 1.5 $self->{s_kwd} = '';
1653 wakaba 1.1 ## Reconsume.
1654     !!!emit ($self->{ct}); # start tag or end tag
1655     redo A;
1656     } else {
1657     !!!cp ('124.1');
1658     !!!parse-error (type => 'no space between attributes');
1659     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1660     ## reconsume
1661     redo A;
1662     }
1663     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1664 wakaba 1.11 ## XML5: "Empty tag state".
1665    
1666 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1667     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1668     !!!cp ('124.2');
1669     !!!parse-error (type => 'nestc', token => $self->{ct});
1670     ## TODO: Different type than slash in start tag
1671     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1672     if ($self->{ct}->{attributes}) {
1673     !!!cp ('124.4');
1674     !!!parse-error (type => 'end tag attribute');
1675     } else {
1676     !!!cp ('124.5');
1677     }
1678     ## TODO: Test |<title></title/>|
1679     } else {
1680     !!!cp ('124.3');
1681     $self->{self_closing} = 1;
1682     }
1683    
1684     $self->{state} = DATA_STATE;
1685 wakaba 1.5 $self->{s_kwd} = '';
1686 wakaba 1.1 !!!next-input-character;
1687    
1688     !!!emit ($self->{ct}); # start tag or end tag
1689    
1690     redo A;
1691     } elsif ($self->{nc} == -1) {
1692     !!!parse-error (type => 'unclosed tag');
1693     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1694     !!!cp (124.7);
1695     $self->{last_stag_name} = $self->{ct}->{tag_name};
1696     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1697     if ($self->{ct}->{attributes}) {
1698     !!!cp (124.5);
1699     !!!parse-error (type => 'end tag attribute');
1700     } else {
1701     ## NOTE: This state should never be reached.
1702     !!!cp (124.6);
1703     }
1704     } else {
1705     die "$0: $self->{ct}->{type}: Unknown token type";
1706     }
1707 wakaba 1.11 ## XML5: "Tag attribute name before state".
1708 wakaba 1.1 $self->{state} = DATA_STATE;
1709 wakaba 1.5 $self->{s_kwd} = '';
1710 wakaba 1.1 ## Reconsume.
1711     !!!emit ($self->{ct}); # start tag or end tag
1712     redo A;
1713     } else {
1714     !!!cp ('124.4');
1715     !!!parse-error (type => 'nestc');
1716     ## TODO: This error type is wrong.
1717     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1718     ## Reconsume.
1719     redo A;
1720     }
1721     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1722 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1723    
1724 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1725     ## consumes characters on a one-by-one basis.
1726    
1727     if ($self->{nc} == 0x003E) { # >
1728 wakaba 1.13 if ($self->{in_subset}) {
1729     !!!cp (123);
1730     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1731     } else {
1732     !!!cp (124);
1733     $self->{state} = DATA_STATE;
1734     $self->{s_kwd} = '';
1735     }
1736 wakaba 1.1 !!!next-input-character;
1737    
1738     !!!emit ($self->{ct}); # comment
1739     redo A;
1740     } elsif ($self->{nc} == -1) {
1741 wakaba 1.13 if ($self->{in_subset}) {
1742     !!!cp (125.1);
1743     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1744     } else {
1745     !!!cp (125);
1746     $self->{state} = DATA_STATE;
1747     $self->{s_kwd} = '';
1748     }
1749 wakaba 1.1 ## reconsume
1750    
1751     !!!emit ($self->{ct}); # comment
1752     redo A;
1753     } else {
1754     !!!cp (126);
1755     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1756     $self->{read_until}->($self->{ct}->{data},
1757     q[>],
1758     length $self->{ct}->{data});
1759    
1760     ## Stay in the state.
1761     !!!next-input-character;
1762     redo A;
1763     }
1764     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1765 wakaba 1.14 ## XML5: "Markup declaration state".
1766 wakaba 1.1
1767     if ($self->{nc} == 0x002D) { # -
1768     !!!cp (133);
1769     $self->{state} = MD_HYPHEN_STATE;
1770     !!!next-input-character;
1771     redo A;
1772     } elsif ($self->{nc} == 0x0044 or # D
1773     $self->{nc} == 0x0064) { # d
1774     ## ASCII case-insensitive.
1775     !!!cp (130);
1776     $self->{state} = MD_DOCTYPE_STATE;
1777 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1778 wakaba 1.1 !!!next-input-character;
1779     redo A;
1780 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1781     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1782     $self->{is_xml}) and
1783 wakaba 1.1 $self->{nc} == 0x005B) { # [
1784     !!!cp (135.4);
1785     $self->{state} = MD_CDATA_STATE;
1786 wakaba 1.12 $self->{kwd} = '[';
1787 wakaba 1.1 !!!next-input-character;
1788     redo A;
1789     } else {
1790     !!!cp (136);
1791     }
1792    
1793     !!!parse-error (type => 'bogus comment',
1794     line => $self->{line_prev},
1795     column => $self->{column_prev} - 1);
1796     ## Reconsume.
1797     $self->{state} = BOGUS_COMMENT_STATE;
1798     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799     line => $self->{line_prev},
1800     column => $self->{column_prev} - 1,
1801     };
1802     redo A;
1803     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1804     if ($self->{nc} == 0x002D) { # -
1805     !!!cp (127);
1806     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1807     line => $self->{line_prev},
1808     column => $self->{column_prev} - 2,
1809     };
1810 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1811 wakaba 1.1 !!!next-input-character;
1812     redo A;
1813     } else {
1814     !!!cp (128);
1815     !!!parse-error (type => 'bogus comment',
1816     line => $self->{line_prev},
1817     column => $self->{column_prev} - 2);
1818     $self->{state} = BOGUS_COMMENT_STATE;
1819     ## Reconsume.
1820     $self->{ct} = {type => COMMENT_TOKEN,
1821     data => '-',
1822     line => $self->{line_prev},
1823     column => $self->{column_prev} - 2,
1824     };
1825     redo A;
1826     }
1827     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1828     ## ASCII case-insensitive.
1829     if ($self->{nc} == [
1830     undef,
1831     0x004F, # O
1832     0x0043, # C
1833     0x0054, # T
1834     0x0059, # Y
1835     0x0050, # P
1836 wakaba 1.12 ]->[length $self->{kwd}] or
1837 wakaba 1.1 $self->{nc} == [
1838     undef,
1839     0x006F, # o
1840     0x0063, # c
1841     0x0074, # t
1842     0x0079, # y
1843     0x0070, # p
1844 wakaba 1.12 ]->[length $self->{kwd}]) {
1845 wakaba 1.1 !!!cp (131);
1846     ## Stay in the state.
1847 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1848 wakaba 1.1 !!!next-input-character;
1849     redo A;
1850 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1851 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1852     $self->{nc} == 0x0065)) { # e
1853 wakaba 1.12 if ($self->{is_xml} and
1854     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1855 wakaba 1.10 !!!cp (129);
1856     ## XML5: case-sensitive.
1857     !!!parse-error (type => 'lowercase keyword', ## TODO
1858     text => 'DOCTYPE',
1859     line => $self->{line_prev},
1860     column => $self->{column_prev} - 5);
1861     } else {
1862     !!!cp (129.1);
1863     }
1864 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1865     $self->{ct} = {type => DOCTYPE_TOKEN,
1866     quirks => 1,
1867     line => $self->{line_prev},
1868     column => $self->{column_prev} - 7,
1869     };
1870     !!!next-input-character;
1871     redo A;
1872     } else {
1873     !!!cp (132);
1874     !!!parse-error (type => 'bogus comment',
1875     line => $self->{line_prev},
1876 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1877 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1878     ## Reconsume.
1879     $self->{ct} = {type => COMMENT_TOKEN,
1880 wakaba 1.12 data => $self->{kwd},
1881 wakaba 1.1 line => $self->{line_prev},
1882 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1883 wakaba 1.1 };
1884     redo A;
1885     }
1886     } elsif ($self->{state} == MD_CDATA_STATE) {
1887     if ($self->{nc} == {
1888     '[' => 0x0043, # C
1889     '[C' => 0x0044, # D
1890     '[CD' => 0x0041, # A
1891     '[CDA' => 0x0054, # T
1892     '[CDAT' => 0x0041, # A
1893 wakaba 1.12 }->{$self->{kwd}}) {
1894 wakaba 1.1 !!!cp (135.1);
1895     ## Stay in the state.
1896 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1897 wakaba 1.1 !!!next-input-character;
1898     redo A;
1899 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1900 wakaba 1.1 $self->{nc} == 0x005B) { # [
1901 wakaba 1.6 if ($self->{is_xml} and
1902     not $self->{tainted} and
1903     @{$self->{open_elements} or []} == 0) {
1904 wakaba 1.8 !!!cp (135.2);
1905 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1906     line => $self->{line_prev},
1907     column => $self->{column_prev} - 7);
1908     $self->{tainted} = 1;
1909 wakaba 1.8 } else {
1910     !!!cp (135.21);
1911 wakaba 1.6 }
1912    
1913 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1914     data => '',
1915     line => $self->{line_prev},
1916     column => $self->{column_prev} - 7};
1917     $self->{state} = CDATA_SECTION_STATE;
1918     !!!next-input-character;
1919     redo A;
1920     } else {
1921     !!!cp (135.3);
1922     !!!parse-error (type => 'bogus comment',
1923     line => $self->{line_prev},
1924 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1925 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1926     ## Reconsume.
1927     $self->{ct} = {type => COMMENT_TOKEN,
1928 wakaba 1.12 data => $self->{kwd},
1929 wakaba 1.1 line => $self->{line_prev},
1930 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1931 wakaba 1.1 };
1932     redo A;
1933     }
1934     } elsif ($self->{state} == COMMENT_START_STATE) {
1935     if ($self->{nc} == 0x002D) { # -
1936     !!!cp (137);
1937     $self->{state} = COMMENT_START_DASH_STATE;
1938     !!!next-input-character;
1939     redo A;
1940     } elsif ($self->{nc} == 0x003E) { # >
1941     !!!parse-error (type => 'bogus comment');
1942 wakaba 1.13 if ($self->{in_subset}) {
1943     !!!cp (138.1);
1944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1945     } else {
1946     !!!cp (138);
1947     $self->{state} = DATA_STATE;
1948     $self->{s_kwd} = '';
1949     }
1950 wakaba 1.1 !!!next-input-character;
1951    
1952     !!!emit ($self->{ct}); # comment
1953    
1954     redo A;
1955     } elsif ($self->{nc} == -1) {
1956     !!!parse-error (type => 'unclosed comment');
1957 wakaba 1.13 if ($self->{in_subset}) {
1958     !!!cp (139.1);
1959     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1960     } else {
1961     !!!cp (139);
1962     $self->{state} = DATA_STATE;
1963     $self->{s_kwd} = '';
1964     }
1965 wakaba 1.1 ## reconsume
1966    
1967     !!!emit ($self->{ct}); # comment
1968    
1969     redo A;
1970     } else {
1971     !!!cp (140);
1972     $self->{ct}->{data} # comment
1973     .= chr ($self->{nc});
1974     $self->{state} = COMMENT_STATE;
1975     !!!next-input-character;
1976     redo A;
1977     }
1978     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1979     if ($self->{nc} == 0x002D) { # -
1980     !!!cp (141);
1981     $self->{state} = COMMENT_END_STATE;
1982     !!!next-input-character;
1983     redo A;
1984     } elsif ($self->{nc} == 0x003E) { # >
1985     !!!parse-error (type => 'bogus comment');
1986 wakaba 1.13 if ($self->{in_subset}) {
1987     !!!cp (142.1);
1988     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1989     } else {
1990     !!!cp (142);
1991     $self->{state} = DATA_STATE;
1992     $self->{s_kwd} = '';
1993     }
1994 wakaba 1.1 !!!next-input-character;
1995    
1996     !!!emit ($self->{ct}); # comment
1997    
1998     redo A;
1999     } elsif ($self->{nc} == -1) {
2000     !!!parse-error (type => 'unclosed comment');
2001 wakaba 1.13 if ($self->{in_subset}) {
2002     !!!cp (143.1);
2003     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2004     } else {
2005     !!!cp (143);
2006     $self->{state} = DATA_STATE;
2007     $self->{s_kwd} = '';
2008     }
2009 wakaba 1.1 ## reconsume
2010    
2011     !!!emit ($self->{ct}); # comment
2012    
2013     redo A;
2014     } else {
2015     !!!cp (144);
2016     $self->{ct}->{data} # comment
2017     .= '-' . chr ($self->{nc});
2018     $self->{state} = COMMENT_STATE;
2019     !!!next-input-character;
2020     redo A;
2021     }
2022     } elsif ($self->{state} == COMMENT_STATE) {
2023 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2024    
2025 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2026     !!!cp (145);
2027     $self->{state} = COMMENT_END_DASH_STATE;
2028     !!!next-input-character;
2029     redo A;
2030     } elsif ($self->{nc} == -1) {
2031     !!!parse-error (type => 'unclosed comment');
2032 wakaba 1.13 if ($self->{in_subset}) {
2033     !!!cp (146.1);
2034     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2035     } else {
2036     !!!cp (146);
2037     $self->{state} = DATA_STATE;
2038     $self->{s_kwd} = '';
2039     }
2040 wakaba 1.1 ## reconsume
2041    
2042     !!!emit ($self->{ct}); # comment
2043    
2044     redo A;
2045     } else {
2046     !!!cp (147);
2047     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2048     $self->{read_until}->($self->{ct}->{data},
2049     q[-],
2050     length $self->{ct}->{data});
2051    
2052     ## Stay in the state
2053     !!!next-input-character;
2054     redo A;
2055     }
2056     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2057 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2058 wakaba 1.10
2059 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2060     !!!cp (148);
2061     $self->{state} = COMMENT_END_STATE;
2062     !!!next-input-character;
2063     redo A;
2064     } elsif ($self->{nc} == -1) {
2065     !!!parse-error (type => 'unclosed comment');
2066 wakaba 1.13 if ($self->{in_subset}) {
2067     !!!cp (149.1);
2068     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2069     } else {
2070     !!!cp (149);
2071     $self->{state} = DATA_STATE;
2072     $self->{s_kwd} = '';
2073     }
2074 wakaba 1.1 ## reconsume
2075    
2076     !!!emit ($self->{ct}); # comment
2077    
2078     redo A;
2079     } else {
2080     !!!cp (150);
2081     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2082     $self->{state} = COMMENT_STATE;
2083     !!!next-input-character;
2084     redo A;
2085     }
2086 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2087     $self->{state} == COMMENT_END_BANG_STATE) {
2088 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2089 wakaba 1.31 ## (No comment end bang state.)
2090 wakaba 1.14
2091 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2092 wakaba 1.13 if ($self->{in_subset}) {
2093     !!!cp (151.1);
2094     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2095     } else {
2096     !!!cp (151);
2097     $self->{state} = DATA_STATE;
2098     $self->{s_kwd} = '';
2099     }
2100 wakaba 1.1 !!!next-input-character;
2101    
2102     !!!emit ($self->{ct}); # comment
2103    
2104     redo A;
2105     } elsif ($self->{nc} == 0x002D) { # -
2106 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2107     !!!cp (154.3);
2108     $self->{ct}->{data} .= '--!'; # comment
2109     $self->{state} = COMMENT_END_DASH_STATE;
2110     } else {
2111     !!!cp (152);
2112     ## XML5: Not a parse error.
2113     !!!parse-error (type => 'dash in comment',
2114     line => $self->{line_prev},
2115     column => $self->{column_prev});
2116     $self->{ct}->{data} .= '-'; # comment
2117     ## Stay in the state
2118     }
2119     !!!next-input-character;
2120     redo A;
2121     } elsif ($self->{nc} == 0x0021 and # !
2122     $self->{state} != COMMENT_END_BANG_STATE) {
2123     !!!parse-error (type => 'comment end bang'); # XXX error type
2124     $self->{state} = COMMENT_END_BANG_STATE;
2125 wakaba 1.1 !!!next-input-character;
2126     redo A;
2127     } elsif ($self->{nc} == -1) {
2128     !!!parse-error (type => 'unclosed comment');
2129 wakaba 1.13 if ($self->{in_subset}) {
2130     !!!cp (153.1);
2131     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2132     } else {
2133     !!!cp (153);
2134     $self->{state} = DATA_STATE;
2135     $self->{s_kwd} = '';
2136     }
2137 wakaba 1.31 ## Reconsume.
2138 wakaba 1.1
2139     !!!emit ($self->{ct}); # comment
2140    
2141     redo A;
2142     } else {
2143     !!!cp (154);
2144 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2145     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2146     } else {
2147     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2148     }
2149 wakaba 1.1 $self->{state} = COMMENT_STATE;
2150     !!!next-input-character;
2151     redo A;
2152     }
2153     } elsif ($self->{state} == DOCTYPE_STATE) {
2154     if ($is_space->{$self->{nc}}) {
2155     !!!cp (155);
2156     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2157     !!!next-input-character;
2158     redo A;
2159 wakaba 1.28 } elsif ($self->{nc} == -1) {
2160     !!!cp (155.1);
2161     !!!parse-error (type => 'unclosed DOCTYPE');
2162     $self->{ct}->{quirks} = 1;
2163    
2164     $self->{state} = DATA_STATE;
2165     ## Reconsume.
2166     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2167    
2168     redo A;
2169 wakaba 1.1 } else {
2170     !!!cp (156);
2171 wakaba 1.28 ## XML5: Switch to the bogus comment state.
2172 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2173     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2174     ## reconsume
2175     redo A;
2176     }
2177     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2178 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2179    
2180 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2181     !!!cp (157);
2182     ## Stay in the state
2183     !!!next-input-character;
2184     redo A;
2185     } elsif ($self->{nc} == 0x003E) { # >
2186     !!!cp (158);
2187 wakaba 1.12 ## XML5: No parse error.
2188 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2189     $self->{state} = DATA_STATE;
2190 wakaba 1.5 $self->{s_kwd} = '';
2191 wakaba 1.1 !!!next-input-character;
2192    
2193     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2194    
2195     redo A;
2196 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2197     !!!cp (158.1);
2198     $self->{ct}->{name} # DOCTYPE
2199     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2200     delete $self->{ct}->{quirks};
2201     $self->{state} = DOCTYPE_NAME_STATE;
2202     !!!next-input-character;
2203     redo A;
2204 wakaba 1.1 } elsif ($self->{nc} == -1) {
2205     !!!cp (159);
2206     !!!parse-error (type => 'no DOCTYPE name');
2207     $self->{state} = DATA_STATE;
2208 wakaba 1.5 $self->{s_kwd} = '';
2209 wakaba 1.1 ## reconsume
2210    
2211     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2212    
2213     redo A;
2214 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2215     !!!cp (159.1);
2216     !!!parse-error (type => 'no DOCTYPE name');
2217     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2218 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2219     $self->{in_subset} = 1;
2220 wakaba 1.12 !!!next-input-character;
2221 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2222 wakaba 1.12 redo A;
2223 wakaba 1.1 } else {
2224     !!!cp (160);
2225     $self->{ct}->{name} = chr $self->{nc};
2226     delete $self->{ct}->{quirks};
2227     $self->{state} = DOCTYPE_NAME_STATE;
2228     !!!next-input-character;
2229     redo A;
2230     }
2231     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2232 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2233    
2234     ## ISSUE: Redundant "First," in the spec.
2235    
2236 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2237     !!!cp (161);
2238     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2239     !!!next-input-character;
2240     redo A;
2241     } elsif ($self->{nc} == 0x003E) { # >
2242     !!!cp (162);
2243     $self->{state} = DATA_STATE;
2244 wakaba 1.5 $self->{s_kwd} = '';
2245 wakaba 1.1 !!!next-input-character;
2246    
2247     !!!emit ($self->{ct}); # DOCTYPE
2248    
2249     redo A;
2250 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2251     !!!cp (162.1);
2252     $self->{ct}->{name} # DOCTYPE
2253     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2254     delete $self->{ct}->{quirks};
2255     ## Stay in the state.
2256     !!!next-input-character;
2257     redo A;
2258 wakaba 1.1 } elsif ($self->{nc} == -1) {
2259     !!!cp (163);
2260     !!!parse-error (type => 'unclosed DOCTYPE');
2261     $self->{state} = DATA_STATE;
2262 wakaba 1.5 $self->{s_kwd} = '';
2263 wakaba 1.1 ## reconsume
2264    
2265     $self->{ct}->{quirks} = 1;
2266     !!!emit ($self->{ct}); # DOCTYPE
2267    
2268     redo A;
2269 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2270     !!!cp (163.1);
2271     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2272 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2273     $self->{in_subset} = 1;
2274 wakaba 1.12 !!!next-input-character;
2275 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2276 wakaba 1.12 redo A;
2277 wakaba 1.1 } else {
2278     !!!cp (164);
2279 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2280     ## Stay in the state.
2281 wakaba 1.1 !!!next-input-character;
2282     redo A;
2283     }
2284     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2285 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2286     ## state", but implemented differently.
2287    
2288 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2289     !!!cp (165);
2290     ## Stay in the state
2291     !!!next-input-character;
2292     redo A;
2293     } elsif ($self->{nc} == 0x003E) { # >
2294 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2295     !!!cp (166);
2296     $self->{state} = DATA_STATE;
2297     $self->{s_kwd} = '';
2298     } else {
2299     !!!cp (166.1);
2300     !!!parse-error (type => 'no md def'); ## TODO: type
2301     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2302     }
2303    
2304 wakaba 1.1 !!!next-input-character;
2305 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2306 wakaba 1.1 redo A;
2307     } elsif ($self->{nc} == -1) {
2308 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2309     !!!cp (167);
2310     !!!parse-error (type => 'unclosed DOCTYPE');
2311     $self->{state} = DATA_STATE;
2312     $self->{s_kwd} = '';
2313     $self->{ct}->{quirks} = 1;
2314     } else {
2315     !!!cp (167.12);
2316     !!!parse-error (type => 'unclosed md'); ## TODO: type
2317     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2318     }
2319    
2320     ## Reconsume.
2321     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2322 wakaba 1.1 redo A;
2323     } elsif ($self->{nc} == 0x0050 or # P
2324     $self->{nc} == 0x0070) { # p
2325 wakaba 1.12 !!!cp (167.1);
2326 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2327 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2328 wakaba 1.1 !!!next-input-character;
2329     redo A;
2330     } elsif ($self->{nc} == 0x0053 or # S
2331     $self->{nc} == 0x0073) { # s
2332 wakaba 1.12 !!!cp (167.2);
2333 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2334 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2335     !!!next-input-character;
2336     redo A;
2337 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2338     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2339     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2340     !!!cp (167.21);
2341     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2342     $self->{ct}->{value} = ''; # ENTITY
2343     !!!next-input-character;
2344     redo A;
2345     } elsif ($self->{nc} == 0x0027 and # '
2346     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2347     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2348     !!!cp (167.22);
2349     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2350     $self->{ct}->{value} = ''; # ENTITY
2351     !!!next-input-character;
2352     redo A;
2353 wakaba 1.16 } elsif ($self->{is_xml} and
2354     $self->{ct}->{type} == DOCTYPE_TOKEN and
2355     $self->{nc} == 0x005B) { # [
2356 wakaba 1.12 !!!cp (167.3);
2357     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2358     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2359 wakaba 1.13 $self->{in_subset} = 1;
2360 wakaba 1.1 !!!next-input-character;
2361 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2362 wakaba 1.1 redo A;
2363     } else {
2364 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2365    
2366     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2367     !!!cp (180);
2368     $self->{ct}->{quirks} = 1;
2369     $self->{state} = BOGUS_DOCTYPE_STATE;
2370     } else {
2371     !!!cp (180.1);
2372     $self->{state} = BOGUS_MD_STATE;
2373     }
2374 wakaba 1.1
2375     !!!next-input-character;
2376     redo A;
2377     }
2378     } elsif ($self->{state} == PUBLIC_STATE) {
2379     ## ASCII case-insensitive
2380     if ($self->{nc} == [
2381     undef,
2382     0x0055, # U
2383     0x0042, # B
2384     0x004C, # L
2385     0x0049, # I
2386 wakaba 1.12 ]->[length $self->{kwd}] or
2387 wakaba 1.1 $self->{nc} == [
2388     undef,
2389     0x0075, # u
2390     0x0062, # b
2391     0x006C, # l
2392     0x0069, # i
2393 wakaba 1.12 ]->[length $self->{kwd}]) {
2394 wakaba 1.1 !!!cp (175);
2395     ## Stay in the state.
2396 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2397 wakaba 1.1 !!!next-input-character;
2398     redo A;
2399 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2400 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2401     $self->{nc} == 0x0063)) { # c
2402 wakaba 1.12 if ($self->{is_xml} and
2403     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2404     !!!cp (168.1);
2405     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2406     text => 'PUBLIC',
2407     line => $self->{line_prev},
2408     column => $self->{column_prev} - 4);
2409     } else {
2410     !!!cp (168);
2411     }
2412 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2413     !!!next-input-character;
2414     redo A;
2415     } else {
2416 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2417 wakaba 1.1 line => $self->{line_prev},
2418 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2419 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2420     !!!cp (169);
2421     $self->{ct}->{quirks} = 1;
2422     $self->{state} = BOGUS_DOCTYPE_STATE;
2423     } else {
2424     !!!cp (169.1);
2425     $self->{state} = BOGUS_MD_STATE;
2426     }
2427 wakaba 1.1 ## Reconsume.
2428     redo A;
2429     }
2430     } elsif ($self->{state} == SYSTEM_STATE) {
2431     ## ASCII case-insensitive
2432     if ($self->{nc} == [
2433     undef,
2434     0x0059, # Y
2435     0x0053, # S
2436     0x0054, # T
2437     0x0045, # E
2438 wakaba 1.12 ]->[length $self->{kwd}] or
2439 wakaba 1.1 $self->{nc} == [
2440     undef,
2441     0x0079, # y
2442     0x0073, # s
2443     0x0074, # t
2444     0x0065, # e
2445 wakaba 1.12 ]->[length $self->{kwd}]) {
2446 wakaba 1.1 !!!cp (170);
2447     ## Stay in the state.
2448 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2449 wakaba 1.1 !!!next-input-character;
2450     redo A;
2451 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2452 wakaba 1.1 ($self->{nc} == 0x004D or # M
2453     $self->{nc} == 0x006D)) { # m
2454 wakaba 1.12 if ($self->{is_xml} and
2455     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2456     !!!cp (171.1);
2457     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2458     text => 'SYSTEM',
2459     line => $self->{line_prev},
2460     column => $self->{column_prev} - 4);
2461     } else {
2462     !!!cp (171);
2463     }
2464 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2465     !!!next-input-character;
2466     redo A;
2467     } else {
2468 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2469 wakaba 1.1 line => $self->{line_prev},
2470 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2471 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2472     !!!cp (172);
2473     $self->{ct}->{quirks} = 1;
2474     $self->{state} = BOGUS_DOCTYPE_STATE;
2475     } else {
2476     !!!cp (172.1);
2477     $self->{state} = BOGUS_MD_STATE;
2478     }
2479 wakaba 1.1 ## Reconsume.
2480     redo A;
2481     }
2482     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2483     if ($is_space->{$self->{nc}}) {
2484     !!!cp (181);
2485     ## Stay in the state
2486     !!!next-input-character;
2487     redo A;
2488     } elsif ($self->{nc} eq 0x0022) { # "
2489     !!!cp (182);
2490     $self->{ct}->{pubid} = ''; # DOCTYPE
2491     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2492     !!!next-input-character;
2493     redo A;
2494     } elsif ($self->{nc} eq 0x0027) { # '
2495     !!!cp (183);
2496     $self->{ct}->{pubid} = ''; # DOCTYPE
2497     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2498     !!!next-input-character;
2499     redo A;
2500     } elsif ($self->{nc} eq 0x003E) { # >
2501     !!!parse-error (type => 'no PUBLIC literal');
2502 wakaba 1.16
2503     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2504     !!!cp (184);
2505     $self->{state} = DATA_STATE;
2506     $self->{s_kwd} = '';
2507     $self->{ct}->{quirks} = 1;
2508     } else {
2509     !!!cp (184.1);
2510     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2511     }
2512    
2513 wakaba 1.1 !!!next-input-character;
2514 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2515 wakaba 1.1 redo A;
2516     } elsif ($self->{nc} == -1) {
2517 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2518     !!!cp (185);
2519     !!!parse-error (type => 'unclosed DOCTYPE');
2520     $self->{state} = DATA_STATE;
2521     $self->{s_kwd} = '';
2522     $self->{ct}->{quirks} = 1;
2523     } else {
2524     !!!cp (185.1);
2525     !!!parse-error (type => 'unclosed md'); ## TODO: type
2526     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2527     }
2528    
2529 wakaba 1.1 ## reconsume
2530     !!!emit ($self->{ct}); # DOCTYPE
2531     redo A;
2532 wakaba 1.16 } elsif ($self->{is_xml} and
2533     $self->{ct}->{type} == DOCTYPE_TOKEN and
2534     $self->{nc} == 0x005B) { # [
2535 wakaba 1.12 !!!cp (186.1);
2536     !!!parse-error (type => 'no PUBLIC literal');
2537     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2538     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2539 wakaba 1.13 $self->{in_subset} = 1;
2540 wakaba 1.12 !!!next-input-character;
2541 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2542 wakaba 1.12 redo A;
2543 wakaba 1.1 } else {
2544     !!!parse-error (type => 'string after PUBLIC');
2545    
2546 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547     !!!cp (186);
2548     $self->{ct}->{quirks} = 1;
2549     $self->{state} = BOGUS_DOCTYPE_STATE;
2550     } else {
2551     !!!cp (186.2);
2552     $self->{state} = BOGUS_MD_STATE;
2553     }
2554    
2555 wakaba 1.1 !!!next-input-character;
2556     redo A;
2557     }
2558     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2559     if ($self->{nc} == 0x0022) { # "
2560     !!!cp (187);
2561     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2562     !!!next-input-character;
2563     redo A;
2564     } elsif ($self->{nc} == 0x003E) { # >
2565     !!!parse-error (type => 'unclosed PUBLIC literal');
2566    
2567 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2568     !!!cp (188);
2569     $self->{state} = DATA_STATE;
2570     $self->{s_kwd} = '';
2571     $self->{ct}->{quirks} = 1;
2572     } else {
2573     !!!cp (188.1);
2574     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2575     }
2576    
2577 wakaba 1.1 !!!next-input-character;
2578 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2579 wakaba 1.1 redo A;
2580     } elsif ($self->{nc} == -1) {
2581     !!!parse-error (type => 'unclosed PUBLIC literal');
2582    
2583 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2584     !!!cp (189);
2585     $self->{state} = DATA_STATE;
2586     $self->{s_kwd} = '';
2587     $self->{ct}->{quirks} = 1;
2588     } else {
2589     !!!cp (189.1);
2590     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2591     }
2592    
2593     ## Reconsume.
2594 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2595     redo A;
2596     } else {
2597     !!!cp (190);
2598 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2599 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2600     length $self->{ct}->{pubid});
2601    
2602     ## Stay in the state
2603     !!!next-input-character;
2604     redo A;
2605     }
2606     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2607     if ($self->{nc} == 0x0027) { # '
2608     !!!cp (191);
2609     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2610     !!!next-input-character;
2611     redo A;
2612     } elsif ($self->{nc} == 0x003E) { # >
2613     !!!parse-error (type => 'unclosed PUBLIC literal');
2614    
2615 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2616     !!!cp (192);
2617     $self->{state} = DATA_STATE;
2618     $self->{s_kwd} = '';
2619     $self->{ct}->{quirks} = 1;
2620     } else {
2621     !!!cp (192.1);
2622     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2623     }
2624    
2625 wakaba 1.1 !!!next-input-character;
2626 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2627 wakaba 1.1 redo A;
2628     } elsif ($self->{nc} == -1) {
2629     !!!parse-error (type => 'unclosed PUBLIC literal');
2630    
2631 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2632     !!!cp (193);
2633     $self->{state} = DATA_STATE;
2634     $self->{s_kwd} = '';
2635     $self->{ct}->{quirks} = 1;
2636     } else {
2637     !!!cp (193.1);
2638     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2639     }
2640    
2641 wakaba 1.1 ## reconsume
2642 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2643 wakaba 1.1 redo A;
2644     } else {
2645     !!!cp (194);
2646 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2647 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2648     length $self->{ct}->{pubid});
2649    
2650     ## Stay in the state
2651     !!!next-input-character;
2652     redo A;
2653     }
2654     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2655     if ($is_space->{$self->{nc}}) {
2656     !!!cp (195);
2657     ## Stay in the state
2658     !!!next-input-character;
2659     redo A;
2660     } elsif ($self->{nc} == 0x0022) { # "
2661     !!!cp (196);
2662 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2663 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2664     !!!next-input-character;
2665     redo A;
2666     } elsif ($self->{nc} == 0x0027) { # '
2667     !!!cp (197);
2668 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2669 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2670     !!!next-input-character;
2671     redo A;
2672     } elsif ($self->{nc} == 0x003E) { # >
2673 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2674     if ($self->{is_xml}) {
2675     !!!cp (198.1);
2676     !!!parse-error (type => 'no SYSTEM literal');
2677     } else {
2678     !!!cp (198);
2679     }
2680     $self->{state} = DATA_STATE;
2681     $self->{s_kwd} = '';
2682 wakaba 1.12 } else {
2683 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2684     !!!cp (198.2);
2685     } else {
2686     !!!cp (198.3);
2687     !!!parse-error (type => 'no SYSTEM literal');
2688     }
2689     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2690 wakaba 1.12 }
2691 wakaba 1.16
2692 wakaba 1.1 !!!next-input-character;
2693 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2694 wakaba 1.1 redo A;
2695     } elsif ($self->{nc} == -1) {
2696 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2697     !!!cp (199);
2698     !!!parse-error (type => 'unclosed DOCTYPE');
2699    
2700     $self->{state} = DATA_STATE;
2701     $self->{s_kwd} = '';
2702     $self->{ct}->{quirks} = 1;
2703     } else {
2704     !!!parse-error (type => 'unclosed md'); ## TODO: type
2705     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2706     }
2707    
2708 wakaba 1.1 ## reconsume
2709 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2710 wakaba 1.1 redo A;
2711 wakaba 1.16 } elsif ($self->{is_xml} and
2712     $self->{ct}->{type} == DOCTYPE_TOKEN and
2713     $self->{nc} == 0x005B) { # [
2714 wakaba 1.12 !!!cp (200.1);
2715     !!!parse-error (type => 'no SYSTEM literal');
2716     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2717     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2718 wakaba 1.13 $self->{in_subset} = 1;
2719 wakaba 1.12 !!!next-input-character;
2720 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2721 wakaba 1.12 redo A;
2722 wakaba 1.1 } else {
2723     !!!parse-error (type => 'string after PUBLIC literal');
2724    
2725 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2726     !!!cp (200);
2727     $self->{ct}->{quirks} = 1;
2728     $self->{state} = BOGUS_DOCTYPE_STATE;
2729     } else {
2730     !!!cp (200.2);
2731     $self->{state} = BOGUS_MD_STATE;
2732     }
2733    
2734 wakaba 1.1 !!!next-input-character;
2735     redo A;
2736     }
2737     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2738     if ($is_space->{$self->{nc}}) {
2739     !!!cp (201);
2740     ## Stay in the state
2741     !!!next-input-character;
2742     redo A;
2743     } elsif ($self->{nc} == 0x0022) { # "
2744     !!!cp (202);
2745     $self->{ct}->{sysid} = ''; # DOCTYPE
2746     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2747     !!!next-input-character;
2748     redo A;
2749     } elsif ($self->{nc} == 0x0027) { # '
2750     !!!cp (203);
2751     $self->{ct}->{sysid} = ''; # DOCTYPE
2752     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2753     !!!next-input-character;
2754     redo A;
2755     } elsif ($self->{nc} == 0x003E) { # >
2756     !!!parse-error (type => 'no SYSTEM literal');
2757     !!!next-input-character;
2758    
2759 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2760     !!!cp (204);
2761     $self->{state} = DATA_STATE;
2762     $self->{s_kwd} = '';
2763     $self->{ct}->{quirks} = 1;
2764     } else {
2765     !!!cp (204.1);
2766     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2767     }
2768 wakaba 1.1
2769 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2770 wakaba 1.1 redo A;
2771     } elsif ($self->{nc} == -1) {
2772 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2773     !!!cp (205);
2774     !!!parse-error (type => 'unclosed DOCTYPE');
2775     $self->{state} = DATA_STATE;
2776     $self->{s_kwd} = '';
2777     $self->{ct}->{quirks} = 1;
2778     } else {
2779     !!!cp (205.1);
2780     !!!parse-error (type => 'unclosed md'); ## TODO: type
2781     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2782     }
2783    
2784 wakaba 1.1 ## reconsume
2785 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2786 wakaba 1.1 redo A;
2787 wakaba 1.16 } elsif ($self->{is_xml} and
2788     $self->{ct}->{type} == DOCTYPE_TOKEN and
2789     $self->{nc} == 0x005B) { # [
2790 wakaba 1.12 !!!cp (206.1);
2791     !!!parse-error (type => 'no SYSTEM literal');
2792    
2793     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2794     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2795 wakaba 1.13 $self->{in_subset} = 1;
2796 wakaba 1.12 !!!next-input-character;
2797 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2798 wakaba 1.12 redo A;
2799 wakaba 1.1 } else {
2800     !!!parse-error (type => 'string after SYSTEM');
2801    
2802 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2803     !!!cp (206);
2804     $self->{ct}->{quirks} = 1;
2805     $self->{state} = BOGUS_DOCTYPE_STATE;
2806     } else {
2807     !!!cp (206.2);
2808     $self->{state} = BOGUS_MD_STATE;
2809     }
2810    
2811 wakaba 1.1 !!!next-input-character;
2812     redo A;
2813     }
2814     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2815     if ($self->{nc} == 0x0022) { # "
2816     !!!cp (207);
2817     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2818     !!!next-input-character;
2819     redo A;
2820 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2821 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2822    
2823 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2824     !!!cp (208);
2825     $self->{state} = DATA_STATE;
2826     $self->{s_kwd} = '';
2827     $self->{ct}->{quirks} = 1;
2828     } else {
2829     !!!cp (208.1);
2830     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2831     }
2832    
2833 wakaba 1.1 !!!next-input-character;
2834 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2835 wakaba 1.1 redo A;
2836     } elsif ($self->{nc} == -1) {
2837     !!!parse-error (type => 'unclosed SYSTEM literal');
2838    
2839 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2840     !!!cp (209);
2841     $self->{state} = DATA_STATE;
2842     $self->{s_kwd} = '';
2843     $self->{ct}->{quirks} = 1;
2844     } else {
2845     !!!cp (209.1);
2846     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2847     }
2848    
2849 wakaba 1.1 ## reconsume
2850 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2851 wakaba 1.1 redo A;
2852     } else {
2853     !!!cp (210);
2854 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2855 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2856     length $self->{ct}->{sysid});
2857    
2858     ## Stay in the state
2859     !!!next-input-character;
2860     redo A;
2861     }
2862     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2863     if ($self->{nc} == 0x0027) { # '
2864     !!!cp (211);
2865     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2866     !!!next-input-character;
2867     redo A;
2868 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2869 wakaba 1.1 !!!cp (212);
2870     !!!parse-error (type => 'unclosed SYSTEM literal');
2871    
2872     $self->{state} = DATA_STATE;
2873 wakaba 1.5 $self->{s_kwd} = '';
2874 wakaba 1.1 !!!next-input-character;
2875    
2876     $self->{ct}->{quirks} = 1;
2877     !!!emit ($self->{ct}); # DOCTYPE
2878    
2879     redo A;
2880     } elsif ($self->{nc} == -1) {
2881     !!!parse-error (type => 'unclosed SYSTEM literal');
2882    
2883 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2884     !!!cp (213);
2885     $self->{state} = DATA_STATE;
2886     $self->{s_kwd} = '';
2887     $self->{ct}->{quirks} = 1;
2888     } else {
2889     !!!cp (213.1);
2890     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891     }
2892    
2893 wakaba 1.1 ## reconsume
2894 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2895 wakaba 1.1 redo A;
2896     } else {
2897     !!!cp (214);
2898 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2899 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2900     length $self->{ct}->{sysid});
2901    
2902     ## Stay in the state
2903     !!!next-input-character;
2904     redo A;
2905     }
2906     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2907     if ($is_space->{$self->{nc}}) {
2908 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2909     !!!cp (215.1);
2910     $self->{state} = BEFORE_NDATA_STATE;
2911     } else {
2912     !!!cp (215);
2913     ## Stay in the state
2914     }
2915 wakaba 1.1 !!!next-input-character;
2916     redo A;
2917     } elsif ($self->{nc} == 0x003E) { # >
2918 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2919     !!!cp (216);
2920     $self->{state} = DATA_STATE;
2921     $self->{s_kwd} = '';
2922     } else {
2923     !!!cp (216.1);
2924     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2925     }
2926    
2927 wakaba 1.1 !!!next-input-character;
2928 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2929 wakaba 1.1 redo A;
2930 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2931     ($self->{nc} == 0x004E or # N
2932     $self->{nc} == 0x006E)) { # n
2933     !!!cp (216.2);
2934     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2935     $self->{state} = NDATA_STATE;
2936     $self->{kwd} = chr $self->{nc};
2937     !!!next-input-character;
2938     redo A;
2939 wakaba 1.1 } elsif ($self->{nc} == -1) {
2940 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2941     !!!cp (217);
2942     !!!parse-error (type => 'unclosed DOCTYPE');
2943     $self->{state} = DATA_STATE;
2944     $self->{s_kwd} = '';
2945     $self->{ct}->{quirks} = 1;
2946     } else {
2947     !!!cp (217.1);
2948     !!!parse-error (type => 'unclosed md'); ## TODO: type
2949     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2950     }
2951    
2952 wakaba 1.1 ## reconsume
2953 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2954 wakaba 1.1 redo A;
2955 wakaba 1.16 } elsif ($self->{is_xml} and
2956     $self->{ct}->{type} == DOCTYPE_TOKEN and
2957     $self->{nc} == 0x005B) { # [
2958 wakaba 1.12 !!!cp (218.1);
2959     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2960     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2961 wakaba 1.13 $self->{in_subset} = 1;
2962 wakaba 1.12 !!!next-input-character;
2963 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2964 wakaba 1.12 redo A;
2965 wakaba 1.1 } else {
2966     !!!parse-error (type => 'string after SYSTEM literal');
2967    
2968 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2969     !!!cp (218);
2970     #$self->{ct}->{quirks} = 1;
2971     $self->{state} = BOGUS_DOCTYPE_STATE;
2972     } else {
2973     !!!cp (218.2);
2974     $self->{state} = BOGUS_MD_STATE;
2975     }
2976    
2977 wakaba 1.1 !!!next-input-character;
2978     redo A;
2979     }
2980 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2981     if ($is_space->{$self->{nc}}) {
2982     !!!cp (218.3);
2983     ## Stay in the state.
2984     !!!next-input-character;
2985     redo A;
2986     } elsif ($self->{nc} == 0x003E) { # >
2987     !!!cp (218.4);
2988     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2989     !!!next-input-character;
2990     !!!emit ($self->{ct}); # ENTITY
2991     redo A;
2992     } elsif ($self->{nc} == 0x004E or # N
2993     $self->{nc} == 0x006E) { # n
2994     !!!cp (218.5);
2995     $self->{state} = NDATA_STATE;
2996     $self->{kwd} = chr $self->{nc};
2997     !!!next-input-character;
2998     redo A;
2999     } elsif ($self->{nc} == -1) {
3000     !!!cp (218.6);
3001     !!!parse-error (type => 'unclosed md'); ## TODO: type
3002     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3003     ## reconsume
3004     !!!emit ($self->{ct}); # ENTITY
3005     redo A;
3006     } else {
3007     !!!cp (218.7);
3008     !!!parse-error (type => 'string after SYSTEM literal');
3009     $self->{state} = BOGUS_MD_STATE;
3010     !!!next-input-character;
3011     redo A;
3012     }
3013 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3014     if ($self->{nc} == 0x003E) { # >
3015     !!!cp (219);
3016     $self->{state} = DATA_STATE;
3017 wakaba 1.5 $self->{s_kwd} = '';
3018 wakaba 1.1 !!!next-input-character;
3019    
3020     !!!emit ($self->{ct}); # DOCTYPE
3021    
3022     redo A;
3023 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3024 wakaba 1.13 !!!cp (220.1);
3025     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3026     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3027     $self->{in_subset} = 1;
3028     !!!next-input-character;
3029     !!!emit ($self->{ct}); # DOCTYPE
3030     redo A;
3031 wakaba 1.1 } elsif ($self->{nc} == -1) {
3032     !!!cp (220);
3033     $self->{state} = DATA_STATE;
3034 wakaba 1.5 $self->{s_kwd} = '';
3035 wakaba 1.1 ## reconsume
3036    
3037     !!!emit ($self->{ct}); # DOCTYPE
3038    
3039     redo A;
3040     } else {
3041     !!!cp (221);
3042     my $s = '';
3043 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3044 wakaba 1.1
3045     ## Stay in the state
3046     !!!next-input-character;
3047     redo A;
3048     }
3049     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3050     ## NOTE: "CDATA section state" in the spec is jointly implemented
3051     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3052     ## and |CDATA_SECTION_MSE2_STATE|.
3053 wakaba 1.10
3054     ## XML5: "CDATA state".
3055 wakaba 1.1
3056     if ($self->{nc} == 0x005D) { # ]
3057     !!!cp (221.1);
3058     $self->{state} = CDATA_SECTION_MSE1_STATE;
3059     !!!next-input-character;
3060     redo A;
3061     } elsif ($self->{nc} == -1) {
3062 wakaba 1.6 if ($self->{is_xml}) {
3063 wakaba 1.8 !!!cp (221.11);
3064 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3065 wakaba 1.8 } else {
3066     !!!cp (221.12);
3067 wakaba 1.6 }
3068    
3069 wakaba 1.1 $self->{state} = DATA_STATE;
3070 wakaba 1.5 $self->{s_kwd} = '';
3071 wakaba 1.10 ## Reconsume.
3072 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3073     !!!cp (221.2);
3074     !!!emit ($self->{ct}); # character
3075     } else {
3076     !!!cp (221.3);
3077     ## No token to emit. $self->{ct} is discarded.
3078     }
3079     redo A;
3080     } else {
3081     !!!cp (221.4);
3082     $self->{ct}->{data} .= chr $self->{nc};
3083     $self->{read_until}->($self->{ct}->{data},
3084     q<]>,
3085     length $self->{ct}->{data});
3086    
3087     ## Stay in the state.
3088     !!!next-input-character;
3089     redo A;
3090     }
3091    
3092     ## ISSUE: "text tokens" in spec.
3093     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3094 wakaba 1.10 ## XML5: "CDATA bracket state".
3095    
3096 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3097     !!!cp (221.5);
3098     $self->{state} = CDATA_SECTION_MSE2_STATE;
3099     !!!next-input-character;
3100     redo A;
3101     } else {
3102     !!!cp (221.6);
3103 wakaba 1.10 ## XML5: At EOF, "]" is not appended and the state is changed to the data state.
3104 wakaba 1.1 $self->{ct}->{data} .= ']';
3105 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3106 wakaba 1.1 ## Reconsume.
3107     redo A;
3108     }
3109     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3110 wakaba 1.10 ## XML5: "CDATA end state".
3111    
3112 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3113     $self->{state} = DATA_STATE;
3114 wakaba 1.5 $self->{s_kwd} = '';
3115 wakaba 1.1 !!!next-input-character;
3116     if (length $self->{ct}->{data}) { # character
3117     !!!cp (221.7);
3118     !!!emit ($self->{ct}); # character
3119     } else {
3120     !!!cp (221.8);
3121     ## No token to emit. $self->{ct} is discarded.
3122     }
3123     redo A;
3124     } elsif ($self->{nc} == 0x005D) { # ]
3125     !!!cp (221.9); # character
3126     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3127     ## Stay in the state.
3128     !!!next-input-character;
3129     redo A;
3130     } else {
3131     !!!cp (221.11);
3132     $self->{ct}->{data} .= ']]'; # character
3133     $self->{state} = CDATA_SECTION_STATE;
3134 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3135 wakaba 1.1 redo A;
3136     }
3137     } elsif ($self->{state} == ENTITY_STATE) {
3138     if ($is_space->{$self->{nc}} or
3139     {
3140     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3141     $self->{entity_add} => 1,
3142     }->{$self->{nc}}) {
3143 wakaba 1.22 if ($self->{is_xml}) {
3144     !!!cp (1001.1);
3145     !!!parse-error (type => 'bare ero',
3146     line => $self->{line_prev},
3147     column => $self->{column_prev}
3148     + ($self->{nc} == -1 ? 1 : 0));
3149     } else {
3150     !!!cp (1001);
3151     ## No error
3152     }
3153 wakaba 1.1 ## Don't consume
3154     ## Return nothing.
3155     #
3156     } elsif ($self->{nc} == 0x0023) { # #
3157     !!!cp (999);
3158     $self->{state} = ENTITY_HASH_STATE;
3159 wakaba 1.12 $self->{kwd} = '#';
3160 wakaba 1.1 !!!next-input-character;
3161     redo A;
3162 wakaba 1.22 } elsif ($self->{is_xml} or
3163     (0x0041 <= $self->{nc} and
3164 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3165     (0x0061 <= $self->{nc} and
3166     $self->{nc} <= 0x007A)) { # a..z
3167     !!!cp (998);
3168     require Whatpm::_NamedEntityList;
3169     $self->{state} = ENTITY_NAME_STATE;
3170 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3171     $self->{entity__value} = $self->{kwd};
3172 wakaba 1.1 $self->{entity__match} = 0;
3173     !!!next-input-character;
3174     redo A;
3175     } else {
3176     !!!cp (1027);
3177     !!!parse-error (type => 'bare ero');
3178     ## Return nothing.
3179     #
3180     }
3181    
3182     ## NOTE: No character is consumed by the "consume a character
3183     ## reference" algorithm. In other words, there is an "&" character
3184     ## that does not introduce a character reference, which would be
3185     ## appended to the parent element or the attribute value in the
3186     ## later processing of the tokenizer.
3187    
3188     if ($self->{prev_state} == DATA_STATE) {
3189     !!!cp (997);
3190     $self->{state} = $self->{prev_state};
3191 wakaba 1.5 $self->{s_kwd} = '';
3192 wakaba 1.1 ## Reconsume.
3193     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3194     line => $self->{line_prev},
3195     column => $self->{column_prev},
3196     });
3197     redo A;
3198     } else {
3199     !!!cp (996);
3200     $self->{ca}->{value} .= '&';
3201     $self->{state} = $self->{prev_state};
3202 wakaba 1.5 $self->{s_kwd} = '';
3203 wakaba 1.1 ## Reconsume.
3204     redo A;
3205     }
3206     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3207 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3208 wakaba 1.1 !!!cp (995);
3209     $self->{state} = HEXREF_X_STATE;
3210 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3211 wakaba 1.1 !!!next-input-character;
3212     redo A;
3213 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3214     !!!cp (995.1);
3215     if ($self->{is_xml}) {
3216     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3217     }
3218     $self->{state} = HEXREF_X_STATE;
3219     $self->{kwd} .= chr $self->{nc};
3220     !!!next-input-character;
3221     redo A;
3222 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3223     $self->{nc} <= 0x0039) { # 0..9
3224     !!!cp (994);
3225     $self->{state} = NCR_NUM_STATE;
3226 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3227 wakaba 1.1 !!!next-input-character;
3228     redo A;
3229     } else {
3230     !!!parse-error (type => 'bare nero',
3231     line => $self->{line_prev},
3232     column => $self->{column_prev} - 1);
3233    
3234     ## NOTE: According to the spec algorithm, nothing is returned,
3235     ## and then "&#" is appended to the parent element or the attribute
3236     ## value in the later processing.
3237    
3238     if ($self->{prev_state} == DATA_STATE) {
3239     !!!cp (1019);
3240     $self->{state} = $self->{prev_state};
3241 wakaba 1.5 $self->{s_kwd} = '';
3242 wakaba 1.1 ## Reconsume.
3243     !!!emit ({type => CHARACTER_TOKEN,
3244     data => '&#',
3245     line => $self->{line_prev},
3246     column => $self->{column_prev} - 1,
3247     });
3248     redo A;
3249     } else {
3250     !!!cp (993);
3251     $self->{ca}->{value} .= '&#';
3252     $self->{state} = $self->{prev_state};
3253 wakaba 1.5 $self->{s_kwd} = '';
3254 wakaba 1.1 ## Reconsume.
3255     redo A;
3256     }
3257     }
3258     } elsif ($self->{state} == NCR_NUM_STATE) {
3259     if (0x0030 <= $self->{nc} and
3260     $self->{nc} <= 0x0039) { # 0..9
3261     !!!cp (1012);
3262 wakaba 1.12 $self->{kwd} *= 10;
3263     $self->{kwd} += $self->{nc} - 0x0030;
3264 wakaba 1.1
3265     ## Stay in the state.
3266     !!!next-input-character;
3267     redo A;
3268     } elsif ($self->{nc} == 0x003B) { # ;
3269     !!!cp (1013);
3270     !!!next-input-character;
3271     #
3272     } else {
3273     !!!cp (1014);
3274     !!!parse-error (type => 'no refc');
3275     ## Reconsume.
3276     #
3277     }
3278    
3279 wakaba 1.12 my $code = $self->{kwd};
3280 wakaba 1.1 my $l = $self->{line_prev};
3281     my $c = $self->{column_prev};
3282 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3283     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3284     ($self->{is_xml} and $code == 0x0000)) {
3285 wakaba 1.1 !!!cp (1015);
3286     !!!parse-error (type => 'invalid character reference',
3287     text => (sprintf 'U+%04X', $code),
3288     line => $l, column => $c);
3289     $code = $charref_map->{$code};
3290     } elsif ($code > 0x10FFFF) {
3291     !!!cp (1016);
3292     !!!parse-error (type => 'invalid character reference',
3293     text => (sprintf 'U-%08X', $code),
3294     line => $l, column => $c);
3295     $code = 0xFFFD;
3296     }
3297    
3298     if ($self->{prev_state} == DATA_STATE) {
3299     !!!cp (992);
3300     $self->{state} = $self->{prev_state};
3301 wakaba 1.5 $self->{s_kwd} = '';
3302 wakaba 1.1 ## Reconsume.
3303     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3304 wakaba 1.7 has_reference => 1,
3305 wakaba 1.1 line => $l, column => $c,
3306     });
3307     redo A;
3308     } else {
3309     !!!cp (991);
3310     $self->{ca}->{value} .= chr $code;
3311     $self->{ca}->{has_reference} = 1;
3312     $self->{state} = $self->{prev_state};
3313 wakaba 1.5 $self->{s_kwd} = '';
3314 wakaba 1.1 ## Reconsume.
3315     redo A;
3316     }
3317     } elsif ($self->{state} == HEXREF_X_STATE) {
3318     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3319     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3320     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3321     # 0..9, A..F, a..f
3322     !!!cp (990);
3323     $self->{state} = HEXREF_HEX_STATE;
3324 wakaba 1.12 $self->{kwd} = 0;
3325 wakaba 1.1 ## Reconsume.
3326     redo A;
3327     } else {
3328     !!!parse-error (type => 'bare hcro',
3329     line => $self->{line_prev},
3330     column => $self->{column_prev} - 2);
3331    
3332     ## NOTE: According to the spec algorithm, nothing is returned,
3333     ## and then "&#" followed by "X" or "x" is appended to the parent
3334     ## element or the attribute value in the later processing.
3335    
3336     if ($self->{prev_state} == DATA_STATE) {
3337     !!!cp (1005);
3338     $self->{state} = $self->{prev_state};
3339 wakaba 1.5 $self->{s_kwd} = '';
3340 wakaba 1.1 ## Reconsume.
3341     !!!emit ({type => CHARACTER_TOKEN,
3342 wakaba 1.12 data => '&' . $self->{kwd},
3343 wakaba 1.1 line => $self->{line_prev},
3344 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3345 wakaba 1.1 });
3346     redo A;
3347     } else {
3348     !!!cp (989);
3349 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3350 wakaba 1.1 $self->{state} = $self->{prev_state};
3351 wakaba 1.5 $self->{s_kwd} = '';
3352 wakaba 1.1 ## Reconsume.
3353     redo A;
3354     }
3355     }
3356     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3357     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3358     # 0..9
3359     !!!cp (1002);
3360 wakaba 1.12 $self->{kwd} *= 0x10;
3361     $self->{kwd} += $self->{nc} - 0x0030;
3362 wakaba 1.1 ## Stay in the state.
3363     !!!next-input-character;
3364     redo A;
3365     } elsif (0x0061 <= $self->{nc} and
3366     $self->{nc} <= 0x0066) { # a..f
3367     !!!cp (1003);
3368 wakaba 1.12 $self->{kwd} *= 0x10;
3369     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3370 wakaba 1.1 ## Stay in the state.
3371     !!!next-input-character;
3372     redo A;
3373     } elsif (0x0041 <= $self->{nc} and
3374     $self->{nc} <= 0x0046) { # A..F
3375     !!!cp (1004);
3376 wakaba 1.12 $self->{kwd} *= 0x10;
3377     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3378 wakaba 1.1 ## Stay in the state.
3379     !!!next-input-character;
3380     redo A;
3381     } elsif ($self->{nc} == 0x003B) { # ;
3382     !!!cp (1006);
3383     !!!next-input-character;
3384     #
3385     } else {
3386     !!!cp (1007);
3387     !!!parse-error (type => 'no refc',
3388     line => $self->{line},
3389     column => $self->{column});
3390     ## Reconsume.
3391     #
3392     }
3393    
3394 wakaba 1.12 my $code = $self->{kwd};
3395 wakaba 1.1 my $l = $self->{line_prev};
3396     my $c = $self->{column_prev};
3397 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3398     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3399     ($self->{is_xml} and $code == 0x0000)) {
3400 wakaba 1.1 !!!cp (1008);
3401     !!!parse-error (type => 'invalid character reference',
3402     text => (sprintf 'U+%04X', $code),
3403     line => $l, column => $c);
3404     $code = $charref_map->{$code};
3405     } elsif ($code > 0x10FFFF) {
3406     !!!cp (1009);
3407     !!!parse-error (type => 'invalid character reference',
3408     text => (sprintf 'U-%08X', $code),
3409     line => $l, column => $c);
3410     $code = 0xFFFD;
3411     }
3412    
3413     if ($self->{prev_state} == DATA_STATE) {
3414     !!!cp (988);
3415     $self->{state} = $self->{prev_state};
3416 wakaba 1.5 $self->{s_kwd} = '';
3417 wakaba 1.1 ## Reconsume.
3418     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3419 wakaba 1.7 has_reference => 1,
3420 wakaba 1.1 line => $l, column => $c,
3421     });
3422     redo A;
3423     } else {
3424     !!!cp (987);
3425     $self->{ca}->{value} .= chr $code;
3426     $self->{ca}->{has_reference} = 1;
3427     $self->{state} = $self->{prev_state};
3428 wakaba 1.5 $self->{s_kwd} = '';
3429 wakaba 1.1 ## Reconsume.
3430     redo A;
3431     }
3432     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3433 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
3434     $self->{nc} <= 0x005A) or # x
3435     (0x0061 <= $self->{nc} and # a
3436     $self->{nc} <= 0x007A) or # z
3437     (0x0030 <= $self->{nc} and # 0
3438     $self->{nc} <= 0x0039) or # 9
3439 wakaba 1.22 $self->{nc} == 0x003B or # ;
3440     ($self->{is_xml} and
3441     not ($is_space->{$self->{nc}} or
3442     {
3443     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3444     $self->{entity_add} => 1,
3445     }->{$self->{nc}}))) {
3446 wakaba 1.1 our $EntityChar;
3447 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3448 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3449     $self->{ge}->{$self->{kwd}}) {
3450 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3451 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3452     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3453     !!!cp (1020.1);
3454     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3455     } else {
3456     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3457     !!!cp (1020.2);
3458     !!!parse-error (type => 'unparsed entity', ## TODO: type
3459     value => $self->{kwd});
3460     } else {
3461     !!!cp (1020.3);
3462     }
3463     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3464     }
3465     } else {
3466     if ($self->{is_xml}) {
3467     !!!cp (1020.4);
3468     !!!parse-error (type => 'entity not declared', ## TODO: type
3469     value => $self->{kwd},
3470     level => {
3471     'amp;' => $self->{level}->{warn},
3472     'quot;' => $self->{level}->{warn},
3473     'lt;' => $self->{level}->{warn},
3474     'gt;' => $self->{level}->{warn},
3475     'apos;' => $self->{level}->{warn},
3476     }->{$self->{kwd}} ||
3477     $self->{level}->{must});
3478     } else {
3479     !!!cp (1020);
3480     }
3481     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3482     }
3483 wakaba 1.1 $self->{entity__match} = 1;
3484     !!!next-input-character;
3485     #
3486     } else {
3487     !!!cp (1021);
3488 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3489 wakaba 1.1 $self->{entity__match} = -1;
3490     ## Stay in the state.
3491     !!!next-input-character;
3492     redo A;
3493     }
3494     } else {
3495     !!!cp (1022);
3496     $self->{entity__value} .= chr $self->{nc};
3497     $self->{entity__match} *= 2;
3498     ## Stay in the state.
3499     !!!next-input-character;
3500     redo A;
3501     }
3502     }
3503    
3504     my $data;
3505     my $has_ref;
3506     if ($self->{entity__match} > 0) {
3507     !!!cp (1023);
3508     $data = $self->{entity__value};
3509     $has_ref = 1;
3510     #
3511     } elsif ($self->{entity__match} < 0) {
3512     !!!parse-error (type => 'no refc');
3513     if ($self->{prev_state} != DATA_STATE and # in attribute
3514     $self->{entity__match} < -1) {
3515     !!!cp (1024);
3516 wakaba 1.12 $data = '&' . $self->{kwd};
3517 wakaba 1.1 #
3518     } else {
3519     !!!cp (1025);
3520     $data = $self->{entity__value};
3521     $has_ref = 1;
3522     #
3523     }
3524     } else {
3525     !!!cp (1026);
3526     !!!parse-error (type => 'bare ero',
3527     line => $self->{line_prev},
3528 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3529     $data = '&' . $self->{kwd};
3530 wakaba 1.1 #
3531     }
3532    
3533     ## NOTE: In these cases, when a character reference is found,
3534     ## it is consumed and a character token is returned, or, otherwise,
3535     ## nothing is consumed and returned, according to the spec algorithm.
3536     ## In this implementation, anything that has been examined by the
3537     ## tokenizer is appended to the parent element or the attribute value
3538     ## as string, either literal string when no character reference or
3539     ## entity-replaced string otherwise, in this stage, since any characters
3540     ## that would not be consumed are appended in the data state or in an
3541     ## appropriate attribute value state anyway.
3542    
3543     if ($self->{prev_state} == DATA_STATE) {
3544     !!!cp (986);
3545     $self->{state} = $self->{prev_state};
3546 wakaba 1.5 $self->{s_kwd} = '';
3547 wakaba 1.1 ## Reconsume.
3548     !!!emit ({type => CHARACTER_TOKEN,
3549     data => $data,
3550 wakaba 1.7 has_reference => $has_ref,
3551 wakaba 1.1 line => $self->{line_prev},
3552 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3553 wakaba 1.1 });
3554     redo A;
3555     } else {
3556     !!!cp (985);
3557     $self->{ca}->{value} .= $data;
3558     $self->{ca}->{has_reference} = 1 if $has_ref;
3559     $self->{state} = $self->{prev_state};
3560 wakaba 1.5 $self->{s_kwd} = '';
3561 wakaba 1.1 ## Reconsume.
3562     redo A;
3563     }
3564 wakaba 1.8
3565     ## XML-only states
3566    
3567     } elsif ($self->{state} == PI_STATE) {
3568 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3569    
3570 wakaba 1.8 if ($is_space->{$self->{nc}} or
3571 wakaba 1.14 $self->{nc} == 0x003F or # ?
3572 wakaba 1.8 $self->{nc} == -1) {
3573 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3574     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3575     ## "DOCTYPE pi state": Parse error, switch to the "data
3576     ## state".
3577 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3578     line => $self->{line_prev},
3579     column => $self->{column_prev}
3580     - 1 * ($self->{nc} != -1));
3581     $self->{state} = BOGUS_COMMENT_STATE;
3582     ## Reconsume.
3583     $self->{ct} = {type => COMMENT_TOKEN,
3584     data => '?',
3585     line => $self->{line_prev},
3586     column => $self->{column_prev}
3587     - 1 * ($self->{nc} != -1),
3588     };
3589     redo A;
3590     } else {
3591 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3592 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3593     target => chr $self->{nc},
3594     data => '',
3595     line => $self->{line_prev},
3596     column => $self->{column_prev} - 1,
3597     };
3598     $self->{state} = PI_TARGET_STATE;
3599     !!!next-input-character;
3600     redo A;
3601     }
3602     } elsif ($self->{state} == PI_TARGET_STATE) {
3603     if ($is_space->{$self->{nc}}) {
3604     $self->{state} = PI_TARGET_AFTER_STATE;
3605     !!!next-input-character;
3606     redo A;
3607     } elsif ($self->{nc} == -1) {
3608     !!!parse-error (type => 'no pic'); ## TODO: type
3609 wakaba 1.13 if ($self->{in_subset}) {
3610     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3611     } else {
3612     $self->{state} = DATA_STATE;
3613     $self->{s_kwd} = '';
3614     }
3615 wakaba 1.8 ## Reconsume.
3616     !!!emit ($self->{ct}); # pi
3617     redo A;
3618     } elsif ($self->{nc} == 0x003F) { # ?
3619     $self->{state} = PI_AFTER_STATE;
3620     !!!next-input-character;
3621     redo A;
3622     } else {
3623     ## XML5: typo ("tag name" -> "target")
3624     $self->{ct}->{target} .= chr $self->{nc}; # pi
3625     !!!next-input-character;
3626     redo A;
3627     }
3628     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3629     if ($is_space->{$self->{nc}}) {
3630     ## Stay in the state.
3631     !!!next-input-character;
3632     redo A;
3633     } else {
3634     $self->{state} = PI_DATA_STATE;
3635     ## Reprocess.
3636     redo A;
3637     }
3638     } elsif ($self->{state} == PI_DATA_STATE) {
3639     if ($self->{nc} == 0x003F) { # ?
3640     $self->{state} = PI_DATA_AFTER_STATE;
3641     !!!next-input-character;
3642     redo A;
3643     } elsif ($self->{nc} == -1) {
3644     !!!parse-error (type => 'no pic'); ## TODO: type
3645 wakaba 1.13 if ($self->{in_subset}) {
3646 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3647 wakaba 1.13 } else {
3648     $self->{state} = DATA_STATE;
3649     $self->{s_kwd} = '';
3650     }
3651 wakaba 1.8 ## Reprocess.
3652     !!!emit ($self->{ct}); # pi
3653     redo A;
3654     } else {
3655     $self->{ct}->{data} .= chr $self->{nc}; # pi
3656     $self->{read_until}->($self->{ct}->{data}, q[?],
3657     length $self->{ct}->{data});
3658     ## Stay in the state.
3659     !!!next-input-character;
3660     ## Reprocess.
3661     redo A;
3662     }
3663     } elsif ($self->{state} == PI_AFTER_STATE) {
3664 wakaba 1.14 ## XML5: Part of "Pi after state".
3665    
3666 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3667 wakaba 1.13 if ($self->{in_subset}) {
3668     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3669     } else {
3670     $self->{state} = DATA_STATE;
3671     $self->{s_kwd} = '';
3672     }
3673 wakaba 1.8 !!!next-input-character;
3674     !!!emit ($self->{ct}); # pi
3675     redo A;
3676     } elsif ($self->{nc} == 0x003F) { # ?
3677     !!!parse-error (type => 'no s after target', ## TODO: type
3678     line => $self->{line_prev},
3679     column => $self->{column_prev}); ## XML5: no error
3680     $self->{ct}->{data} .= '?';
3681     $self->{state} = PI_DATA_AFTER_STATE;
3682     !!!next-input-character;
3683     redo A;
3684     } else {
3685     !!!parse-error (type => 'no s after target', ## TODO: type
3686     line => $self->{line_prev},
3687     column => $self->{column_prev}
3688     + 1 * ($self->{nc} == -1)); ## XML5: no error
3689     $self->{ct}->{data} .= '?'; ## XML5: not appended
3690     $self->{state} = PI_DATA_STATE;
3691     ## Reprocess.
3692     redo A;
3693     }
3694     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3695 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3696    
3697 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3698 wakaba 1.13 if ($self->{in_subset}) {
3699     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3700     } else {
3701     $self->{state} = DATA_STATE;
3702     $self->{s_kwd} = '';
3703     }
3704 wakaba 1.8 !!!next-input-character;
3705     !!!emit ($self->{ct}); # pi
3706     redo A;
3707     } elsif ($self->{nc} == 0x003F) { # ?
3708     $self->{ct}->{data} .= '?';
3709     ## Stay in the state.
3710     !!!next-input-character;
3711     redo A;
3712     } else {
3713     $self->{ct}->{data} .= '?'; ## XML5: not appended
3714     $self->{state} = PI_DATA_STATE;
3715     ## Reprocess.
3716     redo A;
3717     }
3718 wakaba 1.12
3719     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3720     if ($self->{nc} == 0x003C) { # <
3721 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3722 wakaba 1.12 !!!next-input-character;
3723     redo A;
3724     } elsif ($self->{nc} == 0x0025) { # %
3725     ## XML5: Not defined yet.
3726    
3727     ## TODO:
3728 wakaba 1.24
3729     if (not $self->{stop_processing} and
3730     not $self->{document}->xml_standalone) {
3731     !!!parse-error (type => 'stop processing', ## TODO: type
3732     level => $self->{level}->{info});
3733     $self->{stop_processing} = 1;
3734     }
3735    
3736 wakaba 1.12 !!!next-input-character;
3737     redo A;
3738     } elsif ($self->{nc} == 0x005D) { # ]
3739 wakaba 1.13 delete $self->{in_subset};
3740 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3741     !!!next-input-character;
3742     redo A;
3743     } elsif ($is_space->{$self->{nc}}) {
3744     ## Stay in the state.
3745     !!!next-input-character;
3746     redo A;
3747     } elsif ($self->{nc} == -1) {
3748     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3749 wakaba 1.13 delete $self->{in_subset};
3750 wakaba 1.12 $self->{state} = DATA_STATE;
3751     $self->{s_kwd} = '';
3752     ## Reconsume.
3753 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3754 wakaba 1.12 redo A;
3755     } else {
3756     unless ($self->{internal_subset_tainted}) {
3757     ## XML5: No parse error.
3758     !!!parse-error (type => 'string in internal subset');
3759     $self->{internal_subset_tainted} = 1;
3760     }
3761     ## Stay in the state.
3762     !!!next-input-character;
3763     redo A;
3764     }
3765     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3766     if ($self->{nc} == 0x003E) { # >
3767     $self->{state} = DATA_STATE;
3768     $self->{s_kwd} = '';
3769     !!!next-input-character;
3770 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3771 wakaba 1.12 redo A;
3772     } elsif ($self->{nc} == -1) {
3773     !!!parse-error (type => 'unclosed DOCTYPE');
3774     $self->{state} = DATA_STATE;
3775     $self->{s_kwd} = '';
3776     ## Reconsume.
3777 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3778 wakaba 1.12 redo A;
3779     } else {
3780     ## XML5: No parse error and stay in the state.
3781     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3782    
3783 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3784     !!!next-input-character;
3785     redo A;
3786     }
3787     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3788     if ($self->{nc} == 0x003E) { # >
3789     $self->{state} = DATA_STATE;
3790     $self->{s_kwd} = '';
3791     !!!next-input-character;
3792     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3793     redo A;
3794     } elsif ($self->{nc} == -1) {
3795     $self->{state} = DATA_STATE;
3796     $self->{s_kwd} = '';
3797     ## Reconsume.
3798     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3799     redo A;
3800     } else {
3801     ## Stay in the state.
3802     !!!next-input-character;
3803     redo A;
3804     }
3805     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3806     if ($self->{nc} == 0x0021) { # !
3807 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3808 wakaba 1.13 !!!next-input-character;
3809     redo A;
3810     } elsif ($self->{nc} == 0x003F) { # ?
3811     $self->{state} = PI_STATE;
3812     !!!next-input-character;
3813     redo A;
3814     } elsif ($self->{nc} == -1) {
3815     !!!parse-error (type => 'bare stago');
3816     $self->{state} = DATA_STATE;
3817     $self->{s_kwd} = '';
3818     ## Reconsume.
3819     redo A;
3820     } else {
3821     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3822     line => $self->{line_prev},
3823     column => $self->{column_prev});
3824     $self->{state} = BOGUS_COMMENT_STATE;
3825     $self->{ct} = {type => COMMENT_TOKEN,
3826     data => '',
3827     }; ## NOTE: Will be discarded.
3828 wakaba 1.12 !!!next-input-character;
3829     redo A;
3830     }
3831 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3832     ## XML5: "DOCTYPE markup declaration state".
3833    
3834     if ($self->{nc} == 0x002D) { # -
3835     $self->{state} = MD_HYPHEN_STATE;
3836     !!!next-input-character;
3837     redo A;
3838 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3839     $self->{nc} == 0x0065) { # e
3840 wakaba 1.14 $self->{state} = MD_E_STATE;
3841     $self->{kwd} = chr $self->{nc};
3842     !!!next-input-character;
3843     redo A;
3844 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3845     $self->{nc} == 0x0061) { # a
3846 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3847     $self->{kwd} = chr $self->{nc};
3848     !!!next-input-character;
3849     redo A;
3850 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3851     $self->{nc} == 0x006E) { # n
3852 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3853     $self->{kwd} = chr $self->{nc};
3854     !!!next-input-character;
3855     redo A;
3856     } else {
3857     #
3858     }
3859    
3860     ## XML5: No parse error.
3861     !!!parse-error (type => 'bogus comment',
3862     line => $self->{line_prev},
3863     column => $self->{column_prev} - 1);
3864     ## Reconsume.
3865     $self->{state} = BOGUS_COMMENT_STATE;
3866     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3867     redo A;
3868     } elsif ($self->{state} == MD_E_STATE) {
3869 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3870     $self->{nc} == 0x006E) { # n
3871 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3872     $self->{kwd} .= chr $self->{nc};
3873     !!!next-input-character;
3874     redo A;
3875 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3876     $self->{nc} == 0x006C) { # l
3877 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3878     $self->{state} = MD_ELEMENT_STATE;
3879     $self->{kwd} .= chr $self->{nc};
3880     !!!next-input-character;
3881     redo A;
3882     } else {
3883     ## XML5: No parse error.
3884     !!!parse-error (type => 'bogus comment',
3885     line => $self->{line_prev},
3886     column => $self->{column_prev} - 2
3887     + 1 * ($self->{nc} == -1));
3888     ## Reconsume.
3889     $self->{state} = BOGUS_COMMENT_STATE;
3890     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3891     redo A;
3892     }
3893     } elsif ($self->{state} == MD_ENTITY_STATE) {
3894 wakaba 1.17 if ($self->{nc} == [
3895     undef,
3896     undef,
3897     0x0054, # T
3898     0x0049, # I
3899     0x0054, # T
3900     ]->[length $self->{kwd}] or
3901     $self->{nc} == [
3902     undef,
3903     undef,
3904     0x0074, # t
3905     0x0069, # i
3906     0x0074, # t
3907     ]->[length $self->{kwd}]) {
3908 wakaba 1.14 ## Stay in the state.
3909     $self->{kwd} .= chr $self->{nc};
3910     !!!next-input-character;
3911     redo A;
3912 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3913     ($self->{nc} == 0x0059 or # Y
3914     $self->{nc} == 0x0079)) { # y
3915     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3916     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3917     text => 'ENTITY',
3918     line => $self->{line_prev},
3919     column => $self->{column_prev} - 4);
3920     }
3921     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3922 wakaba 1.14 line => $self->{line_prev},
3923     column => $self->{column_prev} - 6};
3924     $self->{state} = DOCTYPE_MD_STATE;
3925     !!!next-input-character;
3926     redo A;
3927     } else {
3928     !!!parse-error (type => 'bogus comment',
3929     line => $self->{line_prev},
3930     column => $self->{column_prev} - 1
3931     - (length $self->{kwd})
3932     + 1 * ($self->{nc} == -1));
3933     $self->{state} = BOGUS_COMMENT_STATE;
3934     ## Reconsume.
3935     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3936     redo A;
3937     }
3938     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3939 wakaba 1.17 if ($self->{nc} == [
3940     undef,
3941     undef,
3942     0x0045, # E
3943     0x004D, # M
3944     0x0045, # E
3945     0x004E, # N
3946     ]->[length $self->{kwd}] or
3947     $self->{nc} == [
3948     undef,
3949     undef,
3950     0x0065, # e
3951     0x006D, # m
3952     0x0065, # e
3953     0x006E, # n
3954     ]->[length $self->{kwd}]) {
3955 wakaba 1.14 ## Stay in the state.
3956     $self->{kwd} .= chr $self->{nc};
3957     !!!next-input-character;
3958     redo A;
3959 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3960     ($self->{nc} == 0x0054 or # T
3961     $self->{nc} == 0x0074)) { # t
3962     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3963     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3964     text => 'ELEMENT',
3965     line => $self->{line_prev},
3966     column => $self->{column_prev} - 5);
3967     }
3968 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3969     line => $self->{line_prev},
3970 wakaba 1.23 column => $self->{column_prev} - 7};
3971 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3972     !!!next-input-character;
3973     redo A;
3974     } else {
3975     !!!parse-error (type => 'bogus comment',
3976     line => $self->{line_prev},
3977     column => $self->{column_prev} - 1
3978     - (length $self->{kwd})
3979     + 1 * ($self->{nc} == -1));
3980     $self->{state} = BOGUS_COMMENT_STATE;
3981     ## Reconsume.
3982     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3983     redo A;
3984     }
3985     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3986 wakaba 1.17 if ($self->{nc} == [
3987     undef,
3988     0x0054, # T
3989     0x0054, # T
3990     0x004C, # L
3991     0x0049, # I
3992     0x0053, # S
3993     ]->[length $self->{kwd}] or
3994     $self->{nc} == [
3995     undef,
3996     0x0074, # t
3997     0x0074, # t
3998     0x006C, # l
3999     0x0069, # i
4000     0x0073, # s
4001     ]->[length $self->{kwd}]) {
4002 wakaba 1.14 ## Stay in the state.
4003     $self->{kwd} .= chr $self->{nc};
4004     !!!next-input-character;
4005     redo A;
4006 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
4007     ($self->{nc} == 0x0054 or # T
4008     $self->{nc} == 0x0074)) { # t
4009     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4010     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4011     text => 'ATTLIST',
4012     line => $self->{line_prev},
4013     column => $self->{column_prev} - 5);
4014     }
4015 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4016 wakaba 1.15 attrdefs => [],
4017 wakaba 1.14 line => $self->{line_prev},
4018 wakaba 1.23 column => $self->{column_prev} - 7};
4019 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4020     !!!next-input-character;
4021     redo A;
4022     } else {
4023     !!!parse-error (type => 'bogus comment',
4024     line => $self->{line_prev},
4025     column => $self->{column_prev} - 1
4026     - (length $self->{kwd})
4027     + 1 * ($self->{nc} == -1));
4028     $self->{state} = BOGUS_COMMENT_STATE;
4029     ## Reconsume.
4030     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4031     redo A;
4032     }
4033     } elsif ($self->{state} == MD_NOTATION_STATE) {
4034 wakaba 1.17 if ($self->{nc} == [
4035     undef,
4036     0x004F, # O
4037     0x0054, # T
4038     0x0041, # A
4039     0x0054, # T
4040     0x0049, # I
4041     0x004F, # O
4042     ]->[length $self->{kwd}] or
4043     $self->{nc} == [
4044     undef,
4045     0x006F, # o
4046     0x0074, # t
4047     0x0061, # a
4048     0x0074, # t
4049     0x0069, # i
4050     0x006F, # o
4051     ]->[length $self->{kwd}]) {
4052 wakaba 1.14 ## Stay in the state.
4053     $self->{kwd} .= chr $self->{nc};
4054     !!!next-input-character;
4055     redo A;
4056 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4057     ($self->{nc} == 0x004E or # N
4058     $self->{nc} == 0x006E)) { # n
4059     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4060     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4061     text => 'NOTATION',
4062     line => $self->{line_prev},
4063     column => $self->{column_prev} - 6);
4064     }
4065 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4066     line => $self->{line_prev},
4067 wakaba 1.23 column => $self->{column_prev} - 8};
4068 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4069     !!!next-input-character;
4070     redo A;
4071     } else {
4072     !!!parse-error (type => 'bogus comment',
4073     line => $self->{line_prev},
4074     column => $self->{column_prev} - 1
4075     - (length $self->{kwd})
4076     + 1 * ($self->{nc} == -1));
4077     $self->{state} = BOGUS_COMMENT_STATE;
4078     ## Reconsume.
4079     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4080     redo A;
4081     }
4082     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4083     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4084     ## "DOCTYPE NOTATION state".
4085    
4086     if ($is_space->{$self->{nc}}) {
4087     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4088     $self->{state} = BEFORE_MD_NAME_STATE;
4089     !!!next-input-character;
4090     redo A;
4091     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4092     $self->{nc} == 0x0025) { # %
4093     ## XML5: Switch to the "DOCTYPE bogus comment state".
4094     !!!parse-error (type => 'no space before md name'); ## TODO: type
4095     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4096     !!!next-input-character;
4097     redo A;
4098     } elsif ($self->{nc} == -1) {
4099     !!!parse-error (type => 'unclosed md'); ## TODO: type
4100     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4101     ## Reconsume.
4102     redo A;
4103     } elsif ($self->{nc} == 0x003E) { # >
4104     ## XML5: Switch to the "DOCTYPE bogus comment state".
4105     !!!parse-error (type => 'no md name'); ## TODO: type
4106     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4107     !!!next-input-character;
4108     redo A;
4109     } else {
4110     ## XML5: Switch to the "DOCTYPE bogus comment state".
4111     !!!parse-error (type => 'no space before md name'); ## TODO: type
4112     $self->{state} = BEFORE_MD_NAME_STATE;
4113     redo A;
4114     }
4115     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4116     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4117     ## before state", "DOCTYPE ATTLIST name before state".
4118    
4119     if ($is_space->{$self->{nc}}) {
4120     ## Stay in the state.
4121     !!!next-input-character;
4122     redo A;
4123     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4124     $self->{nc} == 0x0025) { # %
4125     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4126     !!!next-input-character;
4127     redo A;
4128     } elsif ($self->{nc} == 0x003E) { # >
4129     ## XML5: Same as "Anything else".
4130     !!!parse-error (type => 'no md name'); ## TODO: type
4131     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4132     !!!next-input-character;
4133     redo A;
4134     } elsif ($self->{nc} == -1) {
4135     !!!parse-error (type => 'unclosed md'); ## TODO: type
4136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137     ## Reconsume.
4138     redo A;
4139     } else {
4140     ## XML5: [ATTLIST] Not defined yet.
4141     $self->{ct}->{name} .= chr $self->{nc};
4142     $self->{state} = MD_NAME_STATE;
4143     !!!next-input-character;
4144     redo A;
4145     }
4146     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4147     if ($is_space->{$self->{nc}}) {
4148     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4149     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4150     $self->{state} = BEFORE_MD_NAME_STATE;
4151     !!!next-input-character;
4152     redo A;
4153     } elsif ($self->{nc} == 0x003E) { # >
4154     ## XML5: Same as "Anything else".
4155     !!!parse-error (type => 'no md name'); ## TODO: type
4156     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4157     !!!next-input-character;
4158     redo A;
4159     } elsif ($self->{nc} == -1) {
4160     !!!parse-error (type => 'unclosed md');
4161     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4162     ## Reconsume.
4163     redo A;
4164     } else {
4165     ## XML5: No parse error.
4166     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4167     $self->{state} = BOGUS_COMMENT_STATE;
4168     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4169     ## Reconsume.
4170     redo A;
4171     }
4172     } elsif ($self->{state} == MD_NAME_STATE) {
4173     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4174    
4175     if ($is_space->{$self->{nc}}) {
4176 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4177     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4178     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4179 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4180 wakaba 1.16 } else { # ENTITY/NOTATION
4181     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4182     }
4183 wakaba 1.14 !!!next-input-character;
4184     redo A;
4185     } elsif ($self->{nc} == 0x003E) { # >
4186     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4187     #
4188     } else {
4189 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4190 wakaba 1.14 }
4191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4192     !!!next-input-character;
4193     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4194     redo A;
4195     } elsif ($self->{nc} == -1) {
4196     ## XML5: [ATTLIST] No parse error.
4197     !!!parse-error (type => 'unclosed md');
4198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4199     ## Reconsume.
4200     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4201     redo A;
4202     } else {
4203     ## XML5: [ATTLIST] Not defined yet.
4204     $self->{ct}->{name} .= chr $self->{nc};
4205     ## Stay in the state.
4206     !!!next-input-character;
4207     redo A;
4208     }
4209     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4210     if ($is_space->{$self->{nc}}) {
4211     ## Stay in the state.
4212     !!!next-input-character;
4213     redo A;
4214     } elsif ($self->{nc} == 0x003E) { # >
4215     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4216     !!!next-input-character;
4217     !!!emit ($self->{ct}); # ATTLIST
4218     redo A;
4219     } elsif ($self->{nc} == -1) {
4220     ## XML5: No parse error.
4221     !!!parse-error (type => 'unclosed md'); ## TODO: type
4222     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4223 wakaba 1.15 !!!emit ($self->{ct});
4224     redo A;
4225     } else {
4226     ## XML5: Not defined yet.
4227     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4228     tokens => [],
4229     line => $self->{line}, column => $self->{column}};
4230     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4231     !!!next-input-character;
4232     redo A;
4233     }
4234     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4235     if ($is_space->{$self->{nc}}) {
4236     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4237     !!!next-input-character;
4238     redo A;
4239     } elsif ($self->{nc} == 0x003E) { # >
4240     ## XML5: Same as "anything else".
4241     !!!parse-error (type => 'no attr type'); ## TODO: type
4242     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4243     !!!next-input-character;
4244     !!!emit ($self->{ct}); # ATTLIST
4245     redo A;
4246     } elsif ($self->{nc} == 0x0028) { # (
4247     ## XML5: Same as "anything else".
4248     !!!parse-error (type => 'no space before paren'); ## TODO: type
4249     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4250     !!!next-input-character;
4251     redo A;
4252     } elsif ($self->{nc} == -1) {
4253     ## XML5: No parse error.
4254     !!!parse-error (type => 'unclosed md'); ## TODO: type
4255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4256     !!!next-input-character;
4257     !!!emit ($self->{ct}); # ATTLIST
4258     redo A;
4259     } else {
4260     ## XML5: Not defined yet.
4261     $self->{ca}->{name} .= chr $self->{nc};
4262     ## Stay in the state.
4263     !!!next-input-character;
4264     redo A;
4265     }
4266     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4267     if ($is_space->{$self->{nc}}) {
4268     ## Stay in the state.
4269     !!!next-input-character;
4270     redo A;
4271     } elsif ($self->{nc} == 0x003E) { # >
4272     ## XML5: Same as "anything else".
4273     !!!parse-error (type => 'no attr type'); ## TODO: type
4274     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275     !!!next-input-character;
4276     !!!emit ($self->{ct}); # ATTLIST
4277     redo A;
4278     } elsif ($self->{nc} == 0x0028) { # (
4279     ## XML5: Same as "anything else".
4280     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4281     !!!next-input-character;
4282     redo A;
4283     } elsif ($self->{nc} == -1) {
4284     ## XML5: No parse error.
4285     !!!parse-error (type => 'unclosed md'); ## TODO: type
4286     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4287     !!!next-input-character;
4288     !!!emit ($self->{ct});
4289 wakaba 1.14 redo A;
4290     } else {
4291     ## XML5: Not defined yet.
4292 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4293     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4294     !!!next-input-character;
4295     redo A;
4296     }
4297     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4298     if ($is_space->{$self->{nc}}) {
4299     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4300     !!!next-input-character;
4301     redo A;
4302     } elsif ($self->{nc} == 0x0023) { # #
4303     ## XML5: Same as "anything else".
4304     !!!parse-error (type => 'no space before default value'); ## TODO: type
4305     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4306     !!!next-input-character;
4307     redo A;
4308     } elsif ($self->{nc} == 0x0022) { # "
4309     ## XML5: Same as "anything else".
4310     !!!parse-error (type => 'no space before default value'); ## TODO: type
4311     $self->{ca}->{value} = '';
4312     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4313     !!!next-input-character;
4314     redo A;
4315     } elsif ($self->{nc} == 0x0027) { # '
4316     ## XML5: Same as "anything else".
4317     !!!parse-error (type => 'no space before default value'); ## TODO: type
4318     $self->{ca}->{value} = '';
4319     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4320     !!!next-input-character;
4321     redo A;
4322     } elsif ($self->{nc} == 0x003E) { # >
4323     ## XML5: Same as "anything else".
4324     !!!parse-error (type => 'no attr default'); ## TODO: type
4325     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4326     !!!next-input-character;
4327     !!!emit ($self->{ct}); # ATTLIST
4328     redo A;
4329     } elsif ($self->{nc} == 0x0028) { # (
4330     ## XML5: Same as "anything else".
4331     !!!parse-error (type => 'no space before paren'); ## TODO: type
4332     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4333     !!!next-input-character;
4334     redo A;
4335     } elsif ($self->{nc} == -1) {
4336     ## XML5: No parse error.
4337     !!!parse-error (type => 'unclosed md'); ## TODO: type
4338     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4339     !!!next-input-character;
4340     !!!emit ($self->{ct});
4341     redo A;
4342     } else {
4343     ## XML5: Not defined yet.
4344     $self->{ca}->{type} .= chr $self->{nc};
4345     ## Stay in the state.
4346     !!!next-input-character;
4347     redo A;
4348     }
4349     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4350     if ($is_space->{$self->{nc}}) {
4351     ## Stay in the state.
4352     !!!next-input-character;
4353     redo A;
4354     } elsif ($self->{nc} == 0x0028) { # (
4355     ## XML5: Same as "anything else".
4356     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4357     !!!next-input-character;
4358     redo A;
4359     } elsif ($self->{nc} == 0x0023) { # #
4360     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4361     !!!next-input-character;
4362     redo A;
4363     } elsif ($self->{nc} == 0x0022) { # "
4364     ## XML5: Same as "anything else".
4365     $self->{ca}->{value} = '';
4366     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4367     !!!next-input-character;
4368     redo A;
4369     } elsif ($self->{nc} == 0x0027) { # '
4370     ## XML5: Same as "anything else".
4371     $self->{ca}->{value} = '';
4372     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4373     !!!next-input-character;
4374     redo A;
4375     } elsif ($self->{nc} == 0x003E) { # >
4376     ## XML5: Same as "anything else".
4377     !!!parse-error (type => 'no attr default'); ## TODO: type
4378     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4379     !!!next-input-character;
4380     !!!emit ($self->{ct}); # ATTLIST
4381     redo A;
4382     } elsif ($self->{nc} == -1) {
4383     ## XML5: No parse error.
4384     !!!parse-error (type => 'unclosed md'); ## TODO: type
4385     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4386     !!!next-input-character;
4387     !!!emit ($self->{ct});
4388     redo A;
4389     } else {
4390     ## XML5: Switch to the "DOCTYPE bogus comment state".
4391     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4392     $self->{ca}->{value} = '';
4393     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4394     ## Reconsume.
4395     redo A;
4396     }
4397     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4398     if ($is_space->{$self->{nc}}) {
4399     ## Stay in the state.
4400     !!!next-input-character;
4401     redo A;
4402     } elsif ($self->{nc} == 0x007C) { # |
4403     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4404     ## Stay in the state.
4405     !!!next-input-character;
4406     redo A;
4407     } elsif ($self->{nc} == 0x0029) { # )
4408     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4409     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4410     !!!next-input-character;
4411     redo A;
4412     } elsif ($self->{nc} == 0x003E) { # >
4413     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4414     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4415     !!!next-input-character;
4416     !!!emit ($self->{ct}); # ATTLIST
4417     redo A;
4418     } elsif ($self->{nc} == -1) {
4419     ## XML5: No parse error.
4420     !!!parse-error (type => 'unclosed md'); ## TODO: type
4421     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4422     !!!next-input-character;
4423     !!!emit ($self->{ct});
4424     redo A;
4425     } else {
4426     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4427     $self->{state} = ALLOWED_TOKEN_STATE;
4428     !!!next-input-character;
4429     redo A;
4430     }
4431     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4432     if ($is_space->{$self->{nc}}) {
4433     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4434     !!!next-input-character;
4435     redo A;
4436     } elsif ($self->{nc} == 0x007C) { # |
4437     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4438     !!!next-input-character;
4439     redo A;
4440     } elsif ($self->{nc} == 0x0029) { # )
4441     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4442     !!!next-input-character;
4443     redo A;
4444     } elsif ($self->{nc} == 0x003E) { # >
4445     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4446     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4447     !!!next-input-character;
4448     !!!emit ($self->{ct}); # ATTLIST
4449     redo A;
4450     } elsif ($self->{nc} == -1) {
4451     ## XML5: No parse error.
4452     !!!parse-error (type => 'unclosed md'); ## TODO: type
4453     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4454     !!!next-input-character;
4455     !!!emit ($self->{ct});
4456     redo A;
4457     } else {
4458     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4459     ## Stay in the state.
4460     !!!next-input-character;
4461     redo A;
4462     }
4463     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4464     if ($is_space->{$self->{nc}}) {
4465     ## Stay in the state.
4466     !!!next-input-character;
4467     redo A;
4468     } elsif ($self->{nc} == 0x007C) { # |
4469     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4470     !!!next-input-character;
4471     redo A;
4472     } elsif ($self->{nc} == 0x0029) { # )
4473     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4474     !!!next-input-character;
4475     redo A;
4476     } elsif ($self->{nc} == 0x003E) { # >
4477     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4479     !!!next-input-character;
4480     !!!emit ($self->{ct}); # ATTLIST
4481     redo A;
4482     } elsif ($self->{nc} == -1) {
4483     ## XML5: No parse error.
4484     !!!parse-error (type => 'unclosed md'); ## TODO: type
4485     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4486     !!!next-input-character;
4487     !!!emit ($self->{ct});
4488     redo A;
4489     } else {
4490     !!!parse-error (type => 'space in allowed token', ## TODO: type
4491     line => $self->{line_prev},
4492     column => $self->{column_prev});
4493     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4494     $self->{state} = ALLOWED_TOKEN_STATE;
4495     !!!next-input-character;
4496     redo A;
4497     }
4498     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4499     if ($is_space->{$self->{nc}}) {
4500     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4501     !!!next-input-character;
4502     redo A;
4503     } elsif ($self->{nc} == 0x0023) { # #
4504     !!!parse-error (type => 'no space before default value'); ## TODO: type
4505     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4506     !!!next-input-character;
4507     redo A;
4508     } elsif ($self->{nc} == 0x0022) { # "
4509     !!!parse-error (type => 'no space before default value'); ## TODO: type
4510     $self->{ca}->{value} = '';
4511     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4512     !!!next-input-character;
4513     redo A;
4514     } elsif ($self->{nc} == 0x0027) { # '
4515     !!!parse-error (type => 'no space before default value'); ## TODO: type
4516     $self->{ca}->{value} = '';
4517     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4518     !!!next-input-character;
4519     redo A;
4520     } elsif ($self->{nc} == 0x003E) { # >
4521     !!!parse-error (type => 'no attr default'); ## TODO: type
4522     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4523     !!!next-input-character;
4524     !!!emit ($self->{ct}); # ATTLIST
4525     redo A;
4526     } elsif ($self->{nc} == -1) {
4527     !!!parse-error (type => 'unclosed md'); ## TODO: type
4528     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4529     !!!next-input-character;
4530     !!!emit ($self->{ct});
4531     redo A;
4532     } else {
4533     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4534     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4535     ## Reconsume.
4536     redo A;
4537     }
4538     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4539     if ($is_space->{$self->{nc}}) {
4540     ## Stay in the state.
4541     !!!next-input-character;
4542     redo A;
4543     } elsif ($self->{nc} == 0x0023) { # #
4544     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4545     !!!next-input-character;
4546     redo A;
4547     } elsif ($self->{nc} == 0x0022) { # "
4548     $self->{ca}->{value} = '';
4549     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4550     !!!next-input-character;
4551     redo A;
4552     } elsif ($self->{nc} == 0x0027) { # '
4553     $self->{ca}->{value} = '';
4554     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4555     !!!next-input-character;
4556     redo A;
4557     } elsif ($self->{nc} == 0x003E) { # >
4558     !!!parse-error (type => 'no attr default'); ## TODO: type
4559     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4560     !!!next-input-character;
4561     !!!emit ($self->{ct}); # ATTLIST
4562     redo A;
4563     } elsif ($self->{nc} == -1) {
4564     !!!parse-error (type => 'unclosed md'); ## TODO: type
4565     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4566     !!!next-input-character;
4567     !!!emit ($self->{ct});
4568     redo A;
4569     } else {
4570     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4571     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4572     ## Reconsume.
4573     redo A;
4574     }
4575     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4576     if ($is_space->{$self->{nc}}) {
4577     ## XML5: No parse error.
4578     !!!parse-error (type => 'no default type'); ## TODO: type
4579 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4580 wakaba 1.14 ## Reconsume.
4581     redo A;
4582 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4583     ## XML5: Same as "anything else".
4584     $self->{ca}->{value} = '';
4585     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4586     !!!next-input-character;
4587     redo A;
4588     } elsif ($self->{nc} == 0x0027) { # '
4589     ## XML5: Same as "anything else".
4590     $self->{ca}->{value} = '';
4591     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4592     !!!next-input-character;
4593     redo A;
4594     } elsif ($self->{nc} == 0x003E) { # >
4595     ## XML5: Same as "anything else".
4596     !!!parse-error (type => 'no attr default'); ## TODO: type
4597     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4598     !!!next-input-character;
4599     !!!emit ($self->{ct}); # ATTLIST
4600     redo A;
4601     } elsif ($self->{nc} == -1) {
4602     ## XML5: No parse error.
4603     !!!parse-error (type => 'unclosed md'); ## TODO: type
4604     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4605     !!!next-input-character;
4606     !!!emit ($self->{ct});
4607     redo A;
4608     } else {
4609     $self->{ca}->{default} = chr $self->{nc};
4610     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4611     !!!next-input-character;
4612     redo A;
4613 wakaba 1.14 }
4614 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4615     if ($is_space->{$self->{nc}}) {
4616     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4617     !!!next-input-character;
4618     redo A;
4619     } elsif ($self->{nc} == 0x0022) { # "
4620     ## XML5: Same as "anything else".
4621     !!!parse-error (type => 'no space before default value'); ## TODO: type
4622     $self->{ca}->{value} = '';
4623     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4624     !!!next-input-character;
4625     redo A;
4626     } elsif ($self->{nc} == 0x0027) { # '
4627     ## XML5: Same as "anything else".
4628     !!!parse-error (type => 'no space before default value'); ## TODO: type
4629     $self->{ca}->{value} = '';
4630     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4631     !!!next-input-character;
4632     redo A;
4633     } elsif ($self->{nc} == 0x003E) { # >
4634     ## XML5: Same as "anything else".
4635     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4636     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4637     !!!next-input-character;
4638     !!!emit ($self->{ct}); # ATTLIST
4639     redo A;
4640     } elsif ($self->{nc} == -1) {
4641     ## XML5: No parse error.
4642     !!!parse-error (type => 'unclosed md'); ## TODO: type
4643     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4644     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4645     !!!next-input-character;
4646     !!!emit ($self->{ct});
4647     redo A;
4648     } else {
4649     $self->{ca}->{default} .= chr $self->{nc};
4650     ## Stay in the state.
4651     !!!next-input-character;
4652     redo A;
4653     }
4654     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4655     if ($is_space->{$self->{nc}}) {
4656     ## Stay in the state.
4657     !!!next-input-character;
4658     redo A;
4659     } elsif ($self->{nc} == 0x0022) { # "
4660     $self->{ca}->{value} = '';
4661     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4662     !!!next-input-character;
4663     redo A;
4664     } elsif ($self->{nc} == 0x0027) { # '
4665     $self->{ca}->{value} = '';
4666     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4667     !!!next-input-character;
4668     redo A;
4669     } elsif ($self->{nc} == 0x003E) { # >
4670     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4671     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4672     !!!next-input-character;
4673     !!!emit ($self->{ct}); # ATTLIST
4674     redo A;
4675     } elsif ($self->{nc} == -1) {
4676     ## XML5: No parse error.
4677     !!!parse-error (type => 'unclosed md'); ## TODO: type
4678     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4679     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4680     !!!next-input-character;
4681     !!!emit ($self->{ct});
4682     redo A;
4683     } else {
4684     ## XML5: Not defined yet.
4685     if ($self->{ca}->{default} eq 'FIXED') {
4686     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4687     } else {
4688     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4689     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4690     }
4691     ## Reconsume.
4692     redo A;
4693     }
4694     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4695     if ($is_space->{$self->{nc}} or
4696     $self->{nc} == -1 or
4697     $self->{nc} == 0x003E) { # >
4698     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4699     ## Reconsume.
4700     redo A;
4701     } else {
4702     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4703     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4704     ## Reconsume.
4705     redo A;
4706 wakaba 1.16 }
4707 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4708     ## ASCII case-insensitive
4709     if ($self->{nc} == [
4710     undef,
4711     0x0044, # D
4712     0x0041, # A
4713     0x0054, # T
4714     ]->[length $self->{kwd}] or
4715     $self->{nc} == [
4716     undef,
4717     0x0064, # d
4718     0x0061, # a
4719     0x0074, # t
4720     ]->[length $self->{kwd}]) {
4721     !!!cp (172.2);
4722     ## Stay in the state.
4723     $self->{kwd} .= chr $self->{nc};
4724     !!!next-input-character;
4725     redo A;
4726     } elsif ((length $self->{kwd}) == 4 and
4727     ($self->{nc} == 0x0041 or # A
4728     $self->{nc} == 0x0061)) { # a
4729     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4730     !!!cp (172.3);
4731     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4732     text => 'NDATA',
4733     line => $self->{line_prev},
4734     column => $self->{column_prev} - 4);
4735     } else {
4736     !!!cp (172.4);
4737     }
4738     $self->{state} = AFTER_NDATA_STATE;
4739     !!!next-input-character;
4740     redo A;
4741     } else {
4742     !!!parse-error (type => 'string after literal', ## TODO: type
4743     line => $self->{line_prev},
4744     column => $self->{column_prev} + 1
4745     - length $self->{kwd});
4746     !!!cp (172.5);
4747     $self->{state} = BOGUS_MD_STATE;
4748     ## Reconsume.
4749     redo A;
4750     }
4751     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4752     if ($is_space->{$self->{nc}}) {
4753     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4754     !!!next-input-character;
4755     redo A;
4756     } elsif ($self->{nc} == 0x003E) { # >
4757     !!!parse-error (type => 'no notation name'); ## TODO: type
4758     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4759     !!!next-input-character;
4760     !!!emit ($self->{ct}); # ENTITY
4761     redo A;
4762     } elsif ($self->{nc} == -1) {
4763     !!!parse-error (type => 'unclosed md'); ## TODO: type
4764     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4765     !!!next-input-character;
4766     !!!emit ($self->{ct}); # ENTITY
4767     redo A;
4768     } else {
4769     !!!parse-error (type => 'string after literal', ## TODO: type
4770     line => $self->{line_prev},
4771     column => $self->{column_prev} + 1
4772     - length $self->{kwd});
4773     $self->{state} = BOGUS_MD_STATE;
4774     ## Reconsume.
4775     redo A;
4776     }
4777     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4778     if ($is_space->{$self->{nc}}) {
4779     ## Stay in the state.
4780     !!!next-input-character;
4781     redo A;
4782     } elsif ($self->{nc} == 0x003E) { # >
4783     !!!parse-error (type => 'no notation name'); ## TODO: type
4784     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4785     !!!next-input-character;
4786     !!!emit ($self->{ct}); # ENTITY
4787     redo A;
4788     } elsif ($self->{nc} == -1) {
4789     !!!parse-error (type => 'unclosed md'); ## TODO: type
4790     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4791     !!!next-input-character;
4792     !!!emit ($self->{ct}); # ENTITY
4793     redo A;
4794     } else {
4795     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4796     $self->{state} = NOTATION_NAME_STATE;
4797     !!!next-input-character;
4798     redo A;
4799     }
4800     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4801     if ($is_space->{$self->{nc}}) {
4802 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4803 wakaba 1.18 !!!next-input-character;
4804     redo A;
4805     } elsif ($self->{nc} == 0x003E) { # >
4806     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4807     !!!next-input-character;
4808     !!!emit ($self->{ct}); # ENTITY
4809     redo A;
4810     } elsif ($self->{nc} == -1) {
4811     !!!parse-error (type => 'unclosed md'); ## TODO: type
4812     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4813     !!!next-input-character;
4814     !!!emit ($self->{ct}); # ENTITY
4815     redo A;
4816     } else {
4817     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4818     ## Stay in the state.
4819     !!!next-input-character;
4820     redo A;
4821     }
4822 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4823     if ($self->{nc} == 0x0022) { # "
4824 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4825 wakaba 1.19 !!!next-input-character;
4826     redo A;
4827     } elsif ($self->{nc} == 0x0026) { # &
4828     $self->{prev_state} = $self->{state};
4829     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4830     $self->{entity_add} = 0x0022; # "
4831     !!!next-input-character;
4832     redo A;
4833     ## TODO: %
4834     } elsif ($self->{nc} == -1) {
4835     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4836     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4837     ## Reconsume.
4838     !!!emit ($self->{ct}); # ENTITY
4839     redo A;
4840     } else {
4841     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4842     !!!next-input-character;
4843     redo A;
4844     }
4845     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4846     if ($self->{nc} == 0x0027) { # '
4847 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4848 wakaba 1.19 !!!next-input-character;
4849     redo A;
4850     } elsif ($self->{nc} == 0x0026) { # &
4851     $self->{prev_state} = $self->{state};
4852     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4853     $self->{entity_add} = 0x0027; # '
4854     !!!next-input-character;
4855     redo A;
4856     ## TODO: %
4857     } elsif ($self->{nc} == -1) {
4858     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4859     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4860     ## Reconsume.
4861     !!!emit ($self->{ct}); # ENTITY
4862     redo A;
4863     } else {
4864     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4865     !!!next-input-character;
4866     redo A;
4867     }
4868     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4869     if ($is_space->{$self->{nc}} or
4870     {
4871     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872     $self->{entity_add} => 1,
4873     }->{$self->{nc}}) {
4874 wakaba 1.22 !!!parse-error (type => 'bare ero',
4875     line => $self->{line_prev},
4876     column => $self->{column_prev}
4877     + ($self->{nc} == -1 ? 1 : 0));
4878 wakaba 1.19 ## Don't consume
4879     ## Return nothing.
4880     #
4881     } elsif ($self->{nc} == 0x0023) { # #
4882     $self->{ca} = $self->{ct};
4883     $self->{state} = ENTITY_HASH_STATE;
4884     $self->{kwd} = '#';
4885     !!!next-input-character;
4886     redo A;
4887     } else {
4888     #
4889     }
4890    
4891     $self->{ct}->{value} .= '&';
4892     $self->{state} = $self->{prev_state};
4893     ## Reconsume.
4894     redo A;
4895 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4896     if ($is_space->{$self->{nc}}) {
4897     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4898     !!!next-input-character;
4899     redo A;
4900     } elsif ($self->{nc} == 0x0028) { # (
4901     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4902     $self->{ct}->{content} = ['('];
4903     $self->{group_depth} = 1;
4904     !!!next-input-character;
4905     redo A;
4906     } elsif ($self->{nc} == 0x003E) { # >
4907     !!!parse-error (type => 'no md def'); ## TODO: type
4908     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4909     !!!next-input-character;
4910     !!!emit ($self->{ct}); # ELEMENT
4911     redo A;
4912     } elsif ($self->{nc} == -1) {
4913     !!!parse-error (type => 'unclosed md'); ## TODO: type
4914     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4915     !!!next-input-character;
4916     !!!emit ($self->{ct}); # ELEMENT
4917     redo A;
4918     } else {
4919     $self->{ct}->{content} = [chr $self->{nc}];
4920     $self->{state} = CONTENT_KEYWORD_STATE;
4921     !!!next-input-character;
4922     redo A;
4923     }
4924     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4925     if ($is_space->{$self->{nc}}) {
4926     $self->{state} = AFTER_MD_DEF_STATE;
4927     !!!next-input-character;
4928     redo A;
4929     } elsif ($self->{nc} == 0x003E) { # >
4930     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4931     !!!next-input-character;
4932     !!!emit ($self->{ct}); # ELEMENT
4933     redo A;
4934     } elsif ($self->{nc} == -1) {
4935     !!!parse-error (type => 'unclosed md'); ## TODO: type
4936     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4937     !!!next-input-character;
4938     !!!emit ($self->{ct}); # ELEMENT
4939     redo A;
4940     } else {
4941     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4942     ## Stay in the state.
4943     !!!next-input-character;
4944     redo A;
4945     }
4946     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4947     if ($is_space->{$self->{nc}}) {
4948     ## Stay in the state.
4949     !!!next-input-character;
4950     redo A;
4951     } elsif ($self->{nc} == 0x0028) { # (
4952     $self->{group_depth}++;
4953     push @{$self->{ct}->{content}}, chr $self->{nc};
4954     ## Stay in the state.
4955     !!!next-input-character;
4956     redo A;
4957     } elsif ($self->{nc} == 0x007C or # |
4958     $self->{nc} == 0x002C) { # ,
4959     !!!parse-error (type => 'empty element name'); ## TODO: type
4960     ## Stay in the state.
4961     !!!next-input-character;
4962     redo A;
4963     } elsif ($self->{nc} == 0x0029) { # )
4964     !!!parse-error (type => 'empty element name'); ## TODO: type
4965     push @{$self->{ct}->{content}}, chr $self->{nc};
4966     $self->{group_depth}--;
4967     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4968     !!!next-input-character;
4969     redo A;
4970     } elsif ($self->{nc} == 0x003E) { # >
4971     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4972     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4973     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4974     !!!next-input-character;
4975     !!!emit ($self->{ct}); # ELEMENT
4976     redo A;
4977     } elsif ($self->{nc} == -1) {
4978     !!!parse-error (type => 'unclosed md'); ## TODO: type
4979     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4981     !!!next-input-character;
4982     !!!emit ($self->{ct}); # ELEMENT
4983     redo A;
4984     } else {
4985     push @{$self->{ct}->{content}}, chr $self->{nc};
4986     $self->{state} = CM_ELEMENT_NAME_STATE;
4987     !!!next-input-character;
4988     redo A;
4989     }
4990     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4991     if ($is_space->{$self->{nc}}) {
4992     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4993     !!!next-input-character;
4994     redo A;
4995     } elsif ($self->{nc} == 0x002A or # *
4996     $self->{nc} == 0x002B or # +
4997     $self->{nc} == 0x003F) { # ?
4998     push @{$self->{ct}->{content}}, chr $self->{nc};
4999     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5000     !!!next-input-character;
5001     redo A;
5002     } elsif ($self->{nc} == 0x007C or # |
5003     $self->{nc} == 0x002C) { # ,
5004     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5005     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5006     !!!next-input-character;
5007     redo A;
5008     } elsif ($self->{nc} == 0x0029) { # )
5009     $self->{group_depth}--;
5010     push @{$self->{ct}->{content}}, chr $self->{nc};
5011     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5012     !!!next-input-character;
5013     redo A;
5014     } elsif ($self->{nc} == 0x003E) { # >
5015     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5016     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5017     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5018     !!!next-input-character;
5019     !!!emit ($self->{ct}); # ELEMENT
5020     redo A;
5021     } elsif ($self->{nc} == -1) {
5022     !!!parse-error (type => 'unclosed md'); ## TODO: type
5023     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5024     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5025     !!!next-input-character;
5026     !!!emit ($self->{ct}); # ELEMENT
5027     redo A;
5028     } else {
5029     $self->{ct}->{content}->[-1] .= chr $self->{nc};
5030     ## Stay in the state.
5031     !!!next-input-character;
5032     redo A;
5033     }
5034     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5035     if ($is_space->{$self->{nc}}) {
5036     ## Stay in the state.
5037     !!!next-input-character;
5038     redo A;
5039     } elsif ($self->{nc} == 0x007C or # |
5040     $self->{nc} == 0x002C) { # ,
5041     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5042     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5043     !!!next-input-character;
5044     redo A;
5045     } elsif ($self->{nc} == 0x0029) { # )
5046     $self->{group_depth}--;
5047     push @{$self->{ct}->{content}}, chr $self->{nc};
5048     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5049     !!!next-input-character;
5050     redo A;
5051     } elsif ($self->{nc} == 0x003E) { # >
5052     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5053     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5054     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5055     !!!next-input-character;
5056     !!!emit ($self->{ct}); # ELEMENT
5057     redo A;
5058     } elsif ($self->{nc} == -1) {
5059     !!!parse-error (type => 'unclosed md'); ## TODO: type
5060     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5061     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5062     !!!next-input-character;
5063     !!!emit ($self->{ct}); # ELEMENT
5064     redo A;
5065     } else {
5066     !!!parse-error (type => 'after element name'); ## TODO: type
5067     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5068     $self->{state} = BOGUS_MD_STATE;
5069     !!!next-input-character;
5070     redo A;
5071     }
5072     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5073     if ($is_space->{$self->{nc}}) {
5074     if ($self->{group_depth}) {
5075     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5076     } else {
5077     $self->{state} = AFTER_MD_DEF_STATE;
5078     }
5079     !!!next-input-character;
5080     redo A;
5081     } elsif ($self->{nc} == 0x002A or # *
5082     $self->{nc} == 0x002B or # +
5083     $self->{nc} == 0x003F) { # ?
5084     push @{$self->{ct}->{content}}, chr $self->{nc};
5085     if ($self->{group_depth}) {
5086     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5087     } else {
5088     $self->{state} = AFTER_MD_DEF_STATE;
5089     }
5090     !!!next-input-character;
5091     redo A;
5092     } elsif ($self->{nc} == 0x0029) { # )
5093     if ($self->{group_depth}) {
5094     $self->{group_depth}--;
5095     push @{$self->{ct}->{content}}, chr $self->{nc};
5096     ## Stay in the state.
5097     !!!next-input-character;
5098     redo A;
5099     } else {
5100     !!!parse-error (type => 'string after md def'); ## TODO: type
5101     $self->{state} = BOGUS_MD_STATE;
5102     ## Reconsume.
5103     redo A;
5104     }
5105     } elsif ($self->{nc} == 0x003E) { # >
5106     if ($self->{group_depth}) {
5107     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5108     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5109     }
5110     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5111     !!!next-input-character;
5112     !!!emit ($self->{ct}); # ELEMENT
5113     redo A;
5114     } elsif ($self->{nc} == -1) {
5115     !!!parse-error (type => 'unclosed md'); ## TODO: type
5116     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5117     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5118     !!!next-input-character;
5119     !!!emit ($self->{ct}); # ELEMENT
5120     redo A;
5121     } else {
5122     if ($self->{group_depth}) {
5123     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5124     } else {
5125     !!!parse-error (type => 'string after md def'); ## TODO: type
5126     $self->{state} = BOGUS_MD_STATE;
5127     }
5128     ## Reconsume.
5129     redo A;
5130     }
5131     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5132 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5133     ## Stay in the state.
5134     !!!next-input-character;
5135     redo A;
5136     } elsif ($self->{nc} == 0x003E) { # >
5137     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5138     !!!next-input-character;
5139 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5140 wakaba 1.18 redo A;
5141     } elsif ($self->{nc} == -1) {
5142     !!!parse-error (type => 'unclosed md'); ## TODO: type
5143     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5144     !!!next-input-character;
5145 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5146 wakaba 1.18 redo A;
5147     } else {
5148 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5149 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5150     ## Reconsume.
5151     redo A;
5152     }
5153 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5154     if ($self->{nc} == 0x003E) { # >
5155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5156     !!!next-input-character;
5157     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5158     redo A;
5159     } elsif ($self->{nc} == -1) {
5160     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5161     ## Reconsume.
5162     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5163     redo A;
5164     } else {
5165     ## Stay in the state.
5166     !!!next-input-character;
5167     redo A;
5168     }
5169 wakaba 1.1 } else {
5170     die "$0: $self->{state}: Unknown state";
5171     }
5172     } # A
5173    
5174     die "$0: _get_next_token: unexpected case";
5175     } # _get_next_token
5176    
5177     1;
5178 wakaba 1.31 ## $Date: 2009/08/16 05:24:47 $
5179 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24