/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (hide annotations) (download) (as text)
Sun Oct 19 04:39:25 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.16: +113 -42 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	19 Oct 2008 04:38:53 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* notations-1.dat, notations-1.dat: Tests on lowercase markup
	declaration keywords are added.

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 04:37:30 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Make keywords 'ENTITY',
	'ELEMENT', 'ATTLIST', and 'NOTATION' ASCII case-insensitive.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.17 our $VERSION=do{my @r=(q$Revision: 1.16 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Export setup, performed at compile time inside BEGIN.  The package
## becomes an Exporter subclass; every token type constant can be imported
## by name (@EXPORT_OK), and the |:token| tag imports the whole set at once
## (the two lists below contain the same names).
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be requested individually.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## Group tag: |import (':token')| brings in all token type constants.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Each token type is a constant sub (empty prototype) returning a distinct
## integer from 1 to 14.  The tokenizer stores the value in a token's
## |->{type}| field and tests it numerically with |==|.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The CM_* constants are single-bit flags naming one kind of markup each.
## A *_CONTENT_MODEL value is the bitwise OR of the kinds recognized in
## that model; the tokenizer tests |$self->{content_model}| against the
## flags with |&|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & and limited <
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & and any <
87    
88     ## Tokenizer states
89    
## Each tokenizer state is a constant sub returning a unique integer.  The
## current state is kept in |$self->{state}| and compared with |==| in
## |_get_next_token|.  Values 1 and 12 are not assigned: the states that
## carried them survive only as the commented-out lines below.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
## Entered from |DATA_STATE| when the input is not XML and the content
## model is PCDATA; handles only "&", "<" and end of input specially.
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
## Tokenizer states numbered 51..85, continuing the numbering used for the
## other state constants.  |PI_STATE| is entered from the tag open state
## only when |$self->{is_xml}| is true.
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.16 sub BOGUS_MD_STATE () { 85 }
181 wakaba 1.8
182 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
183     ## list and descriptions)
184    
## Both literals denote the same number (bit 11 set, i.e. 2048); the
## underscore in |FOREIGN_EL| is only a digit separator.
## NOTE(review): presumably the two constants belong to different flag
## sets in Whatpm::HTML, so the shared value is harmless - confirm there.
185     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
186     sub FOREIGN_EL () { 0b1_00000000000 }
187    
188     ## Character reference mappings
189    
## Replacement table keyed by code point number; each value is the code
## point to use instead.  0x0D maps to LINE FEED, keys 0x80..0x9F map to
## the explicit targets below (they appear to follow the Windows-1252
## layout, with U+FFFD for the unassigned slots), and every code point
## listed in the |for| statement after the hash maps to U+FFFD.
## NOTE(review): presumably consulted when numeric character references
## are resolved; the lookup code is outside this part of the file - confirm.
190     my $charref_map = {
191     0x0D => 0x000A,
192     0x80 => 0x20AC,
193     0x81 => 0xFFFD,
194     0x82 => 0x201A,
195     0x83 => 0x0192,
196     0x84 => 0x201E,
197     0x85 => 0x2026,
198     0x86 => 0x2020,
199     0x87 => 0x2021,
200     0x88 => 0x02C6,
201     0x89 => 0x2030,
202     0x8A => 0x0160,
203     0x8B => 0x2039,
204     0x8C => 0x0152,
205     0x8D => 0xFFFD,
206     0x8E => 0x017D,
207     0x8F => 0xFFFD,
208     0x90 => 0xFFFD,
209     0x91 => 0x2018,
210     0x92 => 0x2019,
211     0x93 => 0x201C,
212     0x94 => 0x201D,
213     0x95 => 0x2022,
214     0x96 => 0x2013,
215     0x97 => 0x2014,
216     0x98 => 0x02DC,
217     0x99 => 0x2122,
218     0x9A => 0x0161,
219     0x9B => 0x203A,
220     0x9C => 0x0153,
221     0x9D => 0xFFFD,
222     0x9E => 0x017E,
223     0x9F => 0x0178,
224     }; # $charref_map
## Map to U+FFFD: the C0 controls other than HT, LF, FF and CR, then
## DELETE, the surrogate range, and the listed noncharacters.  The
## 0xFDD0 range stops at 0xFDDF although Unicode noncharacters run to
## U+FDEF; the ISSUE marker below records that open question.
225     $charref_map->{$_} = 0xFFFD
226     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
227     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
228     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
229     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
230     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
231     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
232     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
233    
234     ## Implementations MUST act as if they used the state machine in the spec.
235    
## _initialize_tokenizer ($self)
##
## Resets the tokenizer fields of |$self|: the state becomes |DATA_STATE|
## with the PCDATA content model, the current token, current attribute and
## last start tag name are cleared, the self-closing flag is removed, and
## the character buffer and the list of pending tokens are emptied.
## |$self->{nc}| is first set to -1 and then the |!!!next-input-character|
## directive runs (presumably expanded by the .src preprocessor to load the
## first input character - confirm against the build script).
236     sub _initialize_tokenizer ($) {
237     my $self = shift;
238    
239     ## NOTE: Fields set by |new| constructor:
240     #$self->{level}
241     #$self->{set_nc}
242     #$self->{parse_error}
243 wakaba 1.3 #$self->{is_xml} (if XML)
244 wakaba 1.1
245     $self->{state} = DATA_STATE; # MUST
246 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
247     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
248 wakaba 1.1 #$self->{entity__value}; # initialized when used
249     #$self->{entity__match}; # initialized when used
250     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
251     undef $self->{ct}; # current token
252     undef $self->{ca}; # current attribute
253     undef $self->{last_stag_name}; # last emitted start tag name
254     #$self->{prev_state}; # initialized when used
255     delete $self->{self_closing};
256     $self->{char_buffer} = '';
257     $self->{char_buffer_pos} = 0;
258     $self->{nc} = -1; # next input character
259     #$self->{next_nc}
260     !!!next-input-character;
261     $self->{token} = []; # tokens waiting to be returned by |_get_next_token|
## NOTE(review): |$self->{escape}| is listed below but is not reset here;
## confirm that a reused tokenizer cannot start with the flag still set.
262     # $self->{escape}
263     } # _initialize_tokenizer
264    
265     ## A token has:
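266     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
267 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, ABORT_TOKEN, or (XML only)
        ## END_OF_DOCTYPE_TOKEN, ATTLIST_TOKEN, ELEMENT_TOKEN, GENERAL_ENTITY_TOKEN,
        ## PARAMETER_ENTITY_TOKEN, NOTATION_TOKEN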
268 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
269     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
270 wakaba 1.11 ## ->{target} (PI_TOKEN)
271 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
272     ## ->{sysid} (DOCTYPE_TOKEN)
273     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
274     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
275     ## ->{name}
276     ## ->{value}
277     ## ->{has_reference} == 1 or 0
278 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
279     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
280 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
281 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
282 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
283    
284 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
285     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
286     ## while the token is pushed back to the stack.
287    
288     ## Emitted token MUST immediately be handled by the tree construction state.
289    
290     ## Before each step, UA MAY check to see if either one of the scripts in
291     ## "list of scripts that will execute as soon as possible" or the first
292     ## script in the "list of scripts that will execute asynchronously",
293     ## has completed loading. If one has, then it MUST be executed
294     ## and removed from the list.
295    
296     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
297     ## (This requirement was dropped from HTML5 spec, unfortunately.)
298    
## Set of code points the tokenizer treats as white space, looked up as
## |$is_space->{$self->{nc}}|.  A code point without an entry reads as
## undef (false), so VT and CR, whose lines are commented out, and the
## end-of-input value -1 are not spaces.
## NOTE(review): CR is presumably normalized away before characters reach
## the tokenizer - confirm against the input layer.
299     my $is_space = {
300     0x0009 => 1, # CHARACTER TABULATION (HT)
301     0x000A => 1, # LINE FEED (LF)
302     #0x000B => 0, # LINE TABULATION (VT)
303 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
304 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
305     0x0020 => 1, # SPACE (SP)
306     };
307    
308     sub _get_next_token ($) {
309     my $self = shift;
310    
311     if ($self->{self_closing}) {
312     !!!parse-error (type => 'nestc', token => $self->{ct});
313     ## NOTE: The |self_closing| flag is only set by start tag token.
314     ## In addition, when a start tag token is emitted, it is always set to
315     ## |ct|.
316     delete $self->{self_closing};
317     }
318    
319     if (@{$self->{token}}) {
320     $self->{self_closing} = $self->{token}->[0]->{self_closing};
321     return shift @{$self->{token}};
322     }
323    
324     A: {
325     if ($self->{state} == PCDATA_STATE) {
326     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
327    
328     if ($self->{nc} == 0x0026) { # &
329     !!!cp (0.1);
330     ## NOTE: In the spec, the tokenizer is switched to the
331     ## "entity data state". In this implementation, the tokenizer
332     ## is switched to the |ENTITY_STATE|, which is an implementation
333     ## of the "consume a character reference" algorithm.
334     $self->{entity_add} = -1;
335     $self->{prev_state} = DATA_STATE;
336     $self->{state} = ENTITY_STATE;
337     !!!next-input-character;
338     redo A;
339     } elsif ($self->{nc} == 0x003C) { # <
340     !!!cp (0.2);
341     $self->{state} = TAG_OPEN_STATE;
342     !!!next-input-character;
343     redo A;
344     } elsif ($self->{nc} == -1) {
345     !!!cp (0.3);
346     !!!emit ({type => END_OF_FILE_TOKEN,
347     line => $self->{line}, column => $self->{column}});
348     last A; ## TODO: ok?
349     } else {
350     !!!cp (0.4);
351     #
352     }
353    
354     # Anything else
355     my $token = {type => CHARACTER_TOKEN,
356     data => chr $self->{nc},
357     line => $self->{line}, column => $self->{column},
358     };
359     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
360    
361     ## Stay in the state.
362     !!!next-input-character;
363     !!!emit ($token);
364     redo A;
365     } elsif ($self->{state} == DATA_STATE) {
366     $self->{s_kwd} = '' unless defined $self->{s_kwd};
367     if ($self->{nc} == 0x0026) { # &
368     $self->{s_kwd} = '';
369     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
370     not $self->{escape}) {
371     !!!cp (1);
372     ## NOTE: In the spec, the tokenizer is switched to the
373     ## "entity data state". In this implementation, the tokenizer
374     ## is switched to the |ENTITY_STATE|, which is an implementation
375     ## of the "consume a character reference" algorithm.
376     $self->{entity_add} = -1;
377     $self->{prev_state} = DATA_STATE;
378     $self->{state} = ENTITY_STATE;
379     !!!next-input-character;
380     redo A;
381     } else {
382     !!!cp (2);
383     #
384     }
385     } elsif ($self->{nc} == 0x002D) { # -
386     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
387 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
388 wakaba 1.1 !!!cp (3);
389     $self->{escape} = 1; # unless $self->{escape};
390     $self->{s_kwd} = '--';
391     #
392 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
393 wakaba 1.1 !!!cp (4);
394     $self->{s_kwd} = '--';
395     #
396 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
397     !!!cp (4.1);
398     $self->{s_kwd} .= '-';
399     #
400 wakaba 1.1 } else {
401     !!!cp (5);
402 wakaba 1.5 $self->{s_kwd} = '-';
403 wakaba 1.1 #
404     }
405     }
406    
407     #
408     } elsif ($self->{nc} == 0x0021) { # !
409     if (length $self->{s_kwd}) {
410     !!!cp (5.1);
411     $self->{s_kwd} .= '!';
412     #
413     } else {
414     !!!cp (5.2);
415     #$self->{s_kwd} = '';
416     #
417     }
418     #
419     } elsif ($self->{nc} == 0x003C) { # <
420     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
421     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
422     not $self->{escape})) {
423     !!!cp (6);
424     $self->{state} = TAG_OPEN_STATE;
425     !!!next-input-character;
426     redo A;
427     } else {
428     !!!cp (7);
429     $self->{s_kwd} = '';
430     #
431     }
432     } elsif ($self->{nc} == 0x003E) { # >
433     if ($self->{escape} and
434     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
435     if ($self->{s_kwd} eq '--') {
436     !!!cp (8);
437     delete $self->{escape};
438 wakaba 1.5 #
439 wakaba 1.1 } else {
440     !!!cp (9);
441 wakaba 1.5 #
442 wakaba 1.1 }
443 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
444     !!!cp (9.1);
445     !!!parse-error (type => 'unmatched mse', ## TODO: type
446     line => $self->{line_prev},
447     column => $self->{column_prev} - 1);
448     #
449 wakaba 1.1 } else {
450     !!!cp (10);
451 wakaba 1.5 #
452 wakaba 1.1 }
453    
454     $self->{s_kwd} = '';
455     #
456 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
457     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
458     !!!cp (10.1);
459     $self->{s_kwd} .= ']';
460     } elsif ($self->{s_kwd} eq ']]') {
461     !!!cp (10.2);
462     #
463     } else {
464     !!!cp (10.3);
465     $self->{s_kwd} = '';
466     }
467     #
468 wakaba 1.1 } elsif ($self->{nc} == -1) {
469     !!!cp (11);
470     $self->{s_kwd} = '';
471     !!!emit ({type => END_OF_FILE_TOKEN,
472     line => $self->{line}, column => $self->{column}});
473     last A; ## TODO: ok?
474     } else {
475     !!!cp (12);
476     $self->{s_kwd} = '';
477     #
478     }
479    
480     # Anything else
481     my $token = {type => CHARACTER_TOKEN,
482     data => chr $self->{nc},
483     line => $self->{line}, column => $self->{column},
484     };
485 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
486 wakaba 1.1 length $token->{data})) {
487     $self->{s_kwd} = '';
488     }
489    
490     ## Stay in the data state.
491 wakaba 1.5 if (not $self->{is_xml} and
492     $self->{content_model} == PCDATA_CONTENT_MODEL) {
493 wakaba 1.1 !!!cp (13);
494     $self->{state} = PCDATA_STATE;
495     } else {
496     !!!cp (14);
497     ## Stay in the state.
498     }
499     !!!next-input-character;
500     !!!emit ($token);
501     redo A;
502     } elsif ($self->{state} == TAG_OPEN_STATE) {
503 wakaba 1.10 ## XML5: "tag state".
504    
505 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
506     if ($self->{nc} == 0x002F) { # /
507     !!!cp (15);
508     !!!next-input-character;
509     $self->{state} = CLOSE_TAG_OPEN_STATE;
510     redo A;
511     } elsif ($self->{nc} == 0x0021) { # !
512     !!!cp (15.1);
513 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
514 wakaba 1.1 #
515     } else {
516     !!!cp (16);
517 wakaba 1.12 $self->{s_kwd} = '';
518 wakaba 1.1 #
519     }
520    
521     ## reconsume
522     $self->{state} = DATA_STATE;
523     !!!emit ({type => CHARACTER_TOKEN, data => '<',
524     line => $self->{line_prev},
525     column => $self->{column_prev},
526     });
527     redo A;
528     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
529     if ($self->{nc} == 0x0021) { # !
530     !!!cp (17);
531     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
532     !!!next-input-character;
533     redo A;
534     } elsif ($self->{nc} == 0x002F) { # /
535     !!!cp (18);
536     $self->{state} = CLOSE_TAG_OPEN_STATE;
537     !!!next-input-character;
538     redo A;
539     } elsif (0x0041 <= $self->{nc} and
540     $self->{nc} <= 0x005A) { # A..Z
541     !!!cp (19);
542     $self->{ct}
543     = {type => START_TAG_TOKEN,
544 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
545 wakaba 1.1 line => $self->{line_prev},
546     column => $self->{column_prev}};
547     $self->{state} = TAG_NAME_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif (0x0061 <= $self->{nc} and
551     $self->{nc} <= 0x007A) { # a..z
552     !!!cp (20);
553     $self->{ct} = {type => START_TAG_TOKEN,
554     tag_name => chr ($self->{nc}),
555     line => $self->{line_prev},
556     column => $self->{column_prev}};
557     $self->{state} = TAG_NAME_STATE;
558     !!!next-input-character;
559     redo A;
560     } elsif ($self->{nc} == 0x003E) { # >
561     !!!cp (21);
562     !!!parse-error (type => 'empty start tag',
563     line => $self->{line_prev},
564     column => $self->{column_prev});
565     $self->{state} = DATA_STATE;
566 wakaba 1.5 $self->{s_kwd} = '';
567 wakaba 1.1 !!!next-input-character;
568    
569     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
570     line => $self->{line_prev},
571     column => $self->{column_prev},
572     });
573    
574     redo A;
575     } elsif ($self->{nc} == 0x003F) { # ?
576 wakaba 1.8 if ($self->{is_xml}) {
577     !!!cp (22.1);
578     $self->{state} = PI_STATE;
579     !!!next-input-character;
580     redo A;
581     } else {
582     !!!cp (22);
583     !!!parse-error (type => 'pio',
584     line => $self->{line_prev},
585     column => $self->{column_prev});
586     $self->{state} = BOGUS_COMMENT_STATE;
587     $self->{ct} = {type => COMMENT_TOKEN, data => '',
588     line => $self->{line_prev},
589     column => $self->{column_prev},
590     };
591     ## $self->{nc} is intentionally left as is
592     redo A;
593     }
594 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
595 wakaba 1.1 !!!cp (23);
596     !!!parse-error (type => 'bare stago',
597     line => $self->{line_prev},
598     column => $self->{column_prev});
599     $self->{state} = DATA_STATE;
600 wakaba 1.5 $self->{s_kwd} = '';
601 wakaba 1.1 ## reconsume
602    
603     !!!emit ({type => CHARACTER_TOKEN, data => '<',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     });
607    
608     redo A;
609 wakaba 1.9 } else {
610     ## XML5: "<:" is a parse error.
611     !!!cp (23.1);
612     $self->{ct} = {type => START_TAG_TOKEN,
613     tag_name => chr ($self->{nc}),
614     line => $self->{line_prev},
615     column => $self->{column_prev}};
616     $self->{state} = TAG_NAME_STATE;
617     !!!next-input-character;
618     redo A;
619 wakaba 1.1 }
620     } else {
621     die "$0: $self->{content_model} in tag open";
622     }
623     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
624     ## NOTE: The "close tag open state" in the spec is implemented as
625     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
626    
627 wakaba 1.10 ## XML5: "end tag state".
628    
629 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
630     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
631     if (defined $self->{last_stag_name}) {
632     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
633 wakaba 1.12 $self->{kwd} = '';
634 wakaba 1.1 ## Reconsume.
635     redo A;
636     } else {
637     ## No start tag token has ever been emitted
638     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
639     !!!cp (28);
640     $self->{state} = DATA_STATE;
641 wakaba 1.5 $self->{s_kwd} = '';
642 wakaba 1.1 ## Reconsume.
643     !!!emit ({type => CHARACTER_TOKEN, data => '</',
644     line => $l, column => $c,
645     });
646     redo A;
647     }
648     }
649    
650     if (0x0041 <= $self->{nc} and
651     $self->{nc} <= 0x005A) { # A..Z
652     !!!cp (29);
653     $self->{ct}
654     = {type => END_TAG_TOKEN,
655 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
656 wakaba 1.1 line => $l, column => $c};
657     $self->{state} = TAG_NAME_STATE;
658     !!!next-input-character;
659     redo A;
660     } elsif (0x0061 <= $self->{nc} and
661     $self->{nc} <= 0x007A) { # a..z
662     !!!cp (30);
663     $self->{ct} = {type => END_TAG_TOKEN,
664     tag_name => chr ($self->{nc}),
665     line => $l, column => $c};
666     $self->{state} = TAG_NAME_STATE;
667     !!!next-input-character;
668     redo A;
669     } elsif ($self->{nc} == 0x003E) { # >
670     !!!parse-error (type => 'empty end tag',
671     line => $self->{line_prev}, ## "<" in "</>"
672     column => $self->{column_prev} - 1);
673     $self->{state} = DATA_STATE;
674 wakaba 1.5 $self->{s_kwd} = '';
675 wakaba 1.10 if ($self->{is_xml}) {
676     !!!cp (31);
677     ## XML5: No parse error.
678    
679     ## NOTE: This parser raises a parse error, since it supports
680     ## XML1, not XML5.
681    
682     ## NOTE: A short end tag token.
683     my $ct = {type => END_TAG_TOKEN,
684     tag_name => '',
685     line => $self->{line_prev},
686     column => $self->{column_prev} - 1,
687     };
688     !!!next-input-character;
689     !!!emit ($ct);
690     } else {
691     !!!cp (31.1);
692     !!!next-input-character;
693     }
694 wakaba 1.1 redo A;
695     } elsif ($self->{nc} == -1) {
696     !!!cp (32);
697     !!!parse-error (type => 'bare etago');
698 wakaba 1.5 $self->{s_kwd} = '';
699 wakaba 1.1 $self->{state} = DATA_STATE;
700     # reconsume
701    
702     !!!emit ({type => CHARACTER_TOKEN, data => '</',
703     line => $l, column => $c,
704     });
705    
706     redo A;
707 wakaba 1.10 } elsif (not $self->{is_xml} or
708     $is_space->{$self->{nc}}) {
709 wakaba 1.1 !!!cp (33);
710 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
711     line => $self->{line_prev}, # "<" of "</"
712     column => $self->{column_prev} - 1);
713 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
714     $self->{ct} = {type => COMMENT_TOKEN, data => '',
715     line => $self->{line_prev}, # "<" of "</"
716     column => $self->{column_prev} - 1,
717     };
718     ## NOTE: $self->{nc} is intentionally left as is.
719     ## Although the "anything else" case of the spec not explicitly
720     ## states that the next input character is to be reconsumed,
721     ## it will be included to the |data| of the comment token
722     ## generated from the bogus end tag, as defined in the
723     ## "bogus comment state" entry.
724     redo A;
725 wakaba 1.10 } else {
726     ## XML5: "</:" is a parse error.
727     !!!cp (30.1);
728     $self->{ct} = {type => END_TAG_TOKEN,
729     tag_name => chr ($self->{nc}),
730     line => $l, column => $c};
731     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
732     !!!next-input-character;
733     redo A;
734 wakaba 1.1 }
735     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
736 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
737 wakaba 1.1 if (length $ch) {
738     my $CH = $ch;
739     $ch =~ tr/a-z/A-Z/;
740     my $nch = chr $self->{nc};
741     if ($nch eq $ch or $nch eq $CH) {
742     !!!cp (24);
743     ## Stay in the state.
744 wakaba 1.12 $self->{kwd} .= $nch;
745 wakaba 1.1 !!!next-input-character;
746     redo A;
747     } else {
748     !!!cp (25);
749     $self->{state} = DATA_STATE;
750 wakaba 1.5 $self->{s_kwd} = '';
751 wakaba 1.1 ## Reconsume.
752     !!!emit ({type => CHARACTER_TOKEN,
753 wakaba 1.12 data => '</' . $self->{kwd},
754 wakaba 1.1 line => $self->{line_prev},
755 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
756 wakaba 1.1 });
757     redo A;
758     }
759     } else { # after "<{tag-name}"
760     unless ($is_space->{$self->{nc}} or
761     {
762     0x003E => 1, # >
763     0x002F => 1, # /
764     -1 => 1, # EOF
765     }->{$self->{nc}}) {
766     !!!cp (26);
767     ## Reconsume.
768     $self->{state} = DATA_STATE;
769 wakaba 1.5 $self->{s_kwd} = '';
770 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
771 wakaba 1.12 data => '</' . $self->{kwd},
772 wakaba 1.1 line => $self->{line_prev},
773 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
774 wakaba 1.1 });
775     redo A;
776     } else {
777     !!!cp (27);
778     $self->{ct}
779     = {type => END_TAG_TOKEN,
780     tag_name => $self->{last_stag_name},
781     line => $self->{line_prev},
782 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
783 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
784     ## Reconsume.
785     redo A;
786     }
787     }
788     } elsif ($self->{state} == TAG_NAME_STATE) {
789     if ($is_space->{$self->{nc}}) {
790     !!!cp (34);
791     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
792     !!!next-input-character;
793     redo A;
794     } elsif ($self->{nc} == 0x003E) { # >
795     if ($self->{ct}->{type} == START_TAG_TOKEN) {
796     !!!cp (35);
797     $self->{last_stag_name} = $self->{ct}->{tag_name};
798     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
799     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
800     #if ($self->{ct}->{attributes}) {
801     # ## NOTE: This should never be reached.
802     # !!! cp (36);
803     # !!! parse-error (type => 'end tag attribute');
804     #} else {
805     !!!cp (37);
806     #}
807     } else {
808     die "$0: $self->{ct}->{type}: Unknown token type";
809     }
810     $self->{state} = DATA_STATE;
811 wakaba 1.5 $self->{s_kwd} = '';
812 wakaba 1.1 !!!next-input-character;
813    
814     !!!emit ($self->{ct}); # start tag or end tag
815    
816     redo A;
817     } elsif (0x0041 <= $self->{nc} and
818     $self->{nc} <= 0x005A) { # A..Z
819     !!!cp (38);
820 wakaba 1.4 $self->{ct}->{tag_name}
821     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
822 wakaba 1.1 # start tag or end tag
823     ## Stay in this state
824     !!!next-input-character;
825     redo A;
826     } elsif ($self->{nc} == -1) {
827     !!!parse-error (type => 'unclosed tag');
828     if ($self->{ct}->{type} == START_TAG_TOKEN) {
829     !!!cp (39);
830     $self->{last_stag_name} = $self->{ct}->{tag_name};
831     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
832     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
833     #if ($self->{ct}->{attributes}) {
834     # ## NOTE: This state should never be reached.
835     # !!! cp (40);
836     # !!! parse-error (type => 'end tag attribute');
837     #} else {
838     !!!cp (41);
839     #}
840     } else {
841     die "$0: $self->{ct}->{type}: Unknown token type";
842     }
843     $self->{state} = DATA_STATE;
844 wakaba 1.5 $self->{s_kwd} = '';
845 wakaba 1.1 # reconsume
846    
847     !!!emit ($self->{ct}); # start tag or end tag
848    
849     redo A;
850     } elsif ($self->{nc} == 0x002F) { # /
851     !!!cp (42);
852     $self->{state} = SELF_CLOSING_START_TAG_STATE;
853     !!!next-input-character;
854     redo A;
855     } else {
856     !!!cp (44);
857     $self->{ct}->{tag_name} .= chr $self->{nc};
858     # start tag or end tag
859     ## Stay in the state
860     !!!next-input-character;
861     redo A;
862     }
863     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
864 wakaba 1.11 ## XML5: "Tag attribute name before state".
865    
866 wakaba 1.1 if ($is_space->{$self->{nc}}) {
867     !!!cp (45);
868     ## Stay in the state
869     !!!next-input-character;
870     redo A;
871     } elsif ($self->{nc} == 0x003E) { # >
872     if ($self->{ct}->{type} == START_TAG_TOKEN) {
873     !!!cp (46);
874     $self->{last_stag_name} = $self->{ct}->{tag_name};
875     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
876     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
877     if ($self->{ct}->{attributes}) {
878     !!!cp (47);
879     !!!parse-error (type => 'end tag attribute');
880     } else {
881     !!!cp (48);
882     }
883     } else {
884     die "$0: $self->{ct}->{type}: Unknown token type";
885     }
886     $self->{state} = DATA_STATE;
887 wakaba 1.5 $self->{s_kwd} = '';
888 wakaba 1.1 !!!next-input-character;
889    
890     !!!emit ($self->{ct}); # start tag or end tag
891    
892     redo A;
893     } elsif (0x0041 <= $self->{nc} and
894     $self->{nc} <= 0x005A) { # A..Z
895     !!!cp (49);
896     $self->{ca}
897 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
898 wakaba 1.1 value => '',
899     line => $self->{line}, column => $self->{column}};
900     $self->{state} = ATTRIBUTE_NAME_STATE;
901     !!!next-input-character;
902     redo A;
903     } elsif ($self->{nc} == 0x002F) { # /
904     !!!cp (50);
905     $self->{state} = SELF_CLOSING_START_TAG_STATE;
906     !!!next-input-character;
907     redo A;
908     } elsif ($self->{nc} == -1) {
909     !!!parse-error (type => 'unclosed tag');
910     if ($self->{ct}->{type} == START_TAG_TOKEN) {
911     !!!cp (52);
912     $self->{last_stag_name} = $self->{ct}->{tag_name};
913     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
914     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
915     if ($self->{ct}->{attributes}) {
916     !!!cp (53);
917     !!!parse-error (type => 'end tag attribute');
918     } else {
919     !!!cp (54);
920     }
921     } else {
922     die "$0: $self->{ct}->{type}: Unknown token type";
923     }
924     $self->{state} = DATA_STATE;
925 wakaba 1.5 $self->{s_kwd} = '';
926 wakaba 1.1 # reconsume
927    
928     !!!emit ($self->{ct}); # start tag or end tag
929    
930     redo A;
931     } else {
932     if ({
933     0x0022 => 1, # "
934     0x0027 => 1, # '
935     0x003D => 1, # =
936     }->{$self->{nc}}) {
937     !!!cp (55);
938 wakaba 1.11 ## XML5: Not a parse error.
939 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
940     } else {
941     !!!cp (56);
942 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
943 wakaba 1.1 }
944     $self->{ca}
945     = {name => chr ($self->{nc}),
946     value => '',
947     line => $self->{line}, column => $self->{column}};
948     $self->{state} = ATTRIBUTE_NAME_STATE;
949     !!!next-input-character;
950     redo A;
951     }
952     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
953 wakaba 1.11 ## XML5: "Tag attribute name state".
954    
955 wakaba 1.1 my $before_leave = sub {
956     if (exists $self->{ct}->{attributes} # start tag or end tag
957     ->{$self->{ca}->{name}}) { # MUST
958     !!!cp (57);
959     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
960     ## Discard $self->{ca} # MUST
961     } else {
962     !!!cp (58);
963     $self->{ct}->{attributes}->{$self->{ca}->{name}}
964     = $self->{ca};
965 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
966 wakaba 1.1 }
967     }; # $before_leave
968    
969     if ($is_space->{$self->{nc}}) {
970     !!!cp (59);
971     $before_leave->();
972     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
973     !!!next-input-character;
974     redo A;
975     } elsif ($self->{nc} == 0x003D) { # =
976     !!!cp (60);
977     $before_leave->();
978     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
979     !!!next-input-character;
980     redo A;
981     } elsif ($self->{nc} == 0x003E) { # >
982 wakaba 1.11 if ($self->{is_xml}) {
983     !!!cp (60.1);
984     ## XML5: Not a parse error.
985     !!!parse-error (type => 'no attr value'); ## TODO: type
986     } else {
987     !!!cp (60.2);
988     }
989    
990 wakaba 1.1 $before_leave->();
991     if ($self->{ct}->{type} == START_TAG_TOKEN) {
992     !!!cp (61);
993     $self->{last_stag_name} = $self->{ct}->{tag_name};
994     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
995     !!!cp (62);
996     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
997     if ($self->{ct}->{attributes}) {
998     !!!parse-error (type => 'end tag attribute');
999     }
1000     } else {
1001     die "$0: $self->{ct}->{type}: Unknown token type";
1002     }
1003     $self->{state} = DATA_STATE;
1004 wakaba 1.5 $self->{s_kwd} = '';
1005 wakaba 1.1 !!!next-input-character;
1006    
1007     !!!emit ($self->{ct}); # start tag or end tag
1008    
1009     redo A;
1010     } elsif (0x0041 <= $self->{nc} and
1011     $self->{nc} <= 0x005A) { # A..Z
1012     !!!cp (63);
1013 wakaba 1.4 $self->{ca}->{name}
1014     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1015 wakaba 1.1 ## Stay in the state
1016     !!!next-input-character;
1017     redo A;
1018     } elsif ($self->{nc} == 0x002F) { # /
1019 wakaba 1.11 if ($self->{is_xml}) {
1020     !!!cp (64);
1021     ## XML5: Not a parse error.
1022     !!!parse-error (type => 'no attr value'); ## TODO: type
1023     } else {
1024     !!!cp (64.1);
1025     }
1026    
1027 wakaba 1.1 $before_leave->();
1028     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1029     !!!next-input-character;
1030     redo A;
1031     } elsif ($self->{nc} == -1) {
1032     !!!parse-error (type => 'unclosed tag');
1033     $before_leave->();
1034     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1035     !!!cp (66);
1036     $self->{last_stag_name} = $self->{ct}->{tag_name};
1037     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1038     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1039     if ($self->{ct}->{attributes}) {
1040     !!!cp (67);
1041     !!!parse-error (type => 'end tag attribute');
1042     } else {
1043     ## NOTE: This state should never be reached.
1044     !!!cp (68);
1045     }
1046     } else {
1047     die "$0: $self->{ct}->{type}: Unknown token type";
1048     }
1049     $self->{state} = DATA_STATE;
1050 wakaba 1.5 $self->{s_kwd} = '';
1051 wakaba 1.1 # reconsume
1052    
1053     !!!emit ($self->{ct}); # start tag or end tag
1054    
1055     redo A;
1056     } else {
1057     if ($self->{nc} == 0x0022 or # "
1058     $self->{nc} == 0x0027) { # '
1059     !!!cp (69);
1060 wakaba 1.11 ## XML5: Not a parse error.
1061 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1062     } else {
1063     !!!cp (70);
1064     }
1065     $self->{ca}->{name} .= chr ($self->{nc});
1066     ## Stay in the state
1067     !!!next-input-character;
1068     redo A;
1069     }
1070     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1071 wakaba 1.11 ## XML5: "Tag attribute name after state".
1072    
1073 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1074     !!!cp (71);
1075     ## Stay in the state
1076     !!!next-input-character;
1077     redo A;
1078     } elsif ($self->{nc} == 0x003D) { # =
1079     !!!cp (72);
1080     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1081     !!!next-input-character;
1082     redo A;
1083     } elsif ($self->{nc} == 0x003E) { # >
1084 wakaba 1.11 if ($self->{is_xml}) {
1085     !!!cp (72.1);
1086     ## XML5: Not a parse error.
1087     !!!parse-error (type => 'no attr value'); ## TODO: type
1088     } else {
1089     !!!cp (72.2);
1090     }
1091    
1092 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1093     !!!cp (73);
1094     $self->{last_stag_name} = $self->{ct}->{tag_name};
1095     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1096     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1097     if ($self->{ct}->{attributes}) {
1098     !!!cp (74);
1099     !!!parse-error (type => 'end tag attribute');
1100     } else {
1101     ## NOTE: This state should never be reached.
1102     !!!cp (75);
1103     }
1104     } else {
1105     die "$0: $self->{ct}->{type}: Unknown token type";
1106     }
1107     $self->{state} = DATA_STATE;
1108 wakaba 1.5 $self->{s_kwd} = '';
1109 wakaba 1.1 !!!next-input-character;
1110    
1111     !!!emit ($self->{ct}); # start tag or end tag
1112    
1113     redo A;
1114     } elsif (0x0041 <= $self->{nc} and
1115     $self->{nc} <= 0x005A) { # A..Z
1116     !!!cp (76);
1117     $self->{ca}
1118 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1119 wakaba 1.1 value => '',
1120     line => $self->{line}, column => $self->{column}};
1121     $self->{state} = ATTRIBUTE_NAME_STATE;
1122     !!!next-input-character;
1123     redo A;
1124     } elsif ($self->{nc} == 0x002F) { # /
1125 wakaba 1.11 if ($self->{is_xml}) {
1126     !!!cp (77);
1127     ## XML5: Not a parse error.
1128     !!!parse-error (type => 'no attr value'); ## TODO: type
1129     } else {
1130     !!!cp (77.1);
1131     }
1132    
1133 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1134     !!!next-input-character;
1135     redo A;
1136     } elsif ($self->{nc} == -1) {
1137     !!!parse-error (type => 'unclosed tag');
1138     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1139     !!!cp (79);
1140     $self->{last_stag_name} = $self->{ct}->{tag_name};
1141     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1142     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1143     if ($self->{ct}->{attributes}) {
1144     !!!cp (80);
1145     !!!parse-error (type => 'end tag attribute');
1146     } else {
1147     ## NOTE: This state should never be reached.
1148     !!!cp (81);
1149     }
1150     } else {
1151     die "$0: $self->{ct}->{type}: Unknown token type";
1152     }
1153 wakaba 1.5 $self->{s_kwd} = '';
1154 wakaba 1.1 $self->{state} = DATA_STATE;
1155     # reconsume
1156    
1157     !!!emit ($self->{ct}); # start tag or end tag
1158    
1159     redo A;
1160     } else {
1161 wakaba 1.11 if ($self->{is_xml}) {
1162     !!!cp (78.1);
1163     ## XML5: Not a parse error.
1164     !!!parse-error (type => 'no attr value'); ## TODO: type
1165     } else {
1166     !!!cp (78.2);
1167     }
1168    
1169 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1170     $self->{nc} == 0x0027) { # '
1171     !!!cp (78);
1172 wakaba 1.11 ## XML5: Not a parse error.
1173 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1174     } else {
1175     !!!cp (82);
1176     }
1177     $self->{ca}
1178     = {name => chr ($self->{nc}),
1179     value => '',
1180     line => $self->{line}, column => $self->{column}};
1181     $self->{state} = ATTRIBUTE_NAME_STATE;
1182     !!!next-input-character;
1183     redo A;
1184     }
1185     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1186 wakaba 1.11 ## XML5: "Tag attribute value before state".
1187    
1188 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1189     !!!cp (83);
1190     ## Stay in the state
1191     !!!next-input-character;
1192     redo A;
1193     } elsif ($self->{nc} == 0x0022) { # "
1194     !!!cp (84);
1195     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1196     !!!next-input-character;
1197     redo A;
1198     } elsif ($self->{nc} == 0x0026) { # &
1199     !!!cp (85);
1200     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1201     ## reconsume
1202     redo A;
1203     } elsif ($self->{nc} == 0x0027) { # '
1204     !!!cp (86);
1205     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1206     !!!next-input-character;
1207     redo A;
1208     } elsif ($self->{nc} == 0x003E) { # >
1209     !!!parse-error (type => 'empty unquoted attribute value');
1210     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211     !!!cp (87);
1212     $self->{last_stag_name} = $self->{ct}->{tag_name};
1213     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215     if ($self->{ct}->{attributes}) {
1216     !!!cp (88);
1217     !!!parse-error (type => 'end tag attribute');
1218     } else {
1219     ## NOTE: This state should never be reached.
1220     !!!cp (89);
1221     }
1222     } else {
1223     die "$0: $self->{ct}->{type}: Unknown token type";
1224     }
1225     $self->{state} = DATA_STATE;
1226 wakaba 1.5 $self->{s_kwd} = '';
1227 wakaba 1.1 !!!next-input-character;
1228    
1229     !!!emit ($self->{ct}); # start tag or end tag
1230    
1231     redo A;
1232     } elsif ($self->{nc} == -1) {
1233     !!!parse-error (type => 'unclosed tag');
1234     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1235     !!!cp (90);
1236     $self->{last_stag_name} = $self->{ct}->{tag_name};
1237     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1238     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1239     if ($self->{ct}->{attributes}) {
1240     !!!cp (91);
1241     !!!parse-error (type => 'end tag attribute');
1242     } else {
1243     ## NOTE: This state should never be reached.
1244     !!!cp (92);
1245     }
1246     } else {
1247     die "$0: $self->{ct}->{type}: Unknown token type";
1248     }
1249     $self->{state} = DATA_STATE;
1250 wakaba 1.5 $self->{s_kwd} = '';
1251 wakaba 1.1 ## reconsume
1252    
1253     !!!emit ($self->{ct}); # start tag or end tag
1254    
1255     redo A;
1256     } else {
1257     if ($self->{nc} == 0x003D) { # =
1258     !!!cp (93);
1259 wakaba 1.11 ## XML5: Not a parse error.
1260 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1261 wakaba 1.11 } elsif ($self->{is_xml}) {
1262     !!!cp (93.1);
1263     ## XML5: No parse error.
1264     !!!parse-error (type => 'unquoted attr value'); ## TODO
1265 wakaba 1.1 } else {
1266     !!!cp (94);
1267     }
1268     $self->{ca}->{value} .= chr ($self->{nc});
1269     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1270     !!!next-input-character;
1271     redo A;
1272     }
1273     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1274 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1275     ## ATTLIST attribute value double quoted state".
1276 wakaba 1.11
1277 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1278 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1279     !!!cp (95.1);
1280     ## XML5: "DOCTYPE ATTLIST name after state".
1281     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1282     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1283     } else {
1284     !!!cp (95);
1285     ## XML5: "Tag attribute name before state".
1286     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1287     }
1288 wakaba 1.1 !!!next-input-character;
1289     redo A;
1290     } elsif ($self->{nc} == 0x0026) { # &
1291     !!!cp (96);
1292 wakaba 1.11 ## XML5: Not defined yet.
1293    
1294 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1295     ## "entity in attribute value state". In this implementation, the
1296     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1297     ## implementation of the "consume a character reference" algorithm.
1298     $self->{prev_state} = $self->{state};
1299     $self->{entity_add} = 0x0022; # "
1300     $self->{state} = ENTITY_STATE;
1301     !!!next-input-character;
1302     redo A;
1303     } elsif ($self->{nc} == -1) {
1304     !!!parse-error (type => 'unclosed attribute value');
1305     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1306     !!!cp (97);
1307     $self->{last_stag_name} = $self->{ct}->{tag_name};
1308 wakaba 1.15
1309     $self->{state} = DATA_STATE;
1310     $self->{s_kwd} = '';
1311     ## reconsume
1312     !!!emit ($self->{ct}); # start tag
1313     redo A;
1314 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1315     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1316     if ($self->{ct}->{attributes}) {
1317     !!!cp (98);
1318     !!!parse-error (type => 'end tag attribute');
1319     } else {
1320     ## NOTE: This state should never be reached.
1321     !!!cp (99);
1322     }
1323 wakaba 1.15
1324     $self->{state} = DATA_STATE;
1325     $self->{s_kwd} = '';
1326     ## reconsume
1327     !!!emit ($self->{ct}); # end tag
1328     redo A;
1329     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1330     ## XML5: No parse error above; not defined yet.
1331     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1332     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1333     ## Reconsume.
1334     !!!emit ($self->{ct}); # ATTLIST
1335     redo A;
1336 wakaba 1.1 } else {
1337     die "$0: $self->{ct}->{type}: Unknown token type";
1338     }
1339     } else {
1340 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1341 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1342     !!!cp (100);
1343     ## XML5: Not a parse error.
1344     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1345     } else {
1346     !!!cp (100.1);
1347     }
1348 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1349     $self->{read_until}->($self->{ca}->{value},
1350 wakaba 1.11 q["&<],
1351 wakaba 1.1 length $self->{ca}->{value});
1352    
1353     ## Stay in the state
1354     !!!next-input-character;
1355     redo A;
1356     }
1357     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1358 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1359     ## ATTLIST attribute value single quoted state".
1360 wakaba 1.11
1361 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1362 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1363     !!!cp (101.1);
1364     ## XML5: "DOCTYPE ATTLIST name after state".
1365     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1366     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1367     } else {
1368     !!!cp (101);
1369     ## XML5: "Before attribute name state" (sic).
1370     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1371     }
1372 wakaba 1.1 !!!next-input-character;
1373     redo A;
1374     } elsif ($self->{nc} == 0x0026) { # &
1375     !!!cp (102);
1376 wakaba 1.11 ## XML5: Not defined yet.
1377    
1378 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1379     ## "entity in attribute value state". In this implementation, the
1380     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1381     ## implementation of the "consume a character reference" algorithm.
1382     $self->{entity_add} = 0x0027; # '
1383     $self->{prev_state} = $self->{state};
1384     $self->{state} = ENTITY_STATE;
1385     !!!next-input-character;
1386     redo A;
1387     } elsif ($self->{nc} == -1) {
1388     !!!parse-error (type => 'unclosed attribute value');
1389     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1390     !!!cp (103);
1391     $self->{last_stag_name} = $self->{ct}->{tag_name};
1392 wakaba 1.15
1393     $self->{state} = DATA_STATE;
1394     $self->{s_kwd} = '';
1395     ## reconsume
1396     !!!emit ($self->{ct}); # start tag
1397     redo A;
1398 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1399     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1400     if ($self->{ct}->{attributes}) {
1401     !!!cp (104);
1402     !!!parse-error (type => 'end tag attribute');
1403     } else {
1404     ## NOTE: This state should never be reached.
1405     !!!cp (105);
1406     }
1407 wakaba 1.15
1408     $self->{state} = DATA_STATE;
1409     $self->{s_kwd} = '';
1410     ## reconsume
1411     !!!emit ($self->{ct}); # end tag
1412     redo A;
1413     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1414     ## XML5: No parse error above; not defined yet.
1415     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1416     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1417     ## Reconsume.
1418     !!!emit ($self->{ct}); # ATTLIST
1419     redo A;
1420 wakaba 1.1 } else {
1421     die "$0: $self->{ct}->{type}: Unknown token type";
1422     }
1423     } else {
1424 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1425 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1426     !!!cp (106);
1427     ## XML5: Not a parse error.
1428     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1429     } else {
1430     !!!cp (106.1);
1431     }
1432 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1433     $self->{read_until}->($self->{ca}->{value},
1434 wakaba 1.11 q['&<],
1435 wakaba 1.1 length $self->{ca}->{value});
1436    
1437     ## Stay in the state
1438     !!!next-input-character;
1439     redo A;
1440     }
1441     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1442 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1443    
1444 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1445 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1446     !!!cp (107.1);
1447     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1448     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1449     } else {
1450     !!!cp (107);
1451     ## XML5: "Tag attribute name before state".
1452     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1453     }
1454 wakaba 1.1 !!!next-input-character;
1455     redo A;
1456     } elsif ($self->{nc} == 0x0026) { # &
1457     !!!cp (108);
1458 wakaba 1.11
1459     ## XML5: Not defined yet.
1460    
1461 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1462     ## "entity in attribute value state". In this implementation, the
1463     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1464     ## implementation of the "consume a character reference" algorithm.
1465     $self->{entity_add} = -1;
1466     $self->{prev_state} = $self->{state};
1467     $self->{state} = ENTITY_STATE;
1468     !!!next-input-character;
1469     redo A;
1470     } elsif ($self->{nc} == 0x003E) { # >
1471     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1472     !!!cp (109);
1473     $self->{last_stag_name} = $self->{ct}->{tag_name};
1474 wakaba 1.15
1475     $self->{state} = DATA_STATE;
1476     $self->{s_kwd} = '';
1477     !!!next-input-character;
1478     !!!emit ($self->{ct}); # start tag
1479     redo A;
1480 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1481     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1482     if ($self->{ct}->{attributes}) {
1483     !!!cp (110);
1484     !!!parse-error (type => 'end tag attribute');
1485     } else {
1486     ## NOTE: This state should never be reached.
1487     !!!cp (111);
1488     }
1489 wakaba 1.15
1490     $self->{state} = DATA_STATE;
1491     $self->{s_kwd} = '';
1492     !!!next-input-character;
1493     !!!emit ($self->{ct}); # end tag
1494     redo A;
1495     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1496     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1497     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1498     !!!next-input-character;
1499     !!!emit ($self->{ct}); # ATTLIST
1500     redo A;
1501 wakaba 1.1 } else {
1502     die "$0: $self->{ct}->{type}: Unknown token type";
1503     }
1504     } elsif ($self->{nc} == -1) {
1505     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1506     !!!cp (112);
1507 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1508 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1509 wakaba 1.15
1510     $self->{state} = DATA_STATE;
1511     $self->{s_kwd} = '';
1512     ## reconsume
1513     !!!emit ($self->{ct}); # start tag
1514     redo A;
1515 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1516 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1517 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1518     if ($self->{ct}->{attributes}) {
1519     !!!cp (113);
1520     !!!parse-error (type => 'end tag attribute');
1521     } else {
1522     ## NOTE: This state should never be reached.
1523     !!!cp (114);
1524     }
1525 wakaba 1.15
1526     $self->{state} = DATA_STATE;
1527     $self->{s_kwd} = '';
1528     ## reconsume
1529     !!!emit ($self->{ct}); # end tag
1530     redo A;
1531     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1532     !!!parse-error (type => 'unclosed md'); ## TODO: type
1533     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1534     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1535     ## Reconsume.
1536     !!!emit ($self->{ct}); # ATTLIST
1537     redo A;
1538 wakaba 1.1 } else {
1539     die "$0: $self->{ct}->{type}: Unknown token type";
1540     }
1541     } else {
1542     if ({
1543     0x0022 => 1, # "
1544     0x0027 => 1, # '
1545     0x003D => 1, # =
1546     }->{$self->{nc}}) {
1547     !!!cp (115);
1548 wakaba 1.11 ## XML5: Not a parse error.
1549 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1550     } else {
1551     !!!cp (116);
1552     }
1553     $self->{ca}->{value} .= chr ($self->{nc});
1554     $self->{read_until}->($self->{ca}->{value},
1555     q["'=& >],
1556     length $self->{ca}->{value});
1557    
1558     ## Stay in the state
1559     !!!next-input-character;
1560     redo A;
1561     }
1562     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1563     if ($is_space->{$self->{nc}}) {
1564     !!!cp (118);
1565     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1566     !!!next-input-character;
1567     redo A;
1568     } elsif ($self->{nc} == 0x003E) { # >
1569     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1570     !!!cp (119);
1571     $self->{last_stag_name} = $self->{ct}->{tag_name};
1572     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1573     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1574     if ($self->{ct}->{attributes}) {
1575     !!!cp (120);
1576     !!!parse-error (type => 'end tag attribute');
1577     } else {
1578     ## NOTE: This state should never be reached.
1579     !!!cp (121);
1580     }
1581     } else {
1582     die "$0: $self->{ct}->{type}: Unknown token type";
1583     }
1584     $self->{state} = DATA_STATE;
1585 wakaba 1.5 $self->{s_kwd} = '';
1586 wakaba 1.1 !!!next-input-character;
1587    
1588     !!!emit ($self->{ct}); # start tag or end tag
1589    
1590     redo A;
1591     } elsif ($self->{nc} == 0x002F) { # /
1592     !!!cp (122);
1593     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1594     !!!next-input-character;
1595     redo A;
1596     } elsif ($self->{nc} == -1) {
1597     !!!parse-error (type => 'unclosed tag');
1598     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1599     !!!cp (122.3);
1600     $self->{last_stag_name} = $self->{ct}->{tag_name};
1601     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1602     if ($self->{ct}->{attributes}) {
1603     !!!cp (122.1);
1604     !!!parse-error (type => 'end tag attribute');
1605     } else {
1606     ## NOTE: This state should never be reached.
1607     !!!cp (122.2);
1608     }
1609     } else {
1610     die "$0: $self->{ct}->{type}: Unknown token type";
1611     }
1612     $self->{state} = DATA_STATE;
1613 wakaba 1.5 $self->{s_kwd} = '';
1614 wakaba 1.1 ## Reconsume.
1615     !!!emit ($self->{ct}); # start tag or end tag
1616     redo A;
1617     } else {
1618     !!!cp ('124.1');
1619     !!!parse-error (type => 'no space between attributes');
1620     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1621     ## reconsume
1622     redo A;
1623     }
1624     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1625 wakaba 1.11 ## XML5: "Empty tag state".
1626    
1627 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1628     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1629     !!!cp ('124.2');
1630     !!!parse-error (type => 'nestc', token => $self->{ct});
1631     ## TODO: Different type than slash in start tag
1632     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1633     if ($self->{ct}->{attributes}) {
1634     !!!cp ('124.4');
1635     !!!parse-error (type => 'end tag attribute');
1636     } else {
1637     !!!cp ('124.5');
1638     }
1639     ## TODO: Test |<title></title/>|
1640     } else {
1641     !!!cp ('124.3');
1642     $self->{self_closing} = 1;
1643     }
1644    
1645     $self->{state} = DATA_STATE;
1646 wakaba 1.5 $self->{s_kwd} = '';
1647 wakaba 1.1 !!!next-input-character;
1648    
1649     !!!emit ($self->{ct}); # start tag or end tag
1650    
1651     redo A;
1652     } elsif ($self->{nc} == -1) {
1653     !!!parse-error (type => 'unclosed tag');
1654     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1655     !!!cp (124.7);
1656     $self->{last_stag_name} = $self->{ct}->{tag_name};
1657     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1658     if ($self->{ct}->{attributes}) {
1659     !!!cp (124.5);
1660     !!!parse-error (type => 'end tag attribute');
1661     } else {
1662     ## NOTE: This state should never be reached.
1663     !!!cp (124.6);
1664     }
1665     } else {
1666     die "$0: $self->{ct}->{type}: Unknown token type";
1667     }
1668 wakaba 1.11 ## XML5: "Tag attribute name before state".
1669 wakaba 1.1 $self->{state} = DATA_STATE;
1670 wakaba 1.5 $self->{s_kwd} = '';
1671 wakaba 1.1 ## Reconsume.
1672     !!!emit ($self->{ct}); # start tag or end tag
1673     redo A;
1674     } else {
1675     !!!cp ('124.4');
1676     !!!parse-error (type => 'nestc');
1677     ## TODO: This error type is wrong.
1678     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1679     ## Reconsume.
1680     redo A;
1681     }
1682     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1683 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1684    
1685 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
1686     ## consumes characters on a one-by-one basis.
1687    
1688     if ($self->{nc} == 0x003E) { # >
1689 wakaba 1.13 if ($self->{in_subset}) {
1690     !!!cp (123);
1691     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1692     } else {
1693     !!!cp (124);
1694     $self->{state} = DATA_STATE;
1695     $self->{s_kwd} = '';
1696     }
1697 wakaba 1.1 !!!next-input-character;
1698    
1699     !!!emit ($self->{ct}); # comment
1700     redo A;
1701     } elsif ($self->{nc} == -1) {
1702 wakaba 1.13 if ($self->{in_subset}) {
1703     !!!cp (125.1);
1704     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1705     } else {
1706     !!!cp (125);
1707     $self->{state} = DATA_STATE;
1708     $self->{s_kwd} = '';
1709     }
1710 wakaba 1.1 ## reconsume
1711    
1712     !!!emit ($self->{ct}); # comment
1713     redo A;
1714     } else {
1715     !!!cp (126);
1716     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1717     $self->{read_until}->($self->{ct}->{data},
1718     q[>],
1719     length $self->{ct}->{data});
1720    
1721     ## Stay in the state.
1722     !!!next-input-character;
1723     redo A;
1724     }
1725     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1726 wakaba 1.14 ## XML5: "Markup declaration state".
1727 wakaba 1.1
1728     if ($self->{nc} == 0x002D) { # -
1729     !!!cp (133);
1730     $self->{state} = MD_HYPHEN_STATE;
1731     !!!next-input-character;
1732     redo A;
1733     } elsif ($self->{nc} == 0x0044 or # D
1734     $self->{nc} == 0x0064) { # d
1735     ## ASCII case-insensitive.
1736     !!!cp (130);
1737     $self->{state} = MD_DOCTYPE_STATE;
1738 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1739 wakaba 1.1 !!!next-input-character;
1740     redo A;
1741 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1742     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1743     $self->{is_xml}) and
1744 wakaba 1.1 $self->{nc} == 0x005B) { # [
1745     !!!cp (135.4);
1746     $self->{state} = MD_CDATA_STATE;
1747 wakaba 1.12 $self->{kwd} = '[';
1748 wakaba 1.1 !!!next-input-character;
1749     redo A;
1750     } else {
1751     !!!cp (136);
1752     }
1753    
1754     !!!parse-error (type => 'bogus comment',
1755     line => $self->{line_prev},
1756     column => $self->{column_prev} - 1);
1757     ## Reconsume.
1758     $self->{state} = BOGUS_COMMENT_STATE;
1759     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1760     line => $self->{line_prev},
1761     column => $self->{column_prev} - 1,
1762     };
1763     redo A;
1764     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1765     if ($self->{nc} == 0x002D) { # -
1766     !!!cp (127);
1767     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1768     line => $self->{line_prev},
1769     column => $self->{column_prev} - 2,
1770     };
1771 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1772 wakaba 1.1 !!!next-input-character;
1773     redo A;
1774     } else {
1775     !!!cp (128);
1776     !!!parse-error (type => 'bogus comment',
1777     line => $self->{line_prev},
1778     column => $self->{column_prev} - 2);
1779     $self->{state} = BOGUS_COMMENT_STATE;
1780     ## Reconsume.
1781     $self->{ct} = {type => COMMENT_TOKEN,
1782     data => '-',
1783     line => $self->{line_prev},
1784     column => $self->{column_prev} - 2,
1785     };
1786     redo A;
1787     }
1788     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1789     ## ASCII case-insensitive.
1790     if ($self->{nc} == [
1791     undef,
1792     0x004F, # O
1793     0x0043, # C
1794     0x0054, # T
1795     0x0059, # Y
1796     0x0050, # P
1797 wakaba 1.12 ]->[length $self->{kwd}] or
1798 wakaba 1.1 $self->{nc} == [
1799     undef,
1800     0x006F, # o
1801     0x0063, # c
1802     0x0074, # t
1803     0x0079, # y
1804     0x0070, # p
1805 wakaba 1.12 ]->[length $self->{kwd}]) {
1806 wakaba 1.1 !!!cp (131);
1807     ## Stay in the state.
1808 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1809 wakaba 1.1 !!!next-input-character;
1810     redo A;
1811 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1812 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1813     $self->{nc} == 0x0065)) { # e
1814 wakaba 1.12 if ($self->{is_xml} and
1815     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1816 wakaba 1.10 !!!cp (129);
1817     ## XML5: case-sensitive.
1818     !!!parse-error (type => 'lowercase keyword', ## TODO
1819     text => 'DOCTYPE',
1820     line => $self->{line_prev},
1821     column => $self->{column_prev} - 5);
1822     } else {
1823     !!!cp (129.1);
1824     }
1825 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1826     $self->{ct} = {type => DOCTYPE_TOKEN,
1827     quirks => 1,
1828     line => $self->{line_prev},
1829     column => $self->{column_prev} - 7,
1830     };
1831     !!!next-input-character;
1832     redo A;
1833     } else {
1834     !!!cp (132);
1835     !!!parse-error (type => 'bogus comment',
1836     line => $self->{line_prev},
1837 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1838 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1839     ## Reconsume.
1840     $self->{ct} = {type => COMMENT_TOKEN,
1841 wakaba 1.12 data => $self->{kwd},
1842 wakaba 1.1 line => $self->{line_prev},
1843 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1844 wakaba 1.1 };
1845     redo A;
1846     }
1847     } elsif ($self->{state} == MD_CDATA_STATE) {
1848     if ($self->{nc} == {
1849     '[' => 0x0043, # C
1850     '[C' => 0x0044, # D
1851     '[CD' => 0x0041, # A
1852     '[CDA' => 0x0054, # T
1853     '[CDAT' => 0x0041, # A
1854 wakaba 1.12 }->{$self->{kwd}}) {
1855 wakaba 1.1 !!!cp (135.1);
1856     ## Stay in the state.
1857 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1858 wakaba 1.1 !!!next-input-character;
1859     redo A;
1860 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1861 wakaba 1.1 $self->{nc} == 0x005B) { # [
1862 wakaba 1.6 if ($self->{is_xml} and
1863     not $self->{tainted} and
1864     @{$self->{open_elements} or []} == 0) {
1865 wakaba 1.8 !!!cp (135.2);
1866 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1867     line => $self->{line_prev},
1868     column => $self->{column_prev} - 7);
1869     $self->{tainted} = 1;
1870 wakaba 1.8 } else {
1871     !!!cp (135.21);
1872 wakaba 1.6 }
1873    
1874 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1875     data => '',
1876     line => $self->{line_prev},
1877     column => $self->{column_prev} - 7};
1878     $self->{state} = CDATA_SECTION_STATE;
1879     !!!next-input-character;
1880     redo A;
1881     } else {
1882     !!!cp (135.3);
1883     !!!parse-error (type => 'bogus comment',
1884     line => $self->{line_prev},
1885 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1886 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1887     ## Reconsume.
1888     $self->{ct} = {type => COMMENT_TOKEN,
1889 wakaba 1.12 data => $self->{kwd},
1890 wakaba 1.1 line => $self->{line_prev},
1891 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1892 wakaba 1.1 };
1893     redo A;
1894     }
1895     } elsif ($self->{state} == COMMENT_START_STATE) {
1896     if ($self->{nc} == 0x002D) { # -
1897     !!!cp (137);
1898     $self->{state} = COMMENT_START_DASH_STATE;
1899     !!!next-input-character;
1900     redo A;
1901     } elsif ($self->{nc} == 0x003E) { # >
1902     !!!parse-error (type => 'bogus comment');
1903 wakaba 1.13 if ($self->{in_subset}) {
1904     !!!cp (138.1);
1905     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1906     } else {
1907     !!!cp (138);
1908     $self->{state} = DATA_STATE;
1909     $self->{s_kwd} = '';
1910     }
1911 wakaba 1.1 !!!next-input-character;
1912    
1913     !!!emit ($self->{ct}); # comment
1914    
1915     redo A;
1916     } elsif ($self->{nc} == -1) {
1917     !!!parse-error (type => 'unclosed comment');
1918 wakaba 1.13 if ($self->{in_subset}) {
1919     !!!cp (139.1);
1920     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1921     } else {
1922     !!!cp (139);
1923     $self->{state} = DATA_STATE;
1924     $self->{s_kwd} = '';
1925     }
1926 wakaba 1.1 ## reconsume
1927    
1928     !!!emit ($self->{ct}); # comment
1929    
1930     redo A;
1931     } else {
1932     !!!cp (140);
1933     $self->{ct}->{data} # comment
1934     .= chr ($self->{nc});
1935     $self->{state} = COMMENT_STATE;
1936     !!!next-input-character;
1937     redo A;
1938     }
1939     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1940     if ($self->{nc} == 0x002D) { # -
1941     !!!cp (141);
1942     $self->{state} = COMMENT_END_STATE;
1943     !!!next-input-character;
1944     redo A;
1945     } elsif ($self->{nc} == 0x003E) { # >
1946     !!!parse-error (type => 'bogus comment');
1947 wakaba 1.13 if ($self->{in_subset}) {
1948     !!!cp (142.1);
1949     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1950     } else {
1951     !!!cp (142);
1952     $self->{state} = DATA_STATE;
1953     $self->{s_kwd} = '';
1954     }
1955 wakaba 1.1 !!!next-input-character;
1956    
1957     !!!emit ($self->{ct}); # comment
1958    
1959     redo A;
1960     } elsif ($self->{nc} == -1) {
1961     !!!parse-error (type => 'unclosed comment');
1962 wakaba 1.13 if ($self->{in_subset}) {
1963     !!!cp (143.1);
1964     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1965     } else {
1966     !!!cp (143);
1967     $self->{state} = DATA_STATE;
1968     $self->{s_kwd} = '';
1969     }
1970 wakaba 1.1 ## reconsume
1971    
1972     !!!emit ($self->{ct}); # comment
1973    
1974     redo A;
1975     } else {
1976     !!!cp (144);
1977     $self->{ct}->{data} # comment
1978     .= '-' . chr ($self->{nc});
1979     $self->{state} = COMMENT_STATE;
1980     !!!next-input-character;
1981     redo A;
1982     }
1983     } elsif ($self->{state} == COMMENT_STATE) {
1984 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
1985    
1986 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1987     !!!cp (145);
1988     $self->{state} = COMMENT_END_DASH_STATE;
1989     !!!next-input-character;
1990     redo A;
1991     } elsif ($self->{nc} == -1) {
1992     !!!parse-error (type => 'unclosed comment');
1993 wakaba 1.13 if ($self->{in_subset}) {
1994     !!!cp (146.1);
1995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996     } else {
1997     !!!cp (146);
1998     $self->{state} = DATA_STATE;
1999     $self->{s_kwd} = '';
2000     }
2001 wakaba 1.1 ## reconsume
2002    
2003     !!!emit ($self->{ct}); # comment
2004    
2005     redo A;
2006     } else {
2007     !!!cp (147);
2008     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2009     $self->{read_until}->($self->{ct}->{data},
2010     q[-],
2011     length $self->{ct}->{data});
2012    
2013     ## Stay in the state
2014     !!!next-input-character;
2015     redo A;
2016     }
2017     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2018 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2019 wakaba 1.10
2020 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2021     !!!cp (148);
2022     $self->{state} = COMMENT_END_STATE;
2023     !!!next-input-character;
2024     redo A;
2025     } elsif ($self->{nc} == -1) {
2026     !!!parse-error (type => 'unclosed comment');
2027 wakaba 1.13 if ($self->{in_subset}) {
2028     !!!cp (149.1);
2029     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2030     } else {
2031     !!!cp (149);
2032     $self->{state} = DATA_STATE;
2033     $self->{s_kwd} = '';
2034     }
2035 wakaba 1.1 ## reconsume
2036    
2037     !!!emit ($self->{ct}); # comment
2038    
2039     redo A;
2040     } else {
2041     !!!cp (150);
2042     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2043     $self->{state} = COMMENT_STATE;
2044     !!!next-input-character;
2045     redo A;
2046     }
2047     } elsif ($self->{state} == COMMENT_END_STATE) {
2048 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2049    
2050 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2051 wakaba 1.13 if ($self->{in_subset}) {
2052     !!!cp (151.1);
2053     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2054     } else {
2055     !!!cp (151);
2056     $self->{state} = DATA_STATE;
2057     $self->{s_kwd} = '';
2058     }
2059 wakaba 1.1 !!!next-input-character;
2060    
2061     !!!emit ($self->{ct}); # comment
2062    
2063     redo A;
2064     } elsif ($self->{nc} == 0x002D) { # -
2065     !!!cp (152);
2066 wakaba 1.10 ## XML5: Not a parse error.
2067 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2068     line => $self->{line_prev},
2069     column => $self->{column_prev});
2070     $self->{ct}->{data} .= '-'; # comment
2071     ## Stay in the state
2072     !!!next-input-character;
2073     redo A;
2074     } elsif ($self->{nc} == -1) {
2075     !!!parse-error (type => 'unclosed comment');
2076 wakaba 1.13 if ($self->{in_subset}) {
2077     !!!cp (153.1);
2078     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2079     } else {
2080     !!!cp (153);
2081     $self->{state} = DATA_STATE;
2082     $self->{s_kwd} = '';
2083     }
2084 wakaba 1.1 ## reconsume
2085    
2086     !!!emit ($self->{ct}); # comment
2087    
2088     redo A;
2089     } else {
2090     !!!cp (154);
2091 wakaba 1.10 ## XML5: Not a parse error.
2092 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2093     line => $self->{line_prev},
2094     column => $self->{column_prev});
2095     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2096     $self->{state} = COMMENT_STATE;
2097     !!!next-input-character;
2098     redo A;
2099     }
2100     } elsif ($self->{state} == DOCTYPE_STATE) {
2101     if ($is_space->{$self->{nc}}) {
2102     !!!cp (155);
2103     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2104     !!!next-input-character;
2105     redo A;
2106     } else {
2107     !!!cp (156);
2108 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2109 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2110     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2111     ## reconsume
2112     redo A;
2113     }
2114     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2115 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2116    
2117 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2118     !!!cp (157);
2119     ## Stay in the state
2120     !!!next-input-character;
2121     redo A;
2122     } elsif ($self->{nc} == 0x003E) { # >
2123     !!!cp (158);
2124 wakaba 1.12 ## XML5: No parse error.
2125 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2126     $self->{state} = DATA_STATE;
2127 wakaba 1.5 $self->{s_kwd} = '';
2128 wakaba 1.1 !!!next-input-character;
2129    
2130     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2131    
2132     redo A;
2133     } elsif ($self->{nc} == -1) {
2134     !!!cp (159);
2135     !!!parse-error (type => 'no DOCTYPE name');
2136     $self->{state} = DATA_STATE;
2137 wakaba 1.5 $self->{s_kwd} = '';
2138 wakaba 1.1 ## reconsume
2139    
2140     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2141    
2142     redo A;
2143 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2144     !!!cp (159.1);
2145     !!!parse-error (type => 'no DOCTYPE name');
2146     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2147 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2148     $self->{in_subset} = 1;
2149 wakaba 1.12 !!!next-input-character;
2150 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2151 wakaba 1.12 redo A;
2152 wakaba 1.1 } else {
2153     !!!cp (160);
2154     $self->{ct}->{name} = chr $self->{nc};
2155     delete $self->{ct}->{quirks};
2156     $self->{state} = DOCTYPE_NAME_STATE;
2157     !!!next-input-character;
2158     redo A;
2159     }
2160     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2161 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2162    
2163     ## ISSUE: Redundant "First," in the spec.
2164    
2165 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2166     !!!cp (161);
2167     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2168     !!!next-input-character;
2169     redo A;
2170     } elsif ($self->{nc} == 0x003E) { # >
2171     !!!cp (162);
2172     $self->{state} = DATA_STATE;
2173 wakaba 1.5 $self->{s_kwd} = '';
2174 wakaba 1.1 !!!next-input-character;
2175    
2176     !!!emit ($self->{ct}); # DOCTYPE
2177    
2178     redo A;
2179     } elsif ($self->{nc} == -1) {
2180     !!!cp (163);
2181     !!!parse-error (type => 'unclosed DOCTYPE');
2182     $self->{state} = DATA_STATE;
2183 wakaba 1.5 $self->{s_kwd} = '';
2184 wakaba 1.1 ## reconsume
2185    
2186     $self->{ct}->{quirks} = 1;
2187     !!!emit ($self->{ct}); # DOCTYPE
2188    
2189     redo A;
2190 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2191     !!!cp (163.1);
2192     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2193 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2194     $self->{in_subset} = 1;
2195 wakaba 1.12 !!!next-input-character;
2196 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2197 wakaba 1.12 redo A;
2198 wakaba 1.1 } else {
2199     !!!cp (164);
2200     $self->{ct}->{name}
2201     .= chr ($self->{nc}); # DOCTYPE
2202     ## Stay in the state
2203     !!!next-input-character;
2204     redo A;
2205     }
2206     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2207 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2208     ## state", but implemented differently.
2209    
2210 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2211     !!!cp (165);
2212     ## Stay in the state
2213     !!!next-input-character;
2214     redo A;
2215     } elsif ($self->{nc} == 0x003E) { # >
2216 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2217     !!!cp (166);
2218     $self->{state} = DATA_STATE;
2219     $self->{s_kwd} = '';
2220     } else {
2221     !!!cp (166.1);
2222     !!!parse-error (type => 'no md def'); ## TODO: type
2223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2224     }
2225    
2226 wakaba 1.1 !!!next-input-character;
2227 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2228 wakaba 1.1 redo A;
2229     } elsif ($self->{nc} == -1) {
2230 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2231     !!!cp (167);
2232     !!!parse-error (type => 'unclosed DOCTYPE');
2233     $self->{state} = DATA_STATE;
2234     $self->{s_kwd} = '';
2235     $self->{ct}->{quirks} = 1;
2236     } else {
2237     !!!cp (167.12);
2238     !!!parse-error (type => 'unclosed md'); ## TODO: type
2239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2240     }
2241    
2242     ## Reconsume.
2243     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2244 wakaba 1.1 redo A;
2245     } elsif ($self->{nc} == 0x0050 or # P
2246     $self->{nc} == 0x0070) { # p
2247 wakaba 1.12 !!!cp (167.1);
2248 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2249 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2250 wakaba 1.1 !!!next-input-character;
2251     redo A;
2252     } elsif ($self->{nc} == 0x0053 or # S
2253     $self->{nc} == 0x0073) { # s
2254 wakaba 1.12 !!!cp (167.2);
2255 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2256 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2257     !!!next-input-character;
2258     redo A;
2259 wakaba 1.16 ## TODO: " and ' for ENTITY
2260     } elsif ($self->{is_xml} and
2261     $self->{ct}->{type} == DOCTYPE_TOKEN and
2262     $self->{nc} == 0x005B) { # [
2263 wakaba 1.12 !!!cp (167.3);
2264     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2265     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2266 wakaba 1.13 $self->{in_subset} = 1;
2267 wakaba 1.1 !!!next-input-character;
2268 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2269 wakaba 1.1 redo A;
2270     } else {
2271 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2272    
2273     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2274     !!!cp (180);
2275     $self->{ct}->{quirks} = 1;
2276     $self->{state} = BOGUS_DOCTYPE_STATE;
2277     } else {
2278     !!!cp (180.1);
2279     $self->{state} = BOGUS_MD_STATE;
2280     }
2281 wakaba 1.1
2282     !!!next-input-character;
2283     redo A;
2284     }
2285     } elsif ($self->{state} == PUBLIC_STATE) {
2286     ## ASCII case-insensitive
2287     if ($self->{nc} == [
2288     undef,
2289     0x0055, # U
2290     0x0042, # B
2291     0x004C, # L
2292     0x0049, # I
2293 wakaba 1.12 ]->[length $self->{kwd}] or
2294 wakaba 1.1 $self->{nc} == [
2295     undef,
2296     0x0075, # u
2297     0x0062, # b
2298     0x006C, # l
2299     0x0069, # i
2300 wakaba 1.12 ]->[length $self->{kwd}]) {
2301 wakaba 1.1 !!!cp (175);
2302     ## Stay in the state.
2303 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2304 wakaba 1.1 !!!next-input-character;
2305     redo A;
2306 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2307 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2308     $self->{nc} == 0x0063)) { # c
2309 wakaba 1.12 if ($self->{is_xml} and
2310     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2311     !!!cp (168.1);
2312     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2313     text => 'PUBLIC',
2314     line => $self->{line_prev},
2315     column => $self->{column_prev} - 4);
2316     } else {
2317     !!!cp (168);
2318     }
2319 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2320     !!!next-input-character;
2321     redo A;
2322     } else {
2323 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2324 wakaba 1.1 line => $self->{line_prev},
2325 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2326 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2327     !!!cp (169);
2328     $self->{ct}->{quirks} = 1;
2329     $self->{state} = BOGUS_DOCTYPE_STATE;
2330     } else {
2331     !!!cp (169.1);
2332     $self->{state} = BOGUS_MD_STATE;
2333     }
2334 wakaba 1.1 ## Reconsume.
2335     redo A;
2336     }
2337     } elsif ($self->{state} == SYSTEM_STATE) {
2338     ## ASCII case-insensitive
2339     if ($self->{nc} == [
2340     undef,
2341     0x0059, # Y
2342     0x0053, # S
2343     0x0054, # T
2344     0x0045, # E
2345 wakaba 1.12 ]->[length $self->{kwd}] or
2346 wakaba 1.1 $self->{nc} == [
2347     undef,
2348     0x0079, # y
2349     0x0073, # s
2350     0x0074, # t
2351     0x0065, # e
2352 wakaba 1.12 ]->[length $self->{kwd}]) {
2353 wakaba 1.1 !!!cp (170);
2354     ## Stay in the state.
2355 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2356 wakaba 1.1 !!!next-input-character;
2357     redo A;
2358 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2359 wakaba 1.1 ($self->{nc} == 0x004D or # M
2360     $self->{nc} == 0x006D)) { # m
2361 wakaba 1.12 if ($self->{is_xml} and
2362     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2363     !!!cp (171.1);
2364     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2365     text => 'SYSTEM',
2366     line => $self->{line_prev},
2367     column => $self->{column_prev} - 4);
2368     } else {
2369     !!!cp (171);
2370     }
2371 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2372     !!!next-input-character;
2373     redo A;
2374     } else {
2375 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2376 wakaba 1.1 line => $self->{line_prev},
2377 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2378 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2379     !!!cp (172);
2380     $self->{ct}->{quirks} = 1;
2381     $self->{state} = BOGUS_DOCTYPE_STATE;
2382     } else {
2383     !!!cp (172.1);
2384     $self->{state} = BOGUS_MD_STATE;
2385     }
2386 wakaba 1.1 ## Reconsume.
2387     redo A;
2388     }
2389     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2390     if ($is_space->{$self->{nc}}) {
2391     !!!cp (181);
2392     ## Stay in the state
2393     !!!next-input-character;
2394     redo A;
2395     } elsif ($self->{nc} eq 0x0022) { # "
2396     !!!cp (182);
2397     $self->{ct}->{pubid} = ''; # DOCTYPE
2398     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2399     !!!next-input-character;
2400     redo A;
2401     } elsif ($self->{nc} eq 0x0027) { # '
2402     !!!cp (183);
2403     $self->{ct}->{pubid} = ''; # DOCTYPE
2404     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2405     !!!next-input-character;
2406     redo A;
2407     } elsif ($self->{nc} eq 0x003E) { # >
2408     !!!parse-error (type => 'no PUBLIC literal');
2409 wakaba 1.16
2410     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2411     !!!cp (184);
2412     $self->{state} = DATA_STATE;
2413     $self->{s_kwd} = '';
2414     $self->{ct}->{quirks} = 1;
2415     } else {
2416     !!!cp (184.1);
2417     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2418     }
2419    
2420 wakaba 1.1 !!!next-input-character;
2421 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2422 wakaba 1.1 redo A;
2423     } elsif ($self->{nc} == -1) {
2424 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2425     !!!cp (185);
2426     !!!parse-error (type => 'unclosed DOCTYPE');
2427     $self->{state} = DATA_STATE;
2428     $self->{s_kwd} = '';
2429     $self->{ct}->{quirks} = 1;
2430     } else {
2431     !!!cp (185.1);
2432     !!!parse-error (type => 'unclosed md'); ## TODO: type
2433     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2434     }
2435    
2436 wakaba 1.1 ## reconsume
2437     !!!emit ($self->{ct}); # DOCTYPE
2438     redo A;
2439 wakaba 1.16 } elsif ($self->{is_xml} and
2440     $self->{ct}->{type} == DOCTYPE_TOKEN and
2441     $self->{nc} == 0x005B) { # [
2442 wakaba 1.12 !!!cp (186.1);
2443     !!!parse-error (type => 'no PUBLIC literal');
2444     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2445     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2446 wakaba 1.13 $self->{in_subset} = 1;
2447 wakaba 1.12 !!!next-input-character;
2448 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2449 wakaba 1.12 redo A;
2450 wakaba 1.1 } else {
2451     !!!parse-error (type => 'string after PUBLIC');
2452    
2453 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2454     !!!cp (186);
2455     $self->{ct}->{quirks} = 1;
2456     $self->{state} = BOGUS_DOCTYPE_STATE;
2457     } else {
2458     !!!cp (186.2);
2459     $self->{state} = BOGUS_MD_STATE;
2460     }
2461    
2462 wakaba 1.1 !!!next-input-character;
2463     redo A;
2464     }
2465     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2466     if ($self->{nc} == 0x0022) { # "
2467     !!!cp (187);
2468     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2469     !!!next-input-character;
2470     redo A;
2471     } elsif ($self->{nc} == 0x003E) { # >
2472     !!!parse-error (type => 'unclosed PUBLIC literal');
2473    
2474 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2475     !!!cp (188);
2476     $self->{state} = DATA_STATE;
2477     $self->{s_kwd} = '';
2478     $self->{ct}->{quirks} = 1;
2479     } else {
2480     !!!cp (188.1);
2481     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2482     }
2483    
2484 wakaba 1.1 !!!next-input-character;
2485 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2486 wakaba 1.1 redo A;
2487     } elsif ($self->{nc} == -1) {
2488     !!!parse-error (type => 'unclosed PUBLIC literal');
2489    
2490 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2491     !!!cp (189);
2492     $self->{state} = DATA_STATE;
2493     $self->{s_kwd} = '';
2494     $self->{ct}->{quirks} = 1;
2495     } else {
2496     !!!cp (189.1);
2497     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2498     }
2499    
2500     ## Reconsume.
2501 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2502     redo A;
2503     } else {
2504     !!!cp (190);
2505 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2506 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2507     length $self->{ct}->{pubid});
2508    
2509     ## Stay in the state
2510     !!!next-input-character;
2511     redo A;
2512     }
2513     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2514     if ($self->{nc} == 0x0027) { # '
2515     !!!cp (191);
2516     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2517     !!!next-input-character;
2518     redo A;
2519     } elsif ($self->{nc} == 0x003E) { # >
2520     !!!parse-error (type => 'unclosed PUBLIC literal');
2521    
2522 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2523     !!!cp (192);
2524     $self->{state} = DATA_STATE;
2525     $self->{s_kwd} = '';
2526     $self->{ct}->{quirks} = 1;
2527     } else {
2528     !!!cp (192.1);
2529     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2530     }
2531    
2532 wakaba 1.1 !!!next-input-character;
2533 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2534 wakaba 1.1 redo A;
2535     } elsif ($self->{nc} == -1) {
2536     !!!parse-error (type => 'unclosed PUBLIC literal');
2537    
2538 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2539     !!!cp (193);
2540     $self->{state} = DATA_STATE;
2541     $self->{s_kwd} = '';
2542     $self->{ct}->{quirks} = 1;
2543     } else {
2544     !!!cp (193.1);
2545     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2546     }
2547    
2548 wakaba 1.1 ## reconsume
2549 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2550 wakaba 1.1 redo A;
2551     } else {
2552     !!!cp (194);
2553 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2554 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2555     length $self->{ct}->{pubid});
2556    
2557     ## Stay in the state
2558     !!!next-input-character;
2559     redo A;
2560     }
2561     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2562     if ($is_space->{$self->{nc}}) {
2563     !!!cp (195);
2564     ## Stay in the state
2565     !!!next-input-character;
2566     redo A;
2567     } elsif ($self->{nc} == 0x0022) { # "
2568     !!!cp (196);
2569 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2570 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2571     !!!next-input-character;
2572     redo A;
2573     } elsif ($self->{nc} == 0x0027) { # '
2574     !!!cp (197);
2575 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2576 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2577     !!!next-input-character;
2578     redo A;
2579     } elsif ($self->{nc} == 0x003E) { # >
2580 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2581     if ($self->{is_xml}) {
2582     !!!cp (198.1);
2583     !!!parse-error (type => 'no SYSTEM literal');
2584     } else {
2585     !!!cp (198);
2586     }
2587     $self->{state} = DATA_STATE;
2588     $self->{s_kwd} = '';
2589 wakaba 1.12 } else {
2590 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2591     !!!cp (198.2);
2592     } else {
2593     !!!cp (198.3);
2594     !!!parse-error (type => 'no SYSTEM literal');
2595     }
2596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597 wakaba 1.12 }
2598 wakaba 1.16
2599 wakaba 1.1 !!!next-input-character;
2600 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2601 wakaba 1.1 redo A;
2602     } elsif ($self->{nc} == -1) {
2603 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2604     !!!cp (199);
2605     !!!parse-error (type => 'unclosed DOCTYPE');
2606    
2607     $self->{state} = DATA_STATE;
2608     $self->{s_kwd} = '';
2609     $self->{ct}->{quirks} = 1;
2610     } else {
2611     !!!parse-error (type => 'unclosed md'); ## TODO: type
2612     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2613     }
2614    
2615 wakaba 1.1 ## reconsume
2616 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2617 wakaba 1.1 redo A;
2618 wakaba 1.16 } elsif ($self->{is_xml} and
2619     $self->{ct}->{type} == DOCTYPE_TOKEN and
2620     $self->{nc} == 0x005B) { # [
2621 wakaba 1.12 !!!cp (200.1);
2622     !!!parse-error (type => 'no SYSTEM literal');
2623     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2624     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2625 wakaba 1.13 $self->{in_subset} = 1;
2626 wakaba 1.12 !!!next-input-character;
2627 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2628 wakaba 1.12 redo A;
2629 wakaba 1.1 } else {
2630     !!!parse-error (type => 'string after PUBLIC literal');
2631    
2632 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2633     !!!cp (200);
2634     $self->{ct}->{quirks} = 1;
2635     $self->{state} = BOGUS_DOCTYPE_STATE;
2636     } else {
2637     !!!cp (200.2);
2638     $self->{state} = BOGUS_MD_STATE;
2639     }
2640    
2641 wakaba 1.1 !!!next-input-character;
2642     redo A;
2643     }
2644     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2645     if ($is_space->{$self->{nc}}) {
2646     !!!cp (201);
2647     ## Stay in the state
2648     !!!next-input-character;
2649     redo A;
2650     } elsif ($self->{nc} == 0x0022) { # "
2651     !!!cp (202);
2652     $self->{ct}->{sysid} = ''; # DOCTYPE
2653     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2654     !!!next-input-character;
2655     redo A;
2656     } elsif ($self->{nc} == 0x0027) { # '
2657     !!!cp (203);
2658     $self->{ct}->{sysid} = ''; # DOCTYPE
2659     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2660     !!!next-input-character;
2661     redo A;
2662     } elsif ($self->{nc} == 0x003E) { # >
2663     !!!parse-error (type => 'no SYSTEM literal');
2664     !!!next-input-character;
2665    
2666 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2667     !!!cp (204);
2668     $self->{state} = DATA_STATE;
2669     $self->{s_kwd} = '';
2670     $self->{ct}->{quirks} = 1;
2671     } else {
2672     !!!cp (204.1);
2673     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2674     }
2675 wakaba 1.1
2676 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2677 wakaba 1.1 redo A;
2678     } elsif ($self->{nc} == -1) {
2679 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2680     !!!cp (205);
2681     !!!parse-error (type => 'unclosed DOCTYPE');
2682     $self->{state} = DATA_STATE;
2683     $self->{s_kwd} = '';
2684     $self->{ct}->{quirks} = 1;
2685     } else {
2686     !!!cp (205.1);
2687     !!!parse-error (type => 'unclosed md'); ## TODO: type
2688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2689     }
2690    
2691 wakaba 1.1 ## reconsume
2692 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2693 wakaba 1.1 redo A;
2694 wakaba 1.16 } elsif ($self->{is_xml} and
2695     $self->{ct}->{type} == DOCTYPE_TOKEN and
2696     $self->{nc} == 0x005B) { # [
2697 wakaba 1.12 !!!cp (206.1);
2698     !!!parse-error (type => 'no SYSTEM literal');
2699    
2700     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2701     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2702 wakaba 1.13 $self->{in_subset} = 1;
2703 wakaba 1.12 !!!next-input-character;
2704 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2705 wakaba 1.12 redo A;
2706 wakaba 1.1 } else {
2707     !!!parse-error (type => 'string after SYSTEM');
2708    
2709 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2710     !!!cp (206);
2711     $self->{ct}->{quirks} = 1;
2712     $self->{state} = BOGUS_DOCTYPE_STATE;
2713     } else {
2714     !!!cp (206.2);
2715     $self->{state} = BOGUS_MD_STATE;
2716     }
2717    
2718 wakaba 1.1 !!!next-input-character;
2719     redo A;
2720     }
2721     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2722     if ($self->{nc} == 0x0022) { # "
2723     !!!cp (207);
2724     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2725     !!!next-input-character;
2726     redo A;
2727 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2728 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2729    
2730 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2731     !!!cp (208);
2732     $self->{state} = DATA_STATE;
2733     $self->{s_kwd} = '';
2734     $self->{ct}->{quirks} = 1;
2735     } else {
2736     !!!cp (208.1);
2737     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2738     }
2739    
2740 wakaba 1.1 !!!next-input-character;
2741 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2742 wakaba 1.1 redo A;
2743     } elsif ($self->{nc} == -1) {
2744     !!!parse-error (type => 'unclosed SYSTEM literal');
2745    
2746 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2747     !!!cp (209);
2748     $self->{state} = DATA_STATE;
2749     $self->{s_kwd} = '';
2750     $self->{ct}->{quirks} = 1;
2751     } else {
2752     !!!cp (209.1);
2753     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2754     }
2755    
2756 wakaba 1.1 ## reconsume
2757 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2758 wakaba 1.1 redo A;
2759     } else {
2760     !!!cp (210);
2761 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2762 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2763     length $self->{ct}->{sysid});
2764    
2765     ## Stay in the state
2766     !!!next-input-character;
2767     redo A;
2768     }
2769     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2770     if ($self->{nc} == 0x0027) { # '
2771     !!!cp (211);
2772     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2773     !!!next-input-character;
2774     redo A;
2775 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2776 wakaba 1.1 !!!cp (212);
2777     !!!parse-error (type => 'unclosed SYSTEM literal');
2778    
2779     $self->{state} = DATA_STATE;
2780 wakaba 1.5 $self->{s_kwd} = '';
2781 wakaba 1.1 !!!next-input-character;
2782    
2783     $self->{ct}->{quirks} = 1;
2784     !!!emit ($self->{ct}); # DOCTYPE
2785    
2786     redo A;
2787     } elsif ($self->{nc} == -1) {
2788     !!!parse-error (type => 'unclosed SYSTEM literal');
2789    
2790 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2791     !!!cp (213);
2792     $self->{state} = DATA_STATE;
2793     $self->{s_kwd} = '';
2794     $self->{ct}->{quirks} = 1;
2795     } else {
2796     !!!cp (213.1);
2797     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2798     }
2799    
2800 wakaba 1.1 ## reconsume
2801 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2802 wakaba 1.1 redo A;
2803     } else {
2804     !!!cp (214);
2805 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2806 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2807     length $self->{ct}->{sysid});
2808    
2809     ## Stay in the state
2810     !!!next-input-character;
2811     redo A;
2812     }
2813     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2814     if ($is_space->{$self->{nc}}) {
2815     !!!cp (215);
2816     ## Stay in the state
2817     !!!next-input-character;
2818     redo A;
2819     } elsif ($self->{nc} == 0x003E) { # >
2820 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2821     !!!cp (216);
2822     $self->{state} = DATA_STATE;
2823     $self->{s_kwd} = '';
2824     } else {
2825     !!!cp (216.1);
2826     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2827     }
2828    
2829 wakaba 1.1 !!!next-input-character;
2830 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2831 wakaba 1.1 redo A;
2832 wakaba 1.16 ## TODO: "NDATA"
2833 wakaba 1.1 } elsif ($self->{nc} == -1) {
2834 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2835     !!!cp (217);
2836     !!!parse-error (type => 'unclosed DOCTYPE');
2837     $self->{state} = DATA_STATE;
2838     $self->{s_kwd} = '';
2839     $self->{ct}->{quirks} = 1;
2840     } else {
2841     !!!cp (217.1);
2842     !!!parse-error (type => 'unclosed md'); ## TODO: type
2843     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2844     }
2845    
2846 wakaba 1.1 ## reconsume
2847 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2848 wakaba 1.1 redo A;
2849 wakaba 1.16 } elsif ($self->{is_xml} and
2850     $self->{ct}->{type} == DOCTYPE_TOKEN and
2851     $self->{nc} == 0x005B) { # [
2852 wakaba 1.12 !!!cp (218.1);
2853     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2854     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2855 wakaba 1.13 $self->{in_subset} = 1;
2856 wakaba 1.12 !!!next-input-character;
2857 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2858 wakaba 1.12 redo A;
2859 wakaba 1.1 } else {
2860     !!!parse-error (type => 'string after SYSTEM literal');
2861    
2862 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2863     !!!cp (218);
2864     #$self->{ct}->{quirks} = 1;
2865     $self->{state} = BOGUS_DOCTYPE_STATE;
2866     } else {
2867     !!!cp (218.2);
2868     $self->{state} = BOGUS_MD_STATE;
2869     }
2870    
2871 wakaba 1.1 !!!next-input-character;
2872     redo A;
2873     }
2874     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2875     if ($self->{nc} == 0x003E) { # >
2876     !!!cp (219);
2877     $self->{state} = DATA_STATE;
2878 wakaba 1.5 $self->{s_kwd} = '';
2879 wakaba 1.1 !!!next-input-character;
2880    
2881     !!!emit ($self->{ct}); # DOCTYPE
2882    
2883     redo A;
2884 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2885 wakaba 1.13 !!!cp (220.1);
2886     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2887     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2888     $self->{in_subset} = 1;
2889     !!!next-input-character;
2890     !!!emit ($self->{ct}); # DOCTYPE
2891     redo A;
2892 wakaba 1.1 } elsif ($self->{nc} == -1) {
2893     !!!cp (220);
2894     $self->{state} = DATA_STATE;
2895 wakaba 1.5 $self->{s_kwd} = '';
2896 wakaba 1.1 ## reconsume
2897    
2898     !!!emit ($self->{ct}); # DOCTYPE
2899    
2900     redo A;
2901     } else {
2902     !!!cp (221);
2903     my $s = '';
2904 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2905 wakaba 1.1
2906     ## Stay in the state
2907     !!!next-input-character;
2908     redo A;
2909     }
2910     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2911     ## NOTE: "CDATA section state" in the spec is jointly implemented
2912     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2913     ## and |CDATA_SECTION_MSE2_STATE|.
2914 wakaba 1.10
2915     ## XML5: "CDATA state".
2916 wakaba 1.1
2917     if ($self->{nc} == 0x005D) { # ]
2918     !!!cp (221.1);
2919     $self->{state} = CDATA_SECTION_MSE1_STATE;
2920     !!!next-input-character;
2921     redo A;
2922     } elsif ($self->{nc} == -1) {
2923 wakaba 1.6 if ($self->{is_xml}) {
2924 wakaba 1.8 !!!cp (221.11);
2925 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2926 wakaba 1.8 } else {
2927     !!!cp (221.12);
2928 wakaba 1.6 }
2929    
2930 wakaba 1.1 $self->{state} = DATA_STATE;
2931 wakaba 1.5 $self->{s_kwd} = '';
2932 wakaba 1.10 ## Reconsume.
2933 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2934     !!!cp (221.2);
2935     !!!emit ($self->{ct}); # character
2936     } else {
2937     !!!cp (221.3);
2938     ## No token to emit. $self->{ct} is discarded.
2939     }
2940     redo A;
2941     } else {
2942     !!!cp (221.4);
2943     $self->{ct}->{data} .= chr $self->{nc};
2944     $self->{read_until}->($self->{ct}->{data},
2945     q<]>,
2946     length $self->{ct}->{data});
2947    
2948     ## Stay in the state.
2949     !!!next-input-character;
2950     redo A;
2951     }
2952    
2953     ## ISSUE: "text tokens" in spec.
2954     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2955 wakaba 1.10 ## XML5: "CDATA bracket state".
2956    
2957 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2958     !!!cp (221.5);
2959     $self->{state} = CDATA_SECTION_MSE2_STATE;
2960     !!!next-input-character;
2961     redo A;
2962     } else {
2963     !!!cp (221.6);
2964 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
2965 wakaba 1.1 $self->{ct}->{data} .= ']';
2966 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2967 wakaba 1.1 ## Reconsume.
2968     redo A;
2969     }
2970     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2971 wakaba 1.10 ## XML5: "CDATA end state".
2972    
2973 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2974     $self->{state} = DATA_STATE;
2975 wakaba 1.5 $self->{s_kwd} = '';
2976 wakaba 1.1 !!!next-input-character;
2977     if (length $self->{ct}->{data}) { # character
2978     !!!cp (221.7);
2979     !!!emit ($self->{ct}); # character
2980     } else {
2981     !!!cp (221.8);
2982     ## No token to emit. $self->{ct} is discarded.
2983     }
2984     redo A;
2985     } elsif ($self->{nc} == 0x005D) { # ]
2986     !!!cp (221.9); # character
2987     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2988     ## Stay in the state.
2989     !!!next-input-character;
2990     redo A;
2991     } else {
2992     !!!cp (221.11);
2993     $self->{ct}->{data} .= ']]'; # character
2994     $self->{state} = CDATA_SECTION_STATE;
2995 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2996 wakaba 1.1 redo A;
2997     }
2998     } elsif ($self->{state} == ENTITY_STATE) {
2999     if ($is_space->{$self->{nc}} or
3000     {
3001     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3002     $self->{entity_add} => 1,
3003     }->{$self->{nc}}) {
3004     !!!cp (1001);
3005     ## Don't consume
3006     ## No error
3007     ## Return nothing.
3008     #
3009     } elsif ($self->{nc} == 0x0023) { # #
3010     !!!cp (999);
3011     $self->{state} = ENTITY_HASH_STATE;
3012 wakaba 1.12 $self->{kwd} = '#';
3013 wakaba 1.1 !!!next-input-character;
3014     redo A;
3015     } elsif ((0x0041 <= $self->{nc} and
3016     $self->{nc} <= 0x005A) or # A..Z
3017     (0x0061 <= $self->{nc} and
3018     $self->{nc} <= 0x007A)) { # a..z
3019     !!!cp (998);
3020     require Whatpm::_NamedEntityList;
3021     $self->{state} = ENTITY_NAME_STATE;
3022 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3023     $self->{entity__value} = $self->{kwd};
3024 wakaba 1.1 $self->{entity__match} = 0;
3025     !!!next-input-character;
3026     redo A;
3027     } else {
3028     !!!cp (1027);
3029     !!!parse-error (type => 'bare ero');
3030     ## Return nothing.
3031     #
3032     }
3033    
3034     ## NOTE: No character is consumed by the "consume a character
3035     ## reference" algorithm. In other words, there is an "&" character
3036     ## that does not introduce a character reference, which would be
3037     ## appended to the parent element or the attribute value in the
3038     ## later processing of the tokenizer.
3039    
3040     if ($self->{prev_state} == DATA_STATE) {
3041     !!!cp (997);
3042     $self->{state} = $self->{prev_state};
3043 wakaba 1.5 $self->{s_kwd} = '';
3044 wakaba 1.1 ## Reconsume.
3045     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3046     line => $self->{line_prev},
3047     column => $self->{column_prev},
3048     });
3049     redo A;
3050     } else {
3051     !!!cp (996);
3052     $self->{ca}->{value} .= '&';
3053     $self->{state} = $self->{prev_state};
3054 wakaba 1.5 $self->{s_kwd} = '';
3055 wakaba 1.1 ## Reconsume.
3056     redo A;
3057     }
3058     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3059     if ($self->{nc} == 0x0078 or # x
3060     $self->{nc} == 0x0058) { # X
3061     !!!cp (995);
3062     $self->{state} = HEXREF_X_STATE;
3063 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3064 wakaba 1.1 !!!next-input-character;
3065     redo A;
3066     } elsif (0x0030 <= $self->{nc} and
3067     $self->{nc} <= 0x0039) { # 0..9
3068     !!!cp (994);
3069     $self->{state} = NCR_NUM_STATE;
3070 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3071 wakaba 1.1 !!!next-input-character;
3072     redo A;
3073     } else {
3074     !!!parse-error (type => 'bare nero',
3075     line => $self->{line_prev},
3076     column => $self->{column_prev} - 1);
3077    
3078     ## NOTE: According to the spec algorithm, nothing is returned,
3079     ## and then "&#" is appended to the parent element or the attribute
3080     ## value in the later processing.
3081    
3082     if ($self->{prev_state} == DATA_STATE) {
3083     !!!cp (1019);
3084     $self->{state} = $self->{prev_state};
3085 wakaba 1.5 $self->{s_kwd} = '';
3086 wakaba 1.1 ## Reconsume.
3087     !!!emit ({type => CHARACTER_TOKEN,
3088     data => '&#',
3089     line => $self->{line_prev},
3090     column => $self->{column_prev} - 1,
3091     });
3092     redo A;
3093     } else {
3094     !!!cp (993);
3095     $self->{ca}->{value} .= '&#';
3096     $self->{state} = $self->{prev_state};
3097 wakaba 1.5 $self->{s_kwd} = '';
3098 wakaba 1.1 ## Reconsume.
3099     redo A;
3100     }
3101     }
3102     } elsif ($self->{state} == NCR_NUM_STATE) {
3103     if (0x0030 <= $self->{nc} and
3104     $self->{nc} <= 0x0039) { # 0..9
3105     !!!cp (1012);
3106 wakaba 1.12 $self->{kwd} *= 10;
3107     $self->{kwd} += $self->{nc} - 0x0030;
3108 wakaba 1.1
3109     ## Stay in the state.
3110     !!!next-input-character;
3111     redo A;
3112     } elsif ($self->{nc} == 0x003B) { # ;
3113     !!!cp (1013);
3114     !!!next-input-character;
3115     #
3116     } else {
3117     !!!cp (1014);
3118     !!!parse-error (type => 'no refc');
3119     ## Reconsume.
3120     #
3121     }
3122    
3123 wakaba 1.12 my $code = $self->{kwd};
3124 wakaba 1.1 my $l = $self->{line_prev};
3125     my $c = $self->{column_prev};
3126     if ($charref_map->{$code}) {
3127     !!!cp (1015);
3128     !!!parse-error (type => 'invalid character reference',
3129     text => (sprintf 'U+%04X', $code),
3130     line => $l, column => $c);
3131     $code = $charref_map->{$code};
3132     } elsif ($code > 0x10FFFF) {
3133     !!!cp (1016);
3134     !!!parse-error (type => 'invalid character reference',
3135     text => (sprintf 'U-%08X', $code),
3136     line => $l, column => $c);
3137     $code = 0xFFFD;
3138     }
3139    
3140     if ($self->{prev_state} == DATA_STATE) {
3141     !!!cp (992);
3142     $self->{state} = $self->{prev_state};
3143 wakaba 1.5 $self->{s_kwd} = '';
3144 wakaba 1.1 ## Reconsume.
3145     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3146 wakaba 1.7 has_reference => 1,
3147 wakaba 1.1 line => $l, column => $c,
3148     });
3149     redo A;
3150     } else {
3151     !!!cp (991);
3152     $self->{ca}->{value} .= chr $code;
3153     $self->{ca}->{has_reference} = 1;
3154     $self->{state} = $self->{prev_state};
3155 wakaba 1.5 $self->{s_kwd} = '';
3156 wakaba 1.1 ## Reconsume.
3157     redo A;
3158     }
3159     } elsif ($self->{state} == HEXREF_X_STATE) {
3160     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3161     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3162     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3163     # 0..9, A..F, a..f
3164     !!!cp (990);
3165     $self->{state} = HEXREF_HEX_STATE;
3166 wakaba 1.12 $self->{kwd} = 0;
3167 wakaba 1.1 ## Reconsume.
3168     redo A;
3169     } else {
3170     !!!parse-error (type => 'bare hcro',
3171     line => $self->{line_prev},
3172     column => $self->{column_prev} - 2);
3173    
3174     ## NOTE: According to the spec algorithm, nothing is returned,
3175     ## and then "&#" followed by "X" or "x" is appended to the parent
3176     ## element or the attribute value in the later processing.
3177    
3178     if ($self->{prev_state} == DATA_STATE) {
3179     !!!cp (1005);
3180     $self->{state} = $self->{prev_state};
3181 wakaba 1.5 $self->{s_kwd} = '';
3182 wakaba 1.1 ## Reconsume.
3183     !!!emit ({type => CHARACTER_TOKEN,
3184 wakaba 1.12 data => '&' . $self->{kwd},
3185 wakaba 1.1 line => $self->{line_prev},
3186 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3187 wakaba 1.1 });
3188     redo A;
3189     } else {
3190     !!!cp (989);
3191 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3192 wakaba 1.1 $self->{state} = $self->{prev_state};
3193 wakaba 1.5 $self->{s_kwd} = '';
3194 wakaba 1.1 ## Reconsume.
3195     redo A;
3196     }
3197     }
3198     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3199     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3200     # 0..9
3201     !!!cp (1002);
3202 wakaba 1.12 $self->{kwd} *= 0x10;
3203     $self->{kwd} += $self->{nc} - 0x0030;
3204 wakaba 1.1 ## Stay in the state.
3205     !!!next-input-character;
3206     redo A;
3207     } elsif (0x0061 <= $self->{nc} and
3208     $self->{nc} <= 0x0066) { # a..f
3209     !!!cp (1003);
3210 wakaba 1.12 $self->{kwd} *= 0x10;
3211     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3212 wakaba 1.1 ## Stay in the state.
3213     !!!next-input-character;
3214     redo A;
3215     } elsif (0x0041 <= $self->{nc} and
3216     $self->{nc} <= 0x0046) { # A..F
3217     !!!cp (1004);
3218 wakaba 1.12 $self->{kwd} *= 0x10;
3219     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3220 wakaba 1.1 ## Stay in the state.
3221     !!!next-input-character;
3222     redo A;
3223     } elsif ($self->{nc} == 0x003B) { # ;
3224     !!!cp (1006);
3225     !!!next-input-character;
3226     #
3227     } else {
3228     !!!cp (1007);
3229     !!!parse-error (type => 'no refc',
3230     line => $self->{line},
3231     column => $self->{column});
3232     ## Reconsume.
3233     #
3234     }
3235    
3236 wakaba 1.12 my $code = $self->{kwd};
3237 wakaba 1.1 my $l = $self->{line_prev};
3238     my $c = $self->{column_prev};
3239     if ($charref_map->{$code}) {
3240     !!!cp (1008);
3241     !!!parse-error (type => 'invalid character reference',
3242     text => (sprintf 'U+%04X', $code),
3243     line => $l, column => $c);
3244     $code = $charref_map->{$code};
3245     } elsif ($code > 0x10FFFF) {
3246     !!!cp (1009);
3247     !!!parse-error (type => 'invalid character reference',
3248     text => (sprintf 'U-%08X', $code),
3249     line => $l, column => $c);
3250     $code = 0xFFFD;
3251     }
3252    
3253     if ($self->{prev_state} == DATA_STATE) {
3254     !!!cp (988);
3255     $self->{state} = $self->{prev_state};
3256 wakaba 1.5 $self->{s_kwd} = '';
3257 wakaba 1.1 ## Reconsume.
3258     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3259 wakaba 1.7 has_reference => 1,
3260 wakaba 1.1 line => $l, column => $c,
3261     });
3262     redo A;
3263     } else {
3264     !!!cp (987);
3265     $self->{ca}->{value} .= chr $code;
3266     $self->{ca}->{has_reference} = 1;
3267     $self->{state} = $self->{prev_state};
3268 wakaba 1.5 $self->{s_kwd} = '';
3269 wakaba 1.1 ## Reconsume.
3270     redo A;
3271     }
3272     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3273 wakaba 1.12 if (length $self->{kwd} < 30 and
3274 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
3275     ((0x0041 <= $self->{nc} and # a
3276     $self->{nc} <= 0x005A) or # x
3277     (0x0061 <= $self->{nc} and # a
3278     $self->{nc} <= 0x007A) or # z
3279     (0x0030 <= $self->{nc} and # 0
3280     $self->{nc} <= 0x0039) or # 9
3281     $self->{nc} == 0x003B)) { # ;
3282     our $EntityChar;
3283 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3284     if (defined $EntityChar->{$self->{kwd}}) {
3285 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3286     !!!cp (1020);
3287 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3288 wakaba 1.1 $self->{entity__match} = 1;
3289     !!!next-input-character;
3290     #
3291     } else {
3292     !!!cp (1021);
3293 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3294 wakaba 1.1 $self->{entity__match} = -1;
3295     ## Stay in the state.
3296     !!!next-input-character;
3297     redo A;
3298     }
3299     } else {
3300     !!!cp (1022);
3301     $self->{entity__value} .= chr $self->{nc};
3302     $self->{entity__match} *= 2;
3303     ## Stay in the state.
3304     !!!next-input-character;
3305     redo A;
3306     }
3307     }
3308    
3309     my $data;
3310     my $has_ref;
3311     if ($self->{entity__match} > 0) {
3312     !!!cp (1023);
3313     $data = $self->{entity__value};
3314     $has_ref = 1;
3315     #
3316     } elsif ($self->{entity__match} < 0) {
3317     !!!parse-error (type => 'no refc');
3318     if ($self->{prev_state} != DATA_STATE and # in attribute
3319     $self->{entity__match} < -1) {
3320     !!!cp (1024);
3321 wakaba 1.12 $data = '&' . $self->{kwd};
3322 wakaba 1.1 #
3323     } else {
3324     !!!cp (1025);
3325     $data = $self->{entity__value};
3326     $has_ref = 1;
3327     #
3328     }
3329     } else {
3330     !!!cp (1026);
3331     !!!parse-error (type => 'bare ero',
3332     line => $self->{line_prev},
3333 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3334     $data = '&' . $self->{kwd};
3335 wakaba 1.1 #
3336     }
3337    
3338     ## NOTE: In these cases, when a character reference is found,
3339     ## it is consumed and a character token is returned, or, otherwise,
3340     ## nothing is consumed or returned, according to the spec algorithm.
3341     ## In this implementation, anything that has been examined by the
3342     ## tokenizer is appended to the parent element or the attribute value
3343     ## as a string at this stage (the literal string when there is no
3344     ## character reference, the entity-replaced string otherwise), since
3345     ## any characters that would not be consumed are appended in the data
3346     ## state or in an appropriate attribute value state anyway.
3347    
3348     if ($self->{prev_state} == DATA_STATE) {
3349     !!!cp (986);
3350     $self->{state} = $self->{prev_state};
3351 wakaba 1.5 $self->{s_kwd} = '';
3352 wakaba 1.1 ## Reconsume.
3353     !!!emit ({type => CHARACTER_TOKEN,
3354     data => $data,
3355 wakaba 1.7 has_reference => $has_ref,
3356 wakaba 1.1 line => $self->{line_prev},
3357 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3358 wakaba 1.1 });
3359     redo A;
3360     } else {
3361     !!!cp (985);
3362     $self->{ca}->{value} .= $data;
3363     $self->{ca}->{has_reference} = 1 if $has_ref;
3364     $self->{state} = $self->{prev_state};
3365 wakaba 1.5 $self->{s_kwd} = '';
3366 wakaba 1.1 ## Reconsume.
3367     redo A;
3368     }
3369 wakaba 1.8
3370     ## XML-only states
3371    
3372     } elsif ($self->{state} == PI_STATE) {
3373 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3374    
3375 wakaba 1.8 if ($is_space->{$self->{nc}} or
3376 wakaba 1.14 $self->{nc} == 0x003F or # ?
3377 wakaba 1.8 $self->{nc} == -1) {
3378 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3379     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3380     ## "DOCTYPE pi state": Parse error, switch to the "data
3381     ## state".
3382 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3383     line => $self->{line_prev},
3384     column => $self->{column_prev}
3385     - 1 * ($self->{nc} != -1));
3386     $self->{state} = BOGUS_COMMENT_STATE;
3387     ## Reconsume.
3388     $self->{ct} = {type => COMMENT_TOKEN,
3389     data => '?',
3390     line => $self->{line_prev},
3391     column => $self->{column_prev}
3392     - 1 * ($self->{nc} != -1),
3393     };
3394     redo A;
3395     } else {
3396 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3397 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3398     target => chr $self->{nc},
3399     data => '',
3400     line => $self->{line_prev},
3401     column => $self->{column_prev} - 1,
3402     };
3403     $self->{state} = PI_TARGET_STATE;
3404     !!!next-input-character;
3405     redo A;
3406     }
3407     } elsif ($self->{state} == PI_TARGET_STATE) {
3408     if ($is_space->{$self->{nc}}) {
3409     $self->{state} = PI_TARGET_AFTER_STATE;
3410     !!!next-input-character;
3411     redo A;
3412     } elsif ($self->{nc} == -1) {
3413     !!!parse-error (type => 'no pic'); ## TODO: type
3414 wakaba 1.13 if ($self->{in_subset}) {
3415     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3416     } else {
3417     $self->{state} = DATA_STATE;
3418     $self->{s_kwd} = '';
3419     }
3420 wakaba 1.8 ## Reconsume.
3421     !!!emit ($self->{ct}); # pi
3422     redo A;
3423     } elsif ($self->{nc} == 0x003F) { # ?
3424     $self->{state} = PI_AFTER_STATE;
3425     !!!next-input-character;
3426     redo A;
3427     } else {
3428     ## XML5: typo ("tag name" -> "target")
3429     $self->{ct}->{target} .= chr $self->{nc}; # pi
3430     !!!next-input-character;
3431     redo A;
3432     }
3433     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3434     if ($is_space->{$self->{nc}}) {
3435     ## Stay in the state.
3436     !!!next-input-character;
3437     redo A;
3438     } else {
3439     $self->{state} = PI_DATA_STATE;
3440     ## Reprocess.
3441     redo A;
3442     }
3443     } elsif ($self->{state} == PI_DATA_STATE) {
3444     if ($self->{nc} == 0x003F) { # ?
3445     $self->{state} = PI_DATA_AFTER_STATE;
3446     !!!next-input-character;
3447     redo A;
3448     } elsif ($self->{nc} == -1) {
3449     !!!parse-error (type => 'no pic'); ## TODO: type
3450 wakaba 1.13 if ($self->{in_subset}) {
3451 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3452 wakaba 1.13 } else {
3453     $self->{state} = DATA_STATE;
3454     $self->{s_kwd} = '';
3455     }
3456 wakaba 1.8 ## Reprocess.
3457     !!!emit ($self->{ct}); # pi
3458     redo A;
3459     } else {
3460     $self->{ct}->{data} .= chr $self->{nc}; # pi
3461     $self->{read_until}->($self->{ct}->{data}, q[?],
3462     length $self->{ct}->{data});
3463     ## Stay in the state.
3464     !!!next-input-character;
3465     ## Reprocess.
3466     redo A;
3467     }
3468     } elsif ($self->{state} == PI_AFTER_STATE) {
3469 wakaba 1.14 ## XML5: Part of "Pi after state".
3470    
3471 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3472 wakaba 1.13 if ($self->{in_subset}) {
3473     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3474     } else {
3475     $self->{state} = DATA_STATE;
3476     $self->{s_kwd} = '';
3477     }
3478 wakaba 1.8 !!!next-input-character;
3479     !!!emit ($self->{ct}); # pi
3480     redo A;
3481     } elsif ($self->{nc} == 0x003F) { # ?
3482     !!!parse-error (type => 'no s after target', ## TODO: type
3483     line => $self->{line_prev},
3484     column => $self->{column_prev}); ## XML5: no error
3485     $self->{ct}->{data} .= '?';
3486     $self->{state} = PI_DATA_AFTER_STATE;
3487     !!!next-input-character;
3488     redo A;
3489     } else {
3490     !!!parse-error (type => 'no s after target', ## TODO: type
3491     line => $self->{line_prev},
3492     column => $self->{column_prev}
3493     + 1 * ($self->{nc} == -1)); ## XML5: no error
3494     $self->{ct}->{data} .= '?'; ## XML5: not appended
3495     $self->{state} = PI_DATA_STATE;
3496     ## Reprocess.
3497     redo A;
3498     }
3499     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3500 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3501    
3502 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3503 wakaba 1.13 if ($self->{in_subset}) {
3504     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3505     } else {
3506     $self->{state} = DATA_STATE;
3507     $self->{s_kwd} = '';
3508     }
3509 wakaba 1.8 !!!next-input-character;
3510     !!!emit ($self->{ct}); # pi
3511     redo A;
3512     } elsif ($self->{nc} == 0x003F) { # ?
3513     $self->{ct}->{data} .= '?';
3514     ## Stay in the state.
3515     !!!next-input-character;
3516     redo A;
3517     } else {
3518     $self->{ct}->{data} .= '?'; ## XML5: not appended
3519     $self->{state} = PI_DATA_STATE;
3520     ## Reprocess.
3521     redo A;
3522     }
3523 wakaba 1.12
3524     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3525     if ($self->{nc} == 0x003C) { # <
3526 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3527 wakaba 1.12 !!!next-input-character;
3528     redo A;
3529     } elsif ($self->{nc} == 0x0025) { # %
3530     ## XML5: Not defined yet.
3531    
3532     ## TODO:
3533     !!!next-input-character;
3534     redo A;
3535     } elsif ($self->{nc} == 0x005D) { # ]
3536 wakaba 1.13 delete $self->{in_subset};
3537 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3538     !!!next-input-character;
3539     redo A;
3540     } elsif ($is_space->{$self->{nc}}) {
3541     ## Stay in the state.
3542     !!!next-input-character;
3543     redo A;
3544     } elsif ($self->{nc} == -1) {
3545     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3546 wakaba 1.13 delete $self->{in_subset};
3547 wakaba 1.12 $self->{state} = DATA_STATE;
3548     $self->{s_kwd} = '';
3549     ## Reconsume.
3550 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3551 wakaba 1.12 redo A;
3552     } else {
3553     unless ($self->{internal_subset_tainted}) {
3554     ## XML5: No parse error.
3555     !!!parse-error (type => 'string in internal subset');
3556     $self->{internal_subset_tainted} = 1;
3557     }
3558     ## Stay in the state.
3559     !!!next-input-character;
3560     redo A;
3561     }
3562     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3563     if ($self->{nc} == 0x003E) { # >
3564     $self->{state} = DATA_STATE;
3565     $self->{s_kwd} = '';
3566     !!!next-input-character;
3567 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3568 wakaba 1.12 redo A;
3569     } elsif ($self->{nc} == -1) {
3570     !!!parse-error (type => 'unclosed DOCTYPE');
3571     $self->{state} = DATA_STATE;
3572     $self->{s_kwd} = '';
3573     ## Reconsume.
3574 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3575 wakaba 1.12 redo A;
3576     } else {
3577     ## XML5: No parse error and stay in the state.
3578     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3579    
3580 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3581     !!!next-input-character;
3582     redo A;
3583     }
3584     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3585     if ($self->{nc} == 0x003E) { # >
3586     $self->{state} = DATA_STATE;
3587     $self->{s_kwd} = '';
3588     !!!next-input-character;
3589     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3590     redo A;
3591     } elsif ($self->{nc} == -1) {
3592     $self->{state} = DATA_STATE;
3593     $self->{s_kwd} = '';
3594     ## Reconsume.
3595     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3596     redo A;
3597     } else {
3598     ## Stay in the state.
3599     !!!next-input-character;
3600     redo A;
3601     }
3602     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3603     if ($self->{nc} == 0x0021) { # !
3604 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3605 wakaba 1.13 !!!next-input-character;
3606     redo A;
3607     } elsif ($self->{nc} == 0x003F) { # ?
3608     $self->{state} = PI_STATE;
3609     !!!next-input-character;
3610     redo A;
3611     } elsif ($self->{nc} == -1) {
3612     !!!parse-error (type => 'bare stago');
3613     $self->{state} = DATA_STATE;
3614     $self->{s_kwd} = '';
3615     ## Reconsume.
3616     redo A;
3617     } else {
3618     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3619     line => $self->{line_prev},
3620     column => $self->{column_prev});
3621     $self->{state} = BOGUS_COMMENT_STATE;
3622     $self->{ct} = {type => COMMENT_TOKEN,
3623     data => '',
3624     }; ## NOTE: Will be discarded.
3625 wakaba 1.12 !!!next-input-character;
3626     redo A;
3627     }
3628 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3629     ## XML5: "DOCTYPE markup declaration state".
3630    
3631     if ($self->{nc} == 0x002D) { # -
3632     $self->{state} = MD_HYPHEN_STATE;
3633     !!!next-input-character;
3634     redo A;
3635 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3636     $self->{nc} == 0x0065) { # e
3637 wakaba 1.14 $self->{state} = MD_E_STATE;
3638     $self->{kwd} = chr $self->{nc};
3639     !!!next-input-character;
3640     redo A;
3641 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3642     $self->{nc} == 0x0061) { # a
3643 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3644     $self->{kwd} = chr $self->{nc};
3645     !!!next-input-character;
3646     redo A;
3647 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3648     $self->{nc} == 0x006E) { # n
3649 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3650     $self->{kwd} = chr $self->{nc};
3651     !!!next-input-character;
3652     redo A;
3653     } else {
3654     #
3655     }
3656    
3657     ## XML5: No parse error.
3658     !!!parse-error (type => 'bogus comment',
3659     line => $self->{line_prev},
3660     column => $self->{column_prev} - 1);
3661     ## Reconsume.
3662     $self->{state} = BOGUS_COMMENT_STATE;
3663     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3664     redo A;
3665     } elsif ($self->{state} == MD_E_STATE) {
3666 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3667     $self->{nc} == 0x006E) { # n
3668 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3669     $self->{kwd} .= chr $self->{nc};
3670     !!!next-input-character;
3671     redo A;
3672 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3673     $self->{nc} == 0x006C) { # l
3674 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3675     $self->{state} = MD_ELEMENT_STATE;
3676     $self->{kwd} .= chr $self->{nc};
3677     !!!next-input-character;
3678     redo A;
3679     } else {
3680     ## XML5: No parse error.
3681     !!!parse-error (type => 'bogus comment',
3682     line => $self->{line_prev},
3683     column => $self->{column_prev} - 2
3684     + 1 * ($self->{nc} == -1));
3685     ## Reconsume.
3686     $self->{state} = BOGUS_COMMENT_STATE;
3687     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3688     redo A;
3689     }
3690     } elsif ($self->{state} == MD_ENTITY_STATE) {
3691 wakaba 1.17 if ($self->{nc} == [
3692     undef,
3693     undef,
3694     0x0054, # T
3695     0x0049, # I
3696     0x0054, # T
3697     ]->[length $self->{kwd}] or
3698     $self->{nc} == [
3699     undef,
3700     undef,
3701     0x0074, # t
3702     0x0069, # i
3703     0x0074, # t
3704     ]->[length $self->{kwd}]) {
3705 wakaba 1.14 ## Stay in the state.
3706     $self->{kwd} .= chr $self->{nc};
3707     !!!next-input-character;
3708     redo A;
3709 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3710     ($self->{nc} == 0x0059 or # Y
3711     $self->{nc} == 0x0079)) { # y
3712     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3713     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3714     text => 'ENTITY',
3715     line => $self->{line_prev},
3716     column => $self->{column_prev} - 4);
3717     }
3718     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3719 wakaba 1.14 line => $self->{line_prev},
3720     column => $self->{column_prev} - 6};
3721     $self->{state} = DOCTYPE_MD_STATE;
3722     !!!next-input-character;
3723     redo A;
3724     } else {
3725     !!!parse-error (type => 'bogus comment',
3726     line => $self->{line_prev},
3727     column => $self->{column_prev} - 1
3728     - (length $self->{kwd})
3729     + 1 * ($self->{nc} == -1));
3730     $self->{state} = BOGUS_COMMENT_STATE;
3731     ## Reconsume.
3732     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3733     redo A;
3734     }
3735     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3736 wakaba 1.17 if ($self->{nc} == [
3737     undef,
3738     undef,
3739     0x0045, # E
3740     0x004D, # M
3741     0x0045, # E
3742     0x004E, # N
3743     ]->[length $self->{kwd}] or
3744     $self->{nc} == [
3745     undef,
3746     undef,
3747     0x0065, # e
3748     0x006D, # m
3749     0x0065, # e
3750     0x006E, # n
3751     ]->[length $self->{kwd}]) {
3752 wakaba 1.14 ## Stay in the state.
3753     $self->{kwd} .= chr $self->{nc};
3754     !!!next-input-character;
3755     redo A;
3756 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3757     ($self->{nc} == 0x0054 or # T
3758     $self->{nc} == 0x0074)) { # t
3759     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3760     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3761     text => 'ELEMENT',
3762     line => $self->{line_prev},
3763     column => $self->{column_prev} - 5);
3764     }
3765 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3766     line => $self->{line_prev},
3767     column => $self->{column_prev} - 6};
3768     $self->{state} = DOCTYPE_MD_STATE;
3769     !!!next-input-character;
3770     redo A;
3771     } else {
3772     !!!parse-error (type => 'bogus comment',
3773     line => $self->{line_prev},
3774     column => $self->{column_prev} - 1
3775     - (length $self->{kwd})
3776     + 1 * ($self->{nc} == -1));
3777     $self->{state} = BOGUS_COMMENT_STATE;
3778     ## Reconsume.
3779     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3780     redo A;
3781     }
3782     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3783 wakaba 1.17 if ($self->{nc} == [
3784     undef,
3785     0x0054, # T
3786     0x0054, # T
3787     0x004C, # L
3788     0x0049, # I
3789     0x0053, # S
3790     ]->[length $self->{kwd}] or
3791     $self->{nc} == [
3792     undef,
3793     0x0074, # t
3794     0x0074, # t
3795     0x006C, # l
3796     0x0069, # i
3797     0x0073, # s
3798     ]->[length $self->{kwd}]) {
3799 wakaba 1.14 ## Stay in the state.
3800     $self->{kwd} .= chr $self->{nc};
3801     !!!next-input-character;
3802     redo A;
3803 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3804     ($self->{nc} == 0x0054 or # T
3805     $self->{nc} == 0x0074)) { # t
3806     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3807     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3808     text => 'ATTLIST',
3809     line => $self->{line_prev},
3810     column => $self->{column_prev} - 5);
3811     }
3812 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3813 wakaba 1.15 attrdefs => [],
3814 wakaba 1.14 line => $self->{line_prev},
3815     column => $self->{column_prev} - 6};
3816     $self->{state} = DOCTYPE_MD_STATE;
3817     !!!next-input-character;
3818     redo A;
3819     } else {
3820     !!!parse-error (type => 'bogus comment',
3821     line => $self->{line_prev},
3822     column => $self->{column_prev} - 1
3823     - (length $self->{kwd})
3824     + 1 * ($self->{nc} == -1));
3825     $self->{state} = BOGUS_COMMENT_STATE;
3826     ## Reconsume.
3827     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3828     redo A;
3829     }
3830     } elsif ($self->{state} == MD_NOTATION_STATE) {
3831 wakaba 1.17 if ($self->{nc} == [
3832     undef,
3833     0x004F, # O
3834     0x0054, # T
3835     0x0041, # A
3836     0x0054, # T
3837     0x0049, # I
3838     0x004F, # O
3839     ]->[length $self->{kwd}] or
3840     $self->{nc} == [
3841     undef,
3842     0x006F, # o
3843     0x0074, # t
3844     0x0061, # a
3845     0x0074, # t
3846     0x0069, # i
3847     0x006F, # o
3848     ]->[length $self->{kwd}]) {
3849 wakaba 1.14 ## Stay in the state.
3850     $self->{kwd} .= chr $self->{nc};
3851     !!!next-input-character;
3852     redo A;
3853 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
3854     ($self->{nc} == 0x004E or # N
3855     $self->{nc} == 0x006E)) { # n
3856     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3857     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3858     text => 'NOTATION',
3859     line => $self->{line_prev},
3860     column => $self->{column_prev} - 6);
3861     }
3862 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3863     line => $self->{line_prev},
3864     column => $self->{column_prev} - 6};
3865     $self->{state} = DOCTYPE_MD_STATE;
3866     !!!next-input-character;
3867     redo A;
3868     } else {
3869     !!!parse-error (type => 'bogus comment',
3870     line => $self->{line_prev},
3871     column => $self->{column_prev} - 1
3872     - (length $self->{kwd})
3873     + 1 * ($self->{nc} == -1));
3874     $self->{state} = BOGUS_COMMENT_STATE;
3875     ## Reconsume.
3876     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3877     redo A;
3878     }
3879     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3880     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3881     ## "DOCTYPE NOTATION state".
3882    
3883     if ($is_space->{$self->{nc}}) {
3884     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3885     $self->{state} = BEFORE_MD_NAME_STATE;
3886     !!!next-input-character;
3887     redo A;
3888     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3889     $self->{nc} == 0x0025) { # %
3890     ## XML5: Switch to the "DOCTYPE bogus comment state".
3891     !!!parse-error (type => 'no space before md name'); ## TODO: type
3892     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3893     !!!next-input-character;
3894     redo A;
3895     } elsif ($self->{nc} == -1) {
3896     !!!parse-error (type => 'unclosed md'); ## TODO: type
3897     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3898     ## Reconsume.
3899     redo A;
3900     } elsif ($self->{nc} == 0x003E) { # >
3901     ## XML5: Switch to the "DOCTYPE bogus comment state".
3902     !!!parse-error (type => 'no md name'); ## TODO: type
3903     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3904     !!!next-input-character;
3905     redo A;
3906     } else {
3907     ## XML5: Switch to the "DOCTYPE bogus comment state".
3908     !!!parse-error (type => 'no space before md name'); ## TODO: type
3909     $self->{state} = BEFORE_MD_NAME_STATE;
3910     redo A;
3911     }
3912     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3913     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3914     ## before state", "DOCTYPE ATTLIST name before state".
3915    
3916     if ($is_space->{$self->{nc}}) {
3917     ## Stay in the state.
3918     !!!next-input-character;
3919     redo A;
3920     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3921     $self->{nc} == 0x0025) { # %
3922     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3923     !!!next-input-character;
3924     redo A;
3925     } elsif ($self->{nc} == 0x003E) { # >
3926     ## XML5: Same as "Anything else".
3927     !!!parse-error (type => 'no md name'); ## TODO: type
3928     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3929     !!!next-input-character;
3930     redo A;
3931     } elsif ($self->{nc} == -1) {
3932     !!!parse-error (type => 'unclosed md'); ## TODO: type
3933     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3934     ## Reconsume.
3935     redo A;
3936     } else {
3937     ## XML5: [ATTLIST] Not defined yet.
3938     $self->{ct}->{name} .= chr $self->{nc};
3939     $self->{state} = MD_NAME_STATE;
3940     !!!next-input-character;
3941     redo A;
3942     }
3943     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3944     if ($is_space->{$self->{nc}}) {
3945     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3946     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3947     $self->{state} = BEFORE_MD_NAME_STATE;
3948     !!!next-input-character;
3949     redo A;
3950     } elsif ($self->{nc} == 0x003E) { # >
3951     ## XML5: Same as "Anything else".
3952     !!!parse-error (type => 'no md name'); ## TODO: type
3953     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3954     !!!next-input-character;
3955     redo A;
3956     } elsif ($self->{nc} == -1) {
3957     !!!parse-error (type => 'unclosed md');
3958     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3959     ## Reconsume.
3960     redo A;
3961     } else {
3962     ## XML5: No parse error.
3963     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3964     $self->{state} = BOGUS_COMMENT_STATE;
3965     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3966     ## Reconsume.
3967     redo A;
3968     }
3969     } elsif ($self->{state} == MD_NAME_STATE) {
3970     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3971    
3972     if ($is_space->{$self->{nc}}) {
3973 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3974     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3975     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
3976     ## TODO: ...
3977     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3978     } else { # ENTITY/NOTATION
3979     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3980     }
3981 wakaba 1.14 !!!next-input-character;
3982     redo A;
3983     } elsif ($self->{nc} == 0x003E) { # >
3984     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3985     #
3986     } else {
3987 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
3988 wakaba 1.14 }
3989     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3990     !!!next-input-character;
3991     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3992     redo A;
3993     } elsif ($self->{nc} == -1) {
3994     ## XML5: [ATTLIST] No parse error.
3995     !!!parse-error (type => 'unclosed md');
3996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3997     ## Reconsume.
3998     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3999     redo A;
4000     } else {
4001     ## XML5: [ATTLIST] Not defined yet.
4002     $self->{ct}->{name} .= chr $self->{nc};
4003     ## Stay in the state.
4004     !!!next-input-character;
4005     redo A;
4006     }
4007     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4008     if ($is_space->{$self->{nc}}) {
4009     ## Stay in the state.
4010     !!!next-input-character;
4011     redo A;
4012     } elsif ($self->{nc} == 0x003E) { # >
4013     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4014     !!!next-input-character;
4015     !!!emit ($self->{ct}); # ATTLIST
4016     redo A;
4017     } elsif ($self->{nc} == -1) {
4018     ## XML5: No parse error.
4019     !!!parse-error (type => 'unclosed md'); ## TODO: type
4020     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4021 wakaba 1.15 !!!emit ($self->{ct});
4022     redo A;
4023     } else {
4024     ## XML5: Not defined yet.
4025     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4026     tokens => [],
4027     line => $self->{line}, column => $self->{column}};
4028     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4029     !!!next-input-character;
4030     redo A;
4031     }
4032     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4033     if ($is_space->{$self->{nc}}) {
4034     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4035     !!!next-input-character;
4036     redo A;
4037     } elsif ($self->{nc} == 0x003E) { # >
4038     ## XML5: Same as "anything else".
4039     !!!parse-error (type => 'no attr type'); ## TODO: type
4040     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4041     !!!next-input-character;
4042     !!!emit ($self->{ct}); # ATTLIST
4043     redo A;
4044     } elsif ($self->{nc} == 0x0028) { # (
4045     ## XML5: Same as "anything else".
4046     !!!parse-error (type => 'no space before paren'); ## TODO: type
4047     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4048     !!!next-input-character;
4049     redo A;
4050     } elsif ($self->{nc} == -1) {
4051     ## XML5: No parse error.
4052     !!!parse-error (type => 'unclosed md'); ## TODO: type
4053     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4054     !!!next-input-character;
4055     !!!emit ($self->{ct}); # ATTLIST
4056     redo A;
4057     } else {
4058     ## XML5: Not defined yet.
4059     $self->{ca}->{name} .= chr $self->{nc};
4060     ## Stay in the state.
4061     !!!next-input-character;
4062     redo A;
4063     }
4064     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4065     if ($is_space->{$self->{nc}}) {
4066     ## Stay in the state.
4067     !!!next-input-character;
4068     redo A;
4069     } elsif ($self->{nc} == 0x003E) { # >
4070     ## XML5: Same as "anything else".
4071     !!!parse-error (type => 'no attr type'); ## TODO: type
4072     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4073     !!!next-input-character;
4074     !!!emit ($self->{ct}); # ATTLIST
4075     redo A;
4076     } elsif ($self->{nc} == 0x0028) { # (
4077     ## XML5: Same as "anything else".
4078     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4079     !!!next-input-character;
4080     redo A;
4081     } elsif ($self->{nc} == -1) {
4082     ## XML5: No parse error.
4083     !!!parse-error (type => 'unclosed md'); ## TODO: type
4084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4085     !!!next-input-character;
4086     !!!emit ($self->{ct});
4087 wakaba 1.14 redo A;
4088     } else {
4089     ## XML5: Not defined yet.
4090 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4091     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4092     !!!next-input-character;
4093     redo A;
4094     }
4095     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4096     if ($is_space->{$self->{nc}}) {
4097     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4098     !!!next-input-character;
4099     redo A;
4100     } elsif ($self->{nc} == 0x0023) { # #
4101     ## XML5: Same as "anything else".
4102     !!!parse-error (type => 'no space before default value'); ## TODO: type
4103     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4104     !!!next-input-character;
4105     redo A;
4106     } elsif ($self->{nc} == 0x0022) { # "
4107     ## XML5: Same as "anything else".
4108     !!!parse-error (type => 'no space before default value'); ## TODO: type
4109     $self->{ca}->{value} = '';
4110     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4111     !!!next-input-character;
4112     redo A;
4113     } elsif ($self->{nc} == 0x0027) { # '
4114     ## XML5: Same as "anything else".
4115     !!!parse-error (type => 'no space before default value'); ## TODO: type
4116     $self->{ca}->{value} = '';
4117     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4118     !!!next-input-character;
4119     redo A;
4120     } elsif ($self->{nc} == 0x003E) { # >
4121     ## XML5: Same as "anything else".
4122     !!!parse-error (type => 'no attr default'); ## TODO: type
4123     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4124     !!!next-input-character;
4125     !!!emit ($self->{ct}); # ATTLIST
4126     redo A;
4127     } elsif ($self->{nc} == 0x0028) { # (
4128     ## XML5: Same as "anything else".
4129     !!!parse-error (type => 'no space before paren'); ## TODO: type
4130     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4131     !!!next-input-character;
4132     redo A;
4133     } elsif ($self->{nc} == -1) {
4134     ## XML5: No parse error.
4135     !!!parse-error (type => 'unclosed md'); ## TODO: type
4136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137     !!!next-input-character;
4138     !!!emit ($self->{ct});
4139     redo A;
4140     } else {
4141     ## XML5: Not defined yet.
4142     $self->{ca}->{type} .= chr $self->{nc};
4143     ## Stay in the state.
4144     !!!next-input-character;
4145     redo A;
4146     }
4147     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4148     if ($is_space->{$self->{nc}}) {
4149     ## Stay in the state.
4150     !!!next-input-character;
4151     redo A;
4152     } elsif ($self->{nc} == 0x0028) { # (
4153     ## XML5: Same as "anything else".
4154     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4155     !!!next-input-character;
4156     redo A;
4157     } elsif ($self->{nc} == 0x0023) { # #
4158     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4159     !!!next-input-character;
4160     redo A;
4161     } elsif ($self->{nc} == 0x0022) { # "
4162     ## XML5: Same as "anything else".
4163     $self->{ca}->{value} = '';
4164     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4165     !!!next-input-character;
4166     redo A;
4167     } elsif ($self->{nc} == 0x0027) { # '
4168     ## XML5: Same as "anything else".
4169     $self->{ca}->{value} = '';
4170     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4171     !!!next-input-character;
4172     redo A;
4173     } elsif ($self->{nc} == 0x003E) { # >
4174     ## XML5: Same as "anything else".
4175     !!!parse-error (type => 'no attr default'); ## TODO: type
4176     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4177     !!!next-input-character;
4178     !!!emit ($self->{ct}); # ATTLIST
4179     redo A;
4180     } elsif ($self->{nc} == -1) {
4181     ## XML5: No parse error.
4182     !!!parse-error (type => 'unclosed md'); ## TODO: type
4183     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4184     !!!next-input-character;
4185     !!!emit ($self->{ct});
4186     redo A;
4187     } else {
4188     ## XML5: Switch to the "DOCTYPE bogus comment state".
4189     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4190     $self->{ca}->{value} = '';
4191     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4192     ## Reconsume.
4193     redo A;
4194     }
4195     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4196     if ($is_space->{$self->{nc}}) {
4197     ## Stay in the state.
4198     !!!next-input-character;
4199     redo A;
4200     } elsif ($self->{nc} == 0x007C) { # |
4201     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4202     ## Stay in the state.
4203     !!!next-input-character;
4204     redo A;
4205     } elsif ($self->{nc} == 0x0029) { # )
4206     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4207     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4208     !!!next-input-character;
4209     redo A;
4210     } elsif ($self->{nc} == 0x003E) { # >
4211     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4212     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4213     !!!next-input-character;
4214     !!!emit ($self->{ct}); # ATTLIST
4215     redo A;
4216     } elsif ($self->{nc} == -1) {
4217     ## XML5: No parse error.
4218     !!!parse-error (type => 'unclosed md'); ## TODO: type
4219     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4220     !!!next-input-character;
4221     !!!emit ($self->{ct});
4222     redo A;
4223     } else {
4224     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4225     $self->{state} = ALLOWED_TOKEN_STATE;
4226     !!!next-input-character;
4227     redo A;
4228     }
4229     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4230     if ($is_space->{$self->{nc}}) {
4231     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4232     !!!next-input-character;
4233     redo A;
4234     } elsif ($self->{nc} == 0x007C) { # |
4235     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4236     !!!next-input-character;
4237     redo A;
4238     } elsif ($self->{nc} == 0x0029) { # )
4239     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4240     !!!next-input-character;
4241     redo A;
4242     } elsif ($self->{nc} == 0x003E) { # >
4243     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4244     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4245     !!!next-input-character;
4246     !!!emit ($self->{ct}); # ATTLIST
4247     redo A;
4248     } elsif ($self->{nc} == -1) {
4249     ## XML5: No parse error.
4250     !!!parse-error (type => 'unclosed md'); ## TODO: type
4251     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4252     !!!next-input-character;
4253     !!!emit ($self->{ct});
4254     redo A;
4255     } else {
4256     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4257     ## Stay in the state.
4258     !!!next-input-character;
4259     redo A;
4260     }
4261     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4262     if ($is_space->{$self->{nc}}) {
4263     ## Stay in the state.
4264     !!!next-input-character;
4265     redo A;
4266     } elsif ($self->{nc} == 0x007C) { # |
4267     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4268     !!!next-input-character;
4269     redo A;
4270     } elsif ($self->{nc} == 0x0029) { # )
4271     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4272     !!!next-input-character;
4273     redo A;
4274     } elsif ($self->{nc} == 0x003E) { # >
4275     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4276     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4277     !!!next-input-character;
4278     !!!emit ($self->{ct}); # ATTLIST
4279     redo A;
4280     } elsif ($self->{nc} == -1) {
4281     ## XML5: No parse error.
4282     !!!parse-error (type => 'unclosed md'); ## TODO: type
4283     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4284     !!!next-input-character;
4285     !!!emit ($self->{ct});
4286     redo A;
4287     } else {
4288     !!!parse-error (type => 'space in allowed token', ## TODO: type
4289     line => $self->{line_prev},
4290     column => $self->{column_prev});
4291     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4292     $self->{state} = ALLOWED_TOKEN_STATE;
4293     !!!next-input-character;
4294     redo A;
4295     }
4296     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4297     if ($is_space->{$self->{nc}}) {
4298     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4299     !!!next-input-character;
4300     redo A;
4301     } elsif ($self->{nc} == 0x0023) { # #
4302     !!!parse-error (type => 'no space before default value'); ## TODO: type
4303     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4304     !!!next-input-character;
4305     redo A;
4306     } elsif ($self->{nc} == 0x0022) { # "
4307     !!!parse-error (type => 'no space before default value'); ## TODO: type
4308     $self->{ca}->{value} = '';
4309     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4310     !!!next-input-character;
4311     redo A;
4312     } elsif ($self->{nc} == 0x0027) { # '
4313     !!!parse-error (type => 'no space before default value'); ## TODO: type
4314     $self->{ca}->{value} = '';
4315     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4316     !!!next-input-character;
4317     redo A;
4318     } elsif ($self->{nc} == 0x003E) { # >
4319     !!!parse-error (type => 'no attr default'); ## TODO: type
4320     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4321     !!!next-input-character;
4322     !!!emit ($self->{ct}); # ATTLIST
4323     redo A;
4324     } elsif ($self->{nc} == -1) {
4325     !!!parse-error (type => 'unclosed md'); ## TODO: type
4326     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4327     !!!next-input-character;
4328     !!!emit ($self->{ct});
4329     redo A;
4330     } else {
4331     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4332     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4333     ## Reconsume.
4334     redo A;
4335     }
4336     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4337     if ($is_space->{$self->{nc}}) {
4338     ## Stay in the state.
4339     !!!next-input-character;
4340     redo A;
4341     } elsif ($self->{nc} == 0x0023) { # #
4342     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4343     !!!next-input-character;
4344     redo A;
4345     } elsif ($self->{nc} == 0x0022) { # "
4346     $self->{ca}->{value} = '';
4347     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4348     !!!next-input-character;
4349     redo A;
4350     } elsif ($self->{nc} == 0x0027) { # '
4351     $self->{ca}->{value} = '';
4352     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4353     !!!next-input-character;
4354     redo A;
4355     } elsif ($self->{nc} == 0x003E) { # >
4356     !!!parse-error (type => 'no attr default'); ## TODO: type
4357     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4358     !!!next-input-character;
4359     !!!emit ($self->{ct}); # ATTLIST
4360     redo A;
4361     } elsif ($self->{nc} == -1) {
4362     !!!parse-error (type => 'unclosed md'); ## TODO: type
4363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364     !!!next-input-character;
4365     !!!emit ($self->{ct});
4366     redo A;
4367     } else {
4368     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4369     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4370     ## Reconsume.
4371     redo A;
4372     }
4373     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4374     if ($is_space->{$self->{nc}}) {
4375     ## XML5: No parse error.
4376     !!!parse-error (type => 'no default type'); ## TODO: type
4377 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4378 wakaba 1.14 ## Reconsume.
4379     redo A;
4380 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4381     ## XML5: Same as "anything else".
4382     $self->{ca}->{value} = '';
4383     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4384     !!!next-input-character;
4385     redo A;
4386     } elsif ($self->{nc} == 0x0027) { # '
4387     ## XML5: Same as "anything else".
4388     $self->{ca}->{value} = '';
4389     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4390     !!!next-input-character;
4391     redo A;
4392     } elsif ($self->{nc} == 0x003E) { # >
4393     ## XML5: Same as "anything else".
4394     !!!parse-error (type => 'no attr default'); ## TODO: type
4395     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4396     !!!next-input-character;
4397     !!!emit ($self->{ct}); # ATTLIST
4398     redo A;
4399     } elsif ($self->{nc} == -1) {
4400     ## XML5: No parse error.
4401     !!!parse-error (type => 'unclosed md'); ## TODO: type
4402     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4403     !!!next-input-character;
4404     !!!emit ($self->{ct});
4405     redo A;
4406     } else {
4407     $self->{ca}->{default} = chr $self->{nc};
4408     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4409     !!!next-input-character;
4410     redo A;
4411 wakaba 1.14 }
4412 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4413     if ($is_space->{$self->{nc}}) {
4414     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4415     !!!next-input-character;
4416     redo A;
4417     } elsif ($self->{nc} == 0x0022) { # "
4418     ## XML5: Same as "anything else".
4419     !!!parse-error (type => 'no space before default value'); ## TODO: type
4420     $self->{ca}->{value} = '';
4421     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4422     !!!next-input-character;
4423     redo A;
4424     } elsif ($self->{nc} == 0x0027) { # '
4425     ## XML5: Same as "anything else".
4426     !!!parse-error (type => 'no space before default value'); ## TODO: type
4427     $self->{ca}->{value} = '';
4428     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4429     !!!next-input-character;
4430     redo A;
4431     } elsif ($self->{nc} == 0x003E) { # >
4432     ## XML5: Same as "anything else".
4433     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4434     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4435     !!!next-input-character;
4436     !!!emit ($self->{ct}); # ATTLIST
4437     redo A;
4438     } elsif ($self->{nc} == -1) {
4439     ## XML5: No parse error.
4440     !!!parse-error (type => 'unclosed md'); ## TODO: type
4441     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4442     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4443     !!!next-input-character;
4444     !!!emit ($self->{ct});
4445     redo A;
4446     } else {
4447     $self->{ca}->{default} .= chr $self->{nc};
4448     ## Stay in the state.
4449     !!!next-input-character;
4450     redo A;
4451     }
4452     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4453     if ($is_space->{$self->{nc}}) {
4454     ## Stay in the state.
4455     !!!next-input-character;
4456     redo A;
4457     } elsif ($self->{nc} == 0x0022) { # "
4458     $self->{ca}->{value} = '';
4459     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4460     !!!next-input-character;
4461     redo A;
4462     } elsif ($self->{nc} == 0x0027) { # '
4463     $self->{ca}->{value} = '';
4464     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4465     !!!next-input-character;
4466     redo A;
4467     } elsif ($self->{nc} == 0x003E) { # >
4468     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4469     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4470     !!!next-input-character;
4471     !!!emit ($self->{ct}); # ATTLIST
4472     redo A;
4473     } elsif ($self->{nc} == -1) {
4474     ## XML5: No parse error.
4475     !!!parse-error (type => 'unclosed md'); ## TODO: type
4476     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4477     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4478     !!!next-input-character;
4479     !!!emit ($self->{ct});
4480     redo A;
4481     } else {
4482     ## XML5: Not defined yet.
4483     if ($self->{ca}->{default} eq 'FIXED') {
4484     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4485     } else {
4486     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4487     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4488     }
4489     ## Reconsume.
4490     redo A;
4491     }
4492     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4493     if ($is_space->{$self->{nc}} or
4494     $self->{nc} == -1 or
4495     $self->{nc} == 0x003E) { # >
4496     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4497     ## Reconsume.
4498     redo A;
4499     } else {
4500     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4501     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4502     ## Reconsume.
4503     redo A;
4504 wakaba 1.16 }
4505    
4506     } elsif ($self->{state} == BOGUS_MD_STATE) {
4507     if ($self->{nc} == 0x003E) { # >
4508     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509     !!!next-input-character;
4510     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4511     redo A;
4512     } elsif ($self->{nc} == -1) {
4513     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4514     ## Reconsume.
4515     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4516     redo A;
4517     } else {
4518     ## Stay in the state.
4519     !!!next-input-character;
4520     redo A;
4521     }
4522 wakaba 1.1 } else {
4523     die "$0: $self->{state}: Unknown state";
4524     }
4525     } # A
4526    
4527     die "$0: _get_next_token: unexpected case";
4528     } # _get_next_token
4529    
4530     1;
4531 wakaba 1.17 ## $Date: 2008/10/18 11:34:49 $
4532 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24