/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.30 - (hide annotations) (download) (as text)
Sun Aug 16 05:24:47 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.29: +13 -6 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	16 Aug 2009 05:21:53 -0000
	* tokenizer-test-1.test: "<" in attribute names are now parse
	errors (HTML5 revision 3354).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	16 Aug 2009 05:23:17 -0000
	* Tokenizer.pm.src: Any "<" character in an attribute name is now
	a parse error (HTML5 revision 3354).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.30 our $VERSION=do{my @r=(q$Revision: 1.29 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; ## Built from the digits of the CVS |Revision| keyword.
4 wakaba 1.2
5     BEGIN { ## Compile-time setup of the Exporter interface.
6     require Exporter;
7     push our @ISA, 'Exporter';
8     ## Token type constants that can be imported by name.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25     ## The |:token| tag lists the same names as @EXPORT_OK; keep both lists in sync.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types (values of |$token->{type}|; exported under the |:token| tag)
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 } ## Emitted when the next input character is -1 (EOF).
55     sub CHARACTER_TOKEN () { 6 } ## Carries its text in |->{data}|.
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } ## Pull the token type constants into this package at compile time.
76    
77 wakaba 1.1 ## Content model flags (bit flags tested against |$self->{content_model}|)
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82     ## Content models, expressed as combinations of the flags above.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # neither "&" nor "<" is markup
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" and limited "<" markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" and any "<" markup
87    
88     ## Tokenizer states (values of |$self->{state}|; 1 and 12 are unassigned, their constants are commented out)
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec; used only for the |PCDATA| content model
144    
145 wakaba 1.12 ## XML-only states (51 .. 101, continuing the numbering of the states above)
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200     ## NOTE: Both literals below evaluate to the same value (2048, i.e. bit 11); presumably they are tested against different fields -- confirm in Whatpm::HTML.
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings: code point => replacement code point.
205     ## NOTE(review): the code that consults this table is outside this excerpt; presumably the numeric character reference states -- confirm.
206     my $charref_map = {
207     0x0D => 0x000A, # CR => LF
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
241     $charref_map->{$_} = 0xFFFD # Every code point listed below maps to U+FFFD.
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (the Unicode noncharacter block runs to U+FDEF; the range here stops at U+FDDF)
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
252     sub _initialize_tokenizer ($) { ## Reset the tokenizer fields of $self to their initial values.
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # start with "&" and full "<" markup recognized
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing}; # clear any pending self-closing flag
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character (-1 is the EOF marker)
275     #$self->{next_nc}
276     !!!next-input-character; ## Macro expanded when this .src file is preprocessed; presumably loads |$self->{nc}| -- TODO confirm.
277     $self->{token} = []; # queue of pending tokens; |_get_next_token| shifts from it first
278     # $self->{escape}
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
302     ## while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction stage.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
315     my $is_space = { ## Code points the tokenizer treats as white space; looked up with |$self->{nc}|.
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     !!!parse-error (type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345     !!!cp (0.1);
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353     !!!next-input-character;
354     redo A;
355     } elsif ($self->{nc} == 0x003C) { # <
356     !!!cp (0.2);
357     $self->{state} = TAG_OPEN_STATE;
358     !!!next-input-character;
359     redo A;
360     } elsif ($self->{nc} == -1) {
361     !!!cp (0.3);
362     !!!emit ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366     !!!cp (0.4);
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378     !!!next-input-character;
379     !!!emit ($token);
380     redo A;
381     } elsif ($self->{state} == DATA_STATE) {
382     $self->{s_kwd} = '' unless defined $self->{s_kwd};
383     if ($self->{nc} == 0x0026) { # &
384     $self->{s_kwd} = '';
385     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386     not $self->{escape}) {
387     !!!cp (1);
388     ## NOTE: In the spec, the tokenizer is switched to the
389     ## "entity data state". In this implementation, the tokenizer
390     ## is switched to the |ENTITY_STATE|, which is an implementation
391     ## of the "consume a character reference" algorithm.
392     $self->{entity_add} = -1;
393     $self->{prev_state} = DATA_STATE;
394     $self->{state} = ENTITY_STATE;
395     !!!next-input-character;
396     redo A;
397     } else {
398     !!!cp (2);
399     #
400     }
401     } elsif ($self->{nc} == 0x002D) { # -
402     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
404 wakaba 1.1 !!!cp (3);
405     $self->{escape} = 1; # unless $self->{escape};
406     $self->{s_kwd} = '--';
407     #
408 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
409 wakaba 1.1 !!!cp (4);
410     $self->{s_kwd} = '--';
411     #
412 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413     !!!cp (4.1);
414     $self->{s_kwd} .= '-';
415     #
416 wakaba 1.1 } else {
417     !!!cp (5);
418 wakaba 1.5 $self->{s_kwd} = '-';
419 wakaba 1.1 #
420     }
421     }
422    
423     #
424     } elsif ($self->{nc} == 0x0021) { # !
425     if (length $self->{s_kwd}) {
426     !!!cp (5.1);
427     $self->{s_kwd} .= '!';
428     #
429     } else {
430     !!!cp (5.2);
431     #$self->{s_kwd} = '';
432     #
433     }
434     #
435     } elsif ($self->{nc} == 0x003C) { # <
436     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438     not $self->{escape})) {
439     !!!cp (6);
440     $self->{state} = TAG_OPEN_STATE;
441     !!!next-input-character;
442     redo A;
443     } else {
444     !!!cp (7);
445     $self->{s_kwd} = '';
446     #
447     }
448     } elsif ($self->{nc} == 0x003E) { # >
449     if ($self->{escape} and
450     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451     if ($self->{s_kwd} eq '--') {
452     !!!cp (8);
453     delete $self->{escape};
454 wakaba 1.5 #
455 wakaba 1.1 } else {
456     !!!cp (9);
457 wakaba 1.5 #
458 wakaba 1.1 }
459 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460     !!!cp (9.1);
461     !!!parse-error (type => 'unmatched mse', ## TODO: type
462     line => $self->{line_prev},
463     column => $self->{column_prev} - 1);
464     #
465 wakaba 1.1 } else {
466     !!!cp (10);
467 wakaba 1.5 #
468 wakaba 1.1 }
469    
470     $self->{s_kwd} = '';
471     #
472 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
473     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474     !!!cp (10.1);
475     $self->{s_kwd} .= ']';
476     } elsif ($self->{s_kwd} eq ']]') {
477     !!!cp (10.2);
478     #
479     } else {
480     !!!cp (10.3);
481     $self->{s_kwd} = '';
482     }
483     #
484 wakaba 1.1 } elsif ($self->{nc} == -1) {
485     !!!cp (11);
486     $self->{s_kwd} = '';
487     !!!emit ({type => END_OF_FILE_TOKEN,
488     line => $self->{line}, column => $self->{column}});
489     last A; ## TODO: ok?
490     } else {
491     !!!cp (12);
492     $self->{s_kwd} = '';
493     #
494     }
495    
496     # Anything else
497     my $token = {type => CHARACTER_TOKEN,
498     data => chr $self->{nc},
499     line => $self->{line}, column => $self->{column},
500     };
501 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 wakaba 1.1 length $token->{data})) {
503     $self->{s_kwd} = '';
504     }
505    
506     ## Stay in the data state.
507 wakaba 1.5 if (not $self->{is_xml} and
508     $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 wakaba 1.1 !!!cp (13);
510     $self->{state} = PCDATA_STATE;
511     } else {
512     !!!cp (14);
513     ## Stay in the state.
514     }
515     !!!next-input-character;
516     !!!emit ($token);
517     redo A;
518     } elsif ($self->{state} == TAG_OPEN_STATE) {
519 wakaba 1.10 ## XML5: "tag state".
520    
521 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522     if ($self->{nc} == 0x002F) { # /
523     !!!cp (15);
524     !!!next-input-character;
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526     redo A;
527     } elsif ($self->{nc} == 0x0021) { # !
528     !!!cp (15.1);
529 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 wakaba 1.1 #
531     } else {
532     !!!cp (16);
533 wakaba 1.12 $self->{s_kwd} = '';
534 wakaba 1.1 #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539     !!!emit ({type => CHARACTER_TOKEN, data => '<',
540     line => $self->{line_prev},
541     column => $self->{column_prev},
542     });
543     redo A;
544     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545     if ($self->{nc} == 0x0021) { # !
546     !!!cp (17);
547     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548     !!!next-input-character;
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551     !!!cp (18);
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif (0x0041 <= $self->{nc} and
556     $self->{nc} <= 0x005A) { # A..Z
557     !!!cp (19);
558     $self->{ct}
559     = {type => START_TAG_TOKEN,
560 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 wakaba 1.1 line => $self->{line_prev},
562     column => $self->{column_prev}};
563     $self->{state} = TAG_NAME_STATE;
564     !!!next-input-character;
565     redo A;
566     } elsif (0x0061 <= $self->{nc} and
567     $self->{nc} <= 0x007A) { # a..z
568     !!!cp (20);
569     $self->{ct} = {type => START_TAG_TOKEN,
570     tag_name => chr ($self->{nc}),
571     line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (21);
578     !!!parse-error (type => 'empty start tag',
579     line => $self->{line_prev},
580     column => $self->{column_prev});
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584    
585     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586     line => $self->{line_prev},
587     column => $self->{column_prev},
588     });
589    
590     redo A;
591     } elsif ($self->{nc} == 0x003F) { # ?
592 wakaba 1.8 if ($self->{is_xml}) {
593     !!!cp (22.1);
594     $self->{state} = PI_STATE;
595     !!!next-input-character;
596     redo A;
597     } else {
598     !!!cp (22);
599     !!!parse-error (type => 'pio',
600     line => $self->{line_prev},
601     column => $self->{column_prev});
602     $self->{state} = BOGUS_COMMENT_STATE;
603     $self->{ct} = {type => COMMENT_TOKEN, data => '',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     };
607     ## $self->{nc} is intentionally left as is
608     redo A;
609     }
610 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 wakaba 1.1 !!!cp (23);
612     !!!parse-error (type => 'bare stago',
613     line => $self->{line_prev},
614     column => $self->{column_prev});
615     $self->{state} = DATA_STATE;
616 wakaba 1.5 $self->{s_kwd} = '';
617 wakaba 1.1 ## reconsume
618    
619     !!!emit ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623    
624     redo A;
625 wakaba 1.9 } else {
626     ## XML5: "<:" is a parse error.
627     !!!cp (23.1);
628     $self->{ct} = {type => START_TAG_TOKEN,
629     tag_name => chr ($self->{nc}),
630     line => $self->{line_prev},
631     column => $self->{column_prev}};
632     $self->{state} = TAG_NAME_STATE;
633     !!!next-input-character;
634     redo A;
635 wakaba 1.1 }
636     } else {
637     die "$0: $self->{content_model} in tag open";
638     }
639     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640     ## NOTE: The "close tag open state" in the spec is implemented as
641     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642    
643 wakaba 1.10 ## XML5: "end tag state".
644    
645 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647     if (defined $self->{last_stag_name}) {
648     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 wakaba 1.12 $self->{kwd} = '';
650 wakaba 1.1 ## Reconsume.
651     redo A;
652     } else {
653     ## No start tag token has ever been emitted
654     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655     !!!cp (28);
656     $self->{state} = DATA_STATE;
657 wakaba 1.5 $self->{s_kwd} = '';
658 wakaba 1.1 ## Reconsume.
659     !!!emit ({type => CHARACTER_TOKEN, data => '</',
660     line => $l, column => $c,
661     });
662     redo A;
663     }
664     }
665    
666     if (0x0041 <= $self->{nc} and
667     $self->{nc} <= 0x005A) { # A..Z
668     !!!cp (29);
669     $self->{ct}
670     = {type => END_TAG_TOKEN,
671 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 wakaba 1.1 line => $l, column => $c};
673     $self->{state} = TAG_NAME_STATE;
674     !!!next-input-character;
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678     !!!cp (30);
679     $self->{ct} = {type => END_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $l, column => $c};
682     $self->{state} = TAG_NAME_STATE;
683     !!!next-input-character;
684     redo A;
685     } elsif ($self->{nc} == 0x003E) { # >
686     !!!parse-error (type => 'empty end tag',
687     line => $self->{line_prev}, ## "<" in "</>"
688     column => $self->{column_prev} - 1);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.10 if ($self->{is_xml}) {
692     !!!cp (31);
693     ## XML5: No parse error.
694    
695     ## NOTE: This parser raises a parse error, since it supports
696     ## XML1, not XML5.
697    
698     ## NOTE: A short end tag token.
699     my $ct = {type => END_TAG_TOKEN,
700     tag_name => '',
701     line => $self->{line_prev},
702     column => $self->{column_prev} - 1,
703     };
704     !!!next-input-character;
705     !!!emit ($ct);
706     } else {
707     !!!cp (31.1);
708     !!!next-input-character;
709     }
710 wakaba 1.1 redo A;
711     } elsif ($self->{nc} == -1) {
712     !!!cp (32);
713     !!!parse-error (type => 'bare etago');
714 wakaba 1.5 $self->{s_kwd} = '';
715 wakaba 1.1 $self->{state} = DATA_STATE;
716     # reconsume
717    
718     !!!emit ({type => CHARACTER_TOKEN, data => '</',
719     line => $l, column => $c,
720     });
721    
722     redo A;
723 wakaba 1.10 } elsif (not $self->{is_xml} or
724     $is_space->{$self->{nc}}) {
725 wakaba 1.1 !!!cp (33);
726 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
727     line => $self->{line_prev}, # "<" of "</"
728     column => $self->{column_prev} - 1);
729 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec does not explicitly
736     ## state that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741 wakaba 1.10 } else {
742     ## XML5: "</:" is a parse error.
743     !!!cp (30.1);
744     $self->{ct} = {type => END_TAG_TOKEN,
745     tag_name => chr ($self->{nc}),
746     line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748     !!!next-input-character;
749     redo A;
750 wakaba 1.1 }
751     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 wakaba 1.1 if (length $ch) {
754     my $CH = $ch;
755     $ch =~ tr/a-z/A-Z/;
756     my $nch = chr $self->{nc};
757     if ($nch eq $ch or $nch eq $CH) {
758     !!!cp (24);
759     ## Stay in the state.
760 wakaba 1.12 $self->{kwd} .= $nch;
761 wakaba 1.1 !!!next-input-character;
762     redo A;
763     } else {
764     !!!cp (25);
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 ## Reconsume.
768     !!!emit ({type => CHARACTER_TOKEN,
769 wakaba 1.12 data => '</' . $self->{kwd},
770 wakaba 1.1 line => $self->{line_prev},
771 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
772 wakaba 1.1 });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782     !!!cp (26);
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785 wakaba 1.5 $self->{s_kwd} = '';
786 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
787 wakaba 1.12 data => '</' . $self->{kwd},
788 wakaba 1.1 line => $self->{line_prev},
789 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
790 wakaba 1.1 });
791     redo A;
792     } else {
793     !!!cp (27);
794     $self->{ct}
795     = {type => END_TAG_TOKEN,
796     tag_name => $self->{last_stag_name},
797     line => $self->{line_prev},
798 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
800     ## Reconsume.
801     redo A;
802     }
803     }
804     } elsif ($self->{state} == TAG_NAME_STATE) {
805     if ($is_space->{$self->{nc}}) {
806     !!!cp (34);
807     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808     !!!next-input-character;
809     redo A;
810     } elsif ($self->{nc} == 0x003E) { # >
811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
812     !!!cp (35);
813     $self->{last_stag_name} = $self->{ct}->{tag_name};
814     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816     #if ($self->{ct}->{attributes}) {
817     # ## NOTE: This should never be reached.
818     # !!! cp (36);
819     # !!! parse-error (type => 'end tag attribute');
820     #} else {
821     !!!cp (37);
822     #}
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (38);
836 wakaba 1.4 $self->{ct}->{tag_name}
837     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 wakaba 1.1 # start tag or end tag
839     ## Stay in this state
840     !!!next-input-character;
841     redo A;
842     } elsif ($self->{nc} == -1) {
843     !!!parse-error (type => 'unclosed tag');
844     if ($self->{ct}->{type} == START_TAG_TOKEN) {
845     !!!cp (39);
846     $self->{last_stag_name} = $self->{ct}->{tag_name};
847     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849     #if ($self->{ct}->{attributes}) {
850     # ## NOTE: This state should never be reached.
851     # !!! cp (40);
852     # !!! parse-error (type => 'end tag attribute');
853     #} else {
854     !!!cp (41);
855     #}
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } elsif ($self->{nc} == 0x002F) { # /
867     !!!cp (42);
868     $self->{state} = SELF_CLOSING_START_TAG_STATE;
869     !!!next-input-character;
870     redo A;
871     } else {
872     !!!cp (44);
873     $self->{ct}->{tag_name} .= chr $self->{nc};
874     # start tag or end tag
875     ## Stay in the state
876     !!!next-input-character;
877     redo A;
878     }
879     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 wakaba 1.11 ## XML5: "Tag attribute name before state".
881    
882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
883     !!!cp (45);
884     ## Stay in the state
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == 0x003E) { # >
888     if ($self->{ct}->{type} == START_TAG_TOKEN) {
889     !!!cp (46);
890     $self->{last_stag_name} = $self->{ct}->{tag_name};
891     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893     if ($self->{ct}->{attributes}) {
894     !!!cp (47);
895     !!!parse-error (type => 'end tag attribute');
896     } else {
897     !!!cp (48);
898     }
899     } else {
900     die "$0: $self->{ct}->{type}: Unknown token type";
901     }
902     $self->{state} = DATA_STATE;
903 wakaba 1.5 $self->{s_kwd} = '';
904 wakaba 1.1 !!!next-input-character;
905    
906     !!!emit ($self->{ct}); # start tag or end tag
907    
908     redo A;
909     } elsif (0x0041 <= $self->{nc} and
910     $self->{nc} <= 0x005A) { # A..Z
911     !!!cp (49);
912     $self->{ca}
913 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 wakaba 1.1 value => '',
915     line => $self->{line}, column => $self->{column}};
916     $self->{state} = ATTRIBUTE_NAME_STATE;
917     !!!next-input-character;
918     redo A;
919     } elsif ($self->{nc} == 0x002F) { # /
920     !!!cp (50);
921     $self->{state} = SELF_CLOSING_START_TAG_STATE;
922     !!!next-input-character;
923     redo A;
924     } elsif ($self->{nc} == -1) {
925     !!!parse-error (type => 'unclosed tag');
926     if ($self->{ct}->{type} == START_TAG_TOKEN) {
927     !!!cp (52);
928     $self->{last_stag_name} = $self->{ct}->{tag_name};
929     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931     if ($self->{ct}->{attributes}) {
932     !!!cp (53);
933     !!!parse-error (type => 'end tag attribute');
934     } else {
935     !!!cp (54);
936     }
937     } else {
938     die "$0: $self->{ct}->{type}: Unknown token type";
939     }
940     $self->{state} = DATA_STATE;
941 wakaba 1.5 $self->{s_kwd} = '';
942 wakaba 1.1 # reconsume
943    
944     !!!emit ($self->{ct}); # start tag or end tag
945    
946     redo A;
947     } else {
948     if ({
949     0x0022 => 1, # "
950     0x0027 => 1, # '
951 wakaba 1.30 0x003C => 1, # <
952 wakaba 1.1 0x003D => 1, # =
953     }->{$self->{nc}}) {
954     !!!cp (55);
955 wakaba 1.11 ## XML5: Not a parse error.
956 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
957     } else {
958     !!!cp (56);
959 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
960 wakaba 1.1 }
961     $self->{ca}
962     = {name => chr ($self->{nc}),
963     value => '',
964     line => $self->{line}, column => $self->{column}};
965     $self->{state} = ATTRIBUTE_NAME_STATE;
966     !!!next-input-character;
967     redo A;
968     }
969     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
970 wakaba 1.11 ## XML5: "Tag attribute name state".
971    
972 wakaba 1.1 my $before_leave = sub {
973     if (exists $self->{ct}->{attributes} # start tag or end tag
974     ->{$self->{ca}->{name}}) { # MUST
975     !!!cp (57);
976     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
977     ## Discard $self->{ca} # MUST
978     } else {
979     !!!cp (58);
980     $self->{ct}->{attributes}->{$self->{ca}->{name}}
981     = $self->{ca};
982 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
983 wakaba 1.1 }
984     }; # $before_leave
985    
986     if ($is_space->{$self->{nc}}) {
987     !!!cp (59);
988     $before_leave->();
989     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
990     !!!next-input-character;
991     redo A;
992     } elsif ($self->{nc} == 0x003D) { # =
993     !!!cp (60);
994     $before_leave->();
995     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
996     !!!next-input-character;
997     redo A;
998     } elsif ($self->{nc} == 0x003E) { # >
999 wakaba 1.11 if ($self->{is_xml}) {
1000     !!!cp (60.1);
1001     ## XML5: Not a parse error.
1002     !!!parse-error (type => 'no attr value'); ## TODO: type
1003     } else {
1004     !!!cp (60.2);
1005     }
1006    
1007 wakaba 1.1 $before_leave->();
1008     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1009     !!!cp (61);
1010     $self->{last_stag_name} = $self->{ct}->{tag_name};
1011     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1012     !!!cp (62);
1013     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1014     if ($self->{ct}->{attributes}) {
1015     !!!parse-error (type => 'end tag attribute');
1016     }
1017     } else {
1018     die "$0: $self->{ct}->{type}: Unknown token type";
1019     }
1020     $self->{state} = DATA_STATE;
1021 wakaba 1.5 $self->{s_kwd} = '';
1022 wakaba 1.1 !!!next-input-character;
1023    
1024     !!!emit ($self->{ct}); # start tag or end tag
1025    
1026     redo A;
1027     } elsif (0x0041 <= $self->{nc} and
1028     $self->{nc} <= 0x005A) { # A..Z
1029     !!!cp (63);
1030 wakaba 1.4 $self->{ca}->{name}
1031     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1032 wakaba 1.1 ## Stay in the state
1033     !!!next-input-character;
1034     redo A;
1035     } elsif ($self->{nc} == 0x002F) { # /
1036 wakaba 1.11 if ($self->{is_xml}) {
1037     !!!cp (64);
1038     ## XML5: Not a parse error.
1039     !!!parse-error (type => 'no attr value'); ## TODO: type
1040     } else {
1041     !!!cp (64.1);
1042     }
1043    
1044 wakaba 1.1 $before_leave->();
1045     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1046     !!!next-input-character;
1047     redo A;
1048     } elsif ($self->{nc} == -1) {
1049     !!!parse-error (type => 'unclosed tag');
1050     $before_leave->();
1051     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1052     !!!cp (66);
1053     $self->{last_stag_name} = $self->{ct}->{tag_name};
1054     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1055     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1056     if ($self->{ct}->{attributes}) {
1057     !!!cp (67);
1058     !!!parse-error (type => 'end tag attribute');
1059     } else {
1060     ## NOTE: This state should never be reached.
1061     !!!cp (68);
1062     }
1063     } else {
1064     die "$0: $self->{ct}->{type}: Unknown token type";
1065     }
1066     $self->{state} = DATA_STATE;
1067 wakaba 1.5 $self->{s_kwd} = '';
1068 wakaba 1.1 # reconsume
1069    
1070     !!!emit ($self->{ct}); # start tag or end tag
1071    
1072     redo A;
1073     } else {
1074 wakaba 1.30 if ({
1075     0x0022 => 1, # "
1076     0x0027 => 1, # '
1077     0x003C => 1, # <
1078     }->{$self->{nc}}) {
1079 wakaba 1.1 !!!cp (69);
1080 wakaba 1.11 ## XML5: Not a parse error.
1081 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1082     } else {
1083     !!!cp (70);
1084     }
1085     $self->{ca}->{name} .= chr ($self->{nc});
1086     ## Stay in the state
1087     !!!next-input-character;
1088     redo A;
1089     }
1090     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1091 wakaba 1.11 ## XML5: "Tag attribute name after state".
1092    
1093 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1094     !!!cp (71);
1095     ## Stay in the state
1096     !!!next-input-character;
1097     redo A;
1098     } elsif ($self->{nc} == 0x003D) { # =
1099     !!!cp (72);
1100     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1101     !!!next-input-character;
1102     redo A;
1103     } elsif ($self->{nc} == 0x003E) { # >
1104 wakaba 1.11 if ($self->{is_xml}) {
1105     !!!cp (72.1);
1106     ## XML5: Not a parse error.
1107     !!!parse-error (type => 'no attr value'); ## TODO: type
1108     } else {
1109     !!!cp (72.2);
1110     }
1111    
1112 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1113     !!!cp (73);
1114     $self->{last_stag_name} = $self->{ct}->{tag_name};
1115     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1116     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1117     if ($self->{ct}->{attributes}) {
1118     !!!cp (74);
1119     !!!parse-error (type => 'end tag attribute');
1120     } else {
1121     ## NOTE: This state should never be reached.
1122     !!!cp (75);
1123     }
1124     } else {
1125     die "$0: $self->{ct}->{type}: Unknown token type";
1126     }
1127     $self->{state} = DATA_STATE;
1128 wakaba 1.5 $self->{s_kwd} = '';
1129 wakaba 1.1 !!!next-input-character;
1130    
1131     !!!emit ($self->{ct}); # start tag or end tag
1132    
1133     redo A;
1134     } elsif (0x0041 <= $self->{nc} and
1135     $self->{nc} <= 0x005A) { # A..Z
1136     !!!cp (76);
1137     $self->{ca}
1138 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1139 wakaba 1.1 value => '',
1140     line => $self->{line}, column => $self->{column}};
1141     $self->{state} = ATTRIBUTE_NAME_STATE;
1142     !!!next-input-character;
1143     redo A;
1144     } elsif ($self->{nc} == 0x002F) { # /
1145 wakaba 1.11 if ($self->{is_xml}) {
1146     !!!cp (77);
1147     ## XML5: Not a parse error.
1148     !!!parse-error (type => 'no attr value'); ## TODO: type
1149     } else {
1150     !!!cp (77.1);
1151     }
1152    
1153 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1154     !!!next-input-character;
1155     redo A;
1156     } elsif ($self->{nc} == -1) {
1157     !!!parse-error (type => 'unclosed tag');
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159     !!!cp (79);
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164     !!!cp (80);
1165     !!!parse-error (type => 'end tag attribute');
1166     } else {
1167     ## NOTE: This state should never be reached.
1168     !!!cp (81);
1169     }
1170     } else {
1171     die "$0: $self->{ct}->{type}: Unknown token type";
1172     }
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1 $self->{state} = DATA_STATE;
1175     # reconsume
1176    
1177     !!!emit ($self->{ct}); # start tag or end tag
1178    
1179     redo A;
1180     } else {
1181 wakaba 1.11 if ($self->{is_xml}) {
1182     !!!cp (78.1);
1183     ## XML5: Not a parse error.
1184     !!!parse-error (type => 'no attr value'); ## TODO: type
1185     } else {
1186     !!!cp (78.2);
1187     }
1188    
1189 wakaba 1.30 if ({
1190     0x0022 => 1, # "
1191     0x0027 => 1, # '
1192     0x003C => 1, # <
1193     }->{$self->{nc}}) {
1194 wakaba 1.1 !!!cp (78);
1195 wakaba 1.11 ## XML5: Not a parse error.
1196 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1197     } else {
1198     !!!cp (82);
1199     }
1200     $self->{ca}
1201     = {name => chr ($self->{nc}),
1202     value => '',
1203     line => $self->{line}, column => $self->{column}};
1204     $self->{state} = ATTRIBUTE_NAME_STATE;
1205     !!!next-input-character;
1206     redo A;
1207     }
1208     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1209 wakaba 1.11 ## XML5: "Tag attribute value before state".
1210    
1211 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1212     !!!cp (83);
1213     ## Stay in the state
1214     !!!next-input-character;
1215     redo A;
1216     } elsif ($self->{nc} == 0x0022) { # "
1217     !!!cp (84);
1218     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1219     !!!next-input-character;
1220     redo A;
1221     } elsif ($self->{nc} == 0x0026) { # &
1222     !!!cp (85);
1223     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1224     ## reconsume
1225     redo A;
1226     } elsif ($self->{nc} == 0x0027) { # '
1227     !!!cp (86);
1228     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1229     !!!next-input-character;
1230     redo A;
1231     } elsif ($self->{nc} == 0x003E) { # >
1232     !!!parse-error (type => 'empty unquoted attribute value');
1233     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1234     !!!cp (87);
1235     $self->{last_stag_name} = $self->{ct}->{tag_name};
1236     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1237     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1238     if ($self->{ct}->{attributes}) {
1239     !!!cp (88);
1240     !!!parse-error (type => 'end tag attribute');
1241     } else {
1242     ## NOTE: This state should never be reached.
1243     !!!cp (89);
1244     }
1245     } else {
1246     die "$0: $self->{ct}->{type}: Unknown token type";
1247     }
1248     $self->{state} = DATA_STATE;
1249 wakaba 1.5 $self->{s_kwd} = '';
1250 wakaba 1.1 !!!next-input-character;
1251    
1252     !!!emit ($self->{ct}); # start tag or end tag
1253    
1254     redo A;
1255     } elsif ($self->{nc} == -1) {
1256     !!!parse-error (type => 'unclosed tag');
1257     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1258     !!!cp (90);
1259     $self->{last_stag_name} = $self->{ct}->{tag_name};
1260     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1261     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1262     if ($self->{ct}->{attributes}) {
1263     !!!cp (91);
1264     !!!parse-error (type => 'end tag attribute');
1265     } else {
1266     ## NOTE: This state should never be reached.
1267     !!!cp (92);
1268     }
1269     } else {
1270     die "$0: $self->{ct}->{type}: Unknown token type";
1271     }
1272     $self->{state} = DATA_STATE;
1273 wakaba 1.5 $self->{s_kwd} = '';
1274 wakaba 1.1 ## reconsume
1275    
1276     !!!emit ($self->{ct}); # start tag or end tag
1277    
1278     redo A;
1279     } else {
1280 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1281 wakaba 1.1 !!!cp (93);
1282 wakaba 1.11 ## XML5: Not a parse error.
1283 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1284 wakaba 1.11 } elsif ($self->{is_xml}) {
1285     !!!cp (93.1);
1286     ## XML5: No parse error.
1287     !!!parse-error (type => 'unquoted attr value'); ## TODO
1288 wakaba 1.1 } else {
1289     !!!cp (94);
1290     }
1291     $self->{ca}->{value} .= chr ($self->{nc});
1292     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1293     !!!next-input-character;
1294     redo A;
1295     }
1296     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1297 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1298     ## ATTLIST attribute value double quoted state".
1299 wakaba 1.11
1300 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1301 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1302     !!!cp (95.1);
1303     ## XML5: "DOCTYPE ATTLIST name after state".
1304     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1305     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1306     } else {
1307     !!!cp (95);
1308     ## XML5: "Tag attribute name before state".
1309     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1310     }
1311 wakaba 1.1 !!!next-input-character;
1312     redo A;
1313     } elsif ($self->{nc} == 0x0026) { # &
1314     !!!cp (96);
1315 wakaba 1.11 ## XML5: Not defined yet.
1316    
1317 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1318     ## "entity in attribute value state". In this implementation, the
1319     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1320     ## implementation of the "consume a character reference" algorithm.
1321     $self->{prev_state} = $self->{state};
1322     $self->{entity_add} = 0x0022; # "
1323     $self->{state} = ENTITY_STATE;
1324     !!!next-input-character;
1325     redo A;
1326 wakaba 1.25 } elsif ($self->{is_xml} and
1327     $is_space->{$self->{nc}}) {
1328     !!!cp (97.1);
1329     $self->{ca}->{value} .= ' ';
1330     ## Stay in the state.
1331     !!!next-input-character;
1332     redo A;
1333 wakaba 1.1 } elsif ($self->{nc} == -1) {
1334     !!!parse-error (type => 'unclosed attribute value');
1335     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1336     !!!cp (97);
1337     $self->{last_stag_name} = $self->{ct}->{tag_name};
1338 wakaba 1.15
1339     $self->{state} = DATA_STATE;
1340     $self->{s_kwd} = '';
1341     ## reconsume
1342     !!!emit ($self->{ct}); # start tag
1343     redo A;
1344 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1345     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1346     if ($self->{ct}->{attributes}) {
1347     !!!cp (98);
1348     !!!parse-error (type => 'end tag attribute');
1349     } else {
1350     ## NOTE: This state should never be reached.
1351     !!!cp (99);
1352     }
1353 wakaba 1.15
1354     $self->{state} = DATA_STATE;
1355     $self->{s_kwd} = '';
1356     ## reconsume
1357     !!!emit ($self->{ct}); # end tag
1358     redo A;
1359     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1360     ## XML5: No parse error above; not defined yet.
1361     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1362     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1363     ## Reconsume.
1364     !!!emit ($self->{ct}); # ATTLIST
1365     redo A;
1366 wakaba 1.1 } else {
1367     die "$0: $self->{ct}->{type}: Unknown token type";
1368     }
1369     } else {
1370 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1371 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1372     !!!cp (100);
1373     ## XML5: Not a parse error.
1374     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1375     } else {
1376     !!!cp (100.1);
1377     }
1378 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1379     $self->{read_until}->($self->{ca}->{value},
1380 wakaba 1.25 qq["&<\x09\x0C\x20],
1381 wakaba 1.1 length $self->{ca}->{value});
1382    
1383     ## Stay in the state
1384     !!!next-input-character;
1385     redo A;
1386     }
1387     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1388 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1389     ## ATTLIST attribute value single quoted state".
1390 wakaba 1.11
1391 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1392 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1393     !!!cp (101.1);
1394     ## XML5: "DOCTYPE ATTLIST name after state".
1395     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1396     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1397     } else {
1398     !!!cp (101);
1399     ## XML5: "Before attribute name state" (sic).
1400     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1401     }
1402 wakaba 1.1 !!!next-input-character;
1403     redo A;
1404     } elsif ($self->{nc} == 0x0026) { # &
1405     !!!cp (102);
1406 wakaba 1.11 ## XML5: Not defined yet.
1407    
1408 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1409     ## "entity in attribute value state". In this implementation, the
1410     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1411     ## implementation of the "consume a character reference" algorithm.
1412     $self->{entity_add} = 0x0027; # '
1413     $self->{prev_state} = $self->{state};
1414     $self->{state} = ENTITY_STATE;
1415     !!!next-input-character;
1416     redo A;
1417 wakaba 1.25 } elsif ($self->{is_xml} and
1418     $is_space->{$self->{nc}}) {
1419     !!!cp (103.1);
1420     $self->{ca}->{value} .= ' ';
1421     ## Stay in the state.
1422     !!!next-input-character;
1423     redo A;
1424 wakaba 1.1 } elsif ($self->{nc} == -1) {
1425     !!!parse-error (type => 'unclosed attribute value');
1426     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1427     !!!cp (103);
1428     $self->{last_stag_name} = $self->{ct}->{tag_name};
1429 wakaba 1.15
1430     $self->{state} = DATA_STATE;
1431     $self->{s_kwd} = '';
1432     ## reconsume
1433     !!!emit ($self->{ct}); # start tag
1434     redo A;
1435 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1436     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1437     if ($self->{ct}->{attributes}) {
1438     !!!cp (104);
1439     !!!parse-error (type => 'end tag attribute');
1440     } else {
1441     ## NOTE: This state should never be reached.
1442     !!!cp (105);
1443     }
1444 wakaba 1.15
1445     $self->{state} = DATA_STATE;
1446     $self->{s_kwd} = '';
1447     ## reconsume
1448     !!!emit ($self->{ct}); # end tag
1449     redo A;
1450     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1451     ## XML5: No parse error above; not defined yet.
1452     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1453     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1454     ## Reconsume.
1455     !!!emit ($self->{ct}); # ATTLIST
1456     redo A;
1457 wakaba 1.1 } else {
1458     die "$0: $self->{ct}->{type}: Unknown token type";
1459     }
1460     } else {
1461 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1462 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1463     !!!cp (106);
1464     ## XML5: Not a parse error.
1465     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1466     } else {
1467     !!!cp (106.1);
1468     }
1469 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1470     $self->{read_until}->($self->{ca}->{value},
1471 wakaba 1.25 qq['&<\x09\x0C\x20],
1472 wakaba 1.1 length $self->{ca}->{value});
1473    
1474     ## Stay in the state
1475     !!!next-input-character;
1476     redo A;
1477     }
1478     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1479 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1480    
1481 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1482 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1483     !!!cp (107.1);
1484     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1485     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1486     } else {
1487     !!!cp (107);
1488     ## XML5: "Tag attribute name before state".
1489     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1490     }
1491 wakaba 1.1 !!!next-input-character;
1492     redo A;
1493     } elsif ($self->{nc} == 0x0026) { # &
1494     !!!cp (108);
1495 wakaba 1.11
1496     ## XML5: Not defined yet.
1497    
1498 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1499     ## "entity in attribute value state". In this implementation, the
1500     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1501     ## implementation of the "consume a character reference" algorithm.
1502     $self->{entity_add} = -1;
1503     $self->{prev_state} = $self->{state};
1504     $self->{state} = ENTITY_STATE;
1505     !!!next-input-character;
1506     redo A;
1507     } elsif ($self->{nc} == 0x003E) { # >
1508     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1509     !!!cp (109);
1510     $self->{last_stag_name} = $self->{ct}->{tag_name};
1511 wakaba 1.15
1512     $self->{state} = DATA_STATE;
1513     $self->{s_kwd} = '';
1514     !!!next-input-character;
1515     !!!emit ($self->{ct}); # start tag
1516     redo A;
1517 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1518     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1519     if ($self->{ct}->{attributes}) {
1520     !!!cp (110);
1521     !!!parse-error (type => 'end tag attribute');
1522     } else {
1523     ## NOTE: This state should never be reached.
1524     !!!cp (111);
1525     }
1526 wakaba 1.15
1527     $self->{state} = DATA_STATE;
1528     $self->{s_kwd} = '';
1529     !!!next-input-character;
1530     !!!emit ($self->{ct}); # end tag
1531     redo A;
1532     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1533     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1534     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1535     !!!next-input-character;
1536     !!!emit ($self->{ct}); # ATTLIST
1537     redo A;
1538 wakaba 1.1 } else {
1539     die "$0: $self->{ct}->{type}: Unknown token type";
1540     }
1541     } elsif ($self->{nc} == -1) {
1542     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1543     !!!cp (112);
1544 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1545 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
1546 wakaba 1.15
1547     $self->{state} = DATA_STATE;
1548     $self->{s_kwd} = '';
1549     ## reconsume
1550     !!!emit ($self->{ct}); # start tag
1551     redo A;
1552 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1553 wakaba 1.15 !!!parse-error (type => 'unclosed tag');
1554 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1555     if ($self->{ct}->{attributes}) {
1556     !!!cp (113);
1557     !!!parse-error (type => 'end tag attribute');
1558     } else {
1559     ## NOTE: This state should never be reached.
1560     !!!cp (114);
1561     }
1562 wakaba 1.15
1563     $self->{state} = DATA_STATE;
1564     $self->{s_kwd} = '';
1565     ## reconsume
1566     !!!emit ($self->{ct}); # end tag
1567     redo A;
1568     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1569     !!!parse-error (type => 'unclosed md'); ## TODO: type
1570     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1571     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1572     ## Reconsume.
1573     !!!emit ($self->{ct}); # ATTLIST
1574     redo A;
1575 wakaba 1.1 } else {
1576     die "$0: $self->{ct}->{type}: Unknown token type";
1577     }
1578     } else {
1579     if ({
1580     0x0022 => 1, # "
1581     0x0027 => 1, # '
1582     0x003D => 1, # =
1583 wakaba 1.26 0x003C => 1, # <
1584 wakaba 1.1 }->{$self->{nc}}) {
1585     !!!cp (115);
1586 wakaba 1.11 ## XML5: Not a parse error.
1587 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1588     } else {
1589     !!!cp (116);
1590     }
1591     $self->{ca}->{value} .= chr ($self->{nc});
1592     $self->{read_until}->($self->{ca}->{value},
1593 wakaba 1.25 qq["'=& \x09\x0C>],
1594 wakaba 1.1 length $self->{ca}->{value});
1595    
1596     ## Stay in the state
1597     !!!next-input-character;
1598     redo A;
1599     }
1600     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1601     if ($is_space->{$self->{nc}}) {
1602     !!!cp (118);
1603     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1604     !!!next-input-character;
1605     redo A;
1606     } elsif ($self->{nc} == 0x003E) { # >
1607     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1608     !!!cp (119);
1609     $self->{last_stag_name} = $self->{ct}->{tag_name};
1610     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1611     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1612     if ($self->{ct}->{attributes}) {
1613     !!!cp (120);
1614     !!!parse-error (type => 'end tag attribute');
1615     } else {
1616     ## NOTE: This state should never be reached.
1617     !!!cp (121);
1618     }
1619     } else {
1620     die "$0: $self->{ct}->{type}: Unknown token type";
1621     }
1622     $self->{state} = DATA_STATE;
1623 wakaba 1.5 $self->{s_kwd} = '';
1624 wakaba 1.1 !!!next-input-character;
1625    
1626     !!!emit ($self->{ct}); # start tag or end tag
1627    
1628     redo A;
1629     } elsif ($self->{nc} == 0x002F) { # /
1630     !!!cp (122);
1631     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1632     !!!next-input-character;
1633     redo A;
1634     } elsif ($self->{nc} == -1) {
1635     !!!parse-error (type => 'unclosed tag');
1636     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1637     !!!cp (122.3);
1638     $self->{last_stag_name} = $self->{ct}->{tag_name};
1639     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1640     if ($self->{ct}->{attributes}) {
1641     !!!cp (122.1);
1642     !!!parse-error (type => 'end tag attribute');
1643     } else {
1644     ## NOTE: This state should never be reached.
1645     !!!cp (122.2);
1646     }
1647     } else {
1648     die "$0: $self->{ct}->{type}: Unknown token type";
1649     }
1650     $self->{state} = DATA_STATE;
1651 wakaba 1.5 $self->{s_kwd} = '';
1652 wakaba 1.1 ## Reconsume.
1653     !!!emit ($self->{ct}); # start tag or end tag
1654     redo A;
1655     } else {
1656     !!!cp ('124.1');
1657     !!!parse-error (type => 'no space between attributes');
1658     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1659     ## reconsume
1660     redo A;
1661     }
1662     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1663 wakaba 1.11 ## XML5: "Empty tag state".
1664    
1665 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1666     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1667     !!!cp ('124.2');
1668     !!!parse-error (type => 'nestc', token => $self->{ct});
1669     ## TODO: Different type than slash in start tag
1670     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1671     if ($self->{ct}->{attributes}) {
1672     !!!cp ('124.4');
1673     !!!parse-error (type => 'end tag attribute');
1674     } else {
1675     !!!cp ('124.5');
1676     }
1677     ## TODO: Test |<title></title/>|
1678     } else {
1679     !!!cp ('124.3');
1680     $self->{self_closing} = 1;
1681     }
1682    
1683     $self->{state} = DATA_STATE;
1684 wakaba 1.5 $self->{s_kwd} = '';
1685 wakaba 1.1 !!!next-input-character;
1686    
1687     !!!emit ($self->{ct}); # start tag or end tag
1688    
1689     redo A;
1690     } elsif ($self->{nc} == -1) {
1691     !!!parse-error (type => 'unclosed tag');
1692     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1693     !!!cp (124.7);
1694     $self->{last_stag_name} = $self->{ct}->{tag_name};
1695     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1696     if ($self->{ct}->{attributes}) {
1697     !!!cp (124.5);
1698     !!!parse-error (type => 'end tag attribute');
1699     } else {
1700     ## NOTE: This state should never be reached.
1701     !!!cp (124.6);
1702     }
1703     } else {
1704     die "$0: $self->{ct}->{type}: Unknown token type";
1705     }
1706 wakaba 1.11 ## XML5: "Tag attribute name before state".
1707 wakaba 1.1 $self->{state} = DATA_STATE;
1708 wakaba 1.5 $self->{s_kwd} = '';
1709 wakaba 1.1 ## Reconsume.
1710     !!!emit ($self->{ct}); # start tag or end tag
1711     redo A;
1712     } else {
1713     !!!cp ('124.4');
1714     !!!parse-error (type => 'nestc');
1715     ## TODO: This error type is wrong.
1716     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1717     ## Reconsume.
1718     redo A;
1719     }
1720     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1721 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1722    
1723 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1724     ## consumes characters on a one-by-one basis.
1725    
1726     if ($self->{nc} == 0x003E) { # >
1727 wakaba 1.13 if ($self->{in_subset}) {
1728     !!!cp (123);
1729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1730     } else {
1731     !!!cp (124);
1732     $self->{state} = DATA_STATE;
1733     $self->{s_kwd} = '';
1734     }
1735 wakaba 1.1 !!!next-input-character;
1736    
1737     !!!emit ($self->{ct}); # comment
1738     redo A;
1739     } elsif ($self->{nc} == -1) {
1740 wakaba 1.13 if ($self->{in_subset}) {
1741     !!!cp (125.1);
1742     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1743     } else {
1744     !!!cp (125);
1745     $self->{state} = DATA_STATE;
1746     $self->{s_kwd} = '';
1747     }
1748 wakaba 1.1 ## reconsume
1749    
1750     !!!emit ($self->{ct}); # comment
1751     redo A;
1752     } else {
1753     !!!cp (126);
1754     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1755     $self->{read_until}->($self->{ct}->{data},
1756     q[>],
1757     length $self->{ct}->{data});
1758    
1759     ## Stay in the state.
1760     !!!next-input-character;
1761     redo A;
1762     }
1763     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1764 wakaba 1.14 ## XML5: "Markup declaration state".
1765 wakaba 1.1
1766     if ($self->{nc} == 0x002D) { # -
1767     !!!cp (133);
1768     $self->{state} = MD_HYPHEN_STATE;
1769     !!!next-input-character;
1770     redo A;
1771     } elsif ($self->{nc} == 0x0044 or # D
1772     $self->{nc} == 0x0064) { # d
1773     ## ASCII case-insensitive.
1774     !!!cp (130);
1775     $self->{state} = MD_DOCTYPE_STATE;
1776 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1777 wakaba 1.1 !!!next-input-character;
1778     redo A;
1779 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1780     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1781     $self->{is_xml}) and
1782 wakaba 1.1 $self->{nc} == 0x005B) { # [
1783     !!!cp (135.4);
1784     $self->{state} = MD_CDATA_STATE;
1785 wakaba 1.12 $self->{kwd} = '[';
1786 wakaba 1.1 !!!next-input-character;
1787     redo A;
1788     } else {
1789     !!!cp (136);
1790     }
1791    
1792     !!!parse-error (type => 'bogus comment',
1793     line => $self->{line_prev},
1794     column => $self->{column_prev} - 1);
1795     ## Reconsume.
1796     $self->{state} = BOGUS_COMMENT_STATE;
1797     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1798     line => $self->{line_prev},
1799     column => $self->{column_prev} - 1,
1800     };
1801     redo A;
1802     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1803     if ($self->{nc} == 0x002D) { # -
1804     !!!cp (127);
1805     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1806     line => $self->{line_prev},
1807     column => $self->{column_prev} - 2,
1808     };
1809 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1810 wakaba 1.1 !!!next-input-character;
1811     redo A;
1812     } else {
1813     !!!cp (128);
1814     !!!parse-error (type => 'bogus comment',
1815     line => $self->{line_prev},
1816     column => $self->{column_prev} - 2);
1817     $self->{state} = BOGUS_COMMENT_STATE;
1818     ## Reconsume.
1819     $self->{ct} = {type => COMMENT_TOKEN,
1820     data => '-',
1821     line => $self->{line_prev},
1822     column => $self->{column_prev} - 2,
1823     };
1824     redo A;
1825     }
1826     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1827     ## ASCII case-insensitive.
1828     if ($self->{nc} == [
1829     undef,
1830     0x004F, # O
1831     0x0043, # C
1832     0x0054, # T
1833     0x0059, # Y
1834     0x0050, # P
1835 wakaba 1.12 ]->[length $self->{kwd}] or
1836 wakaba 1.1 $self->{nc} == [
1837     undef,
1838     0x006F, # o
1839     0x0063, # c
1840     0x0074, # t
1841     0x0079, # y
1842     0x0070, # p
1843 wakaba 1.12 ]->[length $self->{kwd}]) {
1844 wakaba 1.1 !!!cp (131);
1845     ## Stay in the state.
1846 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1847 wakaba 1.1 !!!next-input-character;
1848     redo A;
1849 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1850 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1851     $self->{nc} == 0x0065)) { # e
1852 wakaba 1.12 if ($self->{is_xml} and
1853     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1854 wakaba 1.10 !!!cp (129);
1855     ## XML5: case-sensitive.
1856     !!!parse-error (type => 'lowercase keyword', ## TODO
1857     text => 'DOCTYPE',
1858     line => $self->{line_prev},
1859     column => $self->{column_prev} - 5);
1860     } else {
1861     !!!cp (129.1);
1862     }
1863 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1864     $self->{ct} = {type => DOCTYPE_TOKEN,
1865     quirks => 1,
1866     line => $self->{line_prev},
1867     column => $self->{column_prev} - 7,
1868     };
1869     !!!next-input-character;
1870     redo A;
1871     } else {
1872     !!!cp (132);
1873     !!!parse-error (type => 'bogus comment',
1874     line => $self->{line_prev},
1875 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1876 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1877     ## Reconsume.
1878     $self->{ct} = {type => COMMENT_TOKEN,
1879 wakaba 1.12 data => $self->{kwd},
1880 wakaba 1.1 line => $self->{line_prev},
1881 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1882 wakaba 1.1 };
1883     redo A;
1884     }
1885     } elsif ($self->{state} == MD_CDATA_STATE) {
1886     if ($self->{nc} == {
1887     '[' => 0x0043, # C
1888     '[C' => 0x0044, # D
1889     '[CD' => 0x0041, # A
1890     '[CDA' => 0x0054, # T
1891     '[CDAT' => 0x0041, # A
1892 wakaba 1.12 }->{$self->{kwd}}) {
1893 wakaba 1.1 !!!cp (135.1);
1894     ## Stay in the state.
1895 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1896 wakaba 1.1 !!!next-input-character;
1897     redo A;
1898 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1899 wakaba 1.1 $self->{nc} == 0x005B) { # [
1900 wakaba 1.6 if ($self->{is_xml} and
1901     not $self->{tainted} and
1902     @{$self->{open_elements} or []} == 0) {
1903 wakaba 1.8 !!!cp (135.2);
1904 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1905     line => $self->{line_prev},
1906     column => $self->{column_prev} - 7);
1907     $self->{tainted} = 1;
1908 wakaba 1.8 } else {
1909     !!!cp (135.21);
1910 wakaba 1.6 }
1911    
1912 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1913     data => '',
1914     line => $self->{line_prev},
1915     column => $self->{column_prev} - 7};
1916     $self->{state} = CDATA_SECTION_STATE;
1917     !!!next-input-character;
1918     redo A;
1919     } else {
1920     !!!cp (135.3);
1921     !!!parse-error (type => 'bogus comment',
1922     line => $self->{line_prev},
1923 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1924 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1925     ## Reconsume.
1926     $self->{ct} = {type => COMMENT_TOKEN,
1927 wakaba 1.12 data => $self->{kwd},
1928 wakaba 1.1 line => $self->{line_prev},
1929 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1930 wakaba 1.1 };
1931     redo A;
1932     }
1933     } elsif ($self->{state} == COMMENT_START_STATE) {
1934     if ($self->{nc} == 0x002D) { # -
1935     !!!cp (137);
1936     $self->{state} = COMMENT_START_DASH_STATE;
1937     !!!next-input-character;
1938     redo A;
1939     } elsif ($self->{nc} == 0x003E) { # >
1940     !!!parse-error (type => 'bogus comment');
1941 wakaba 1.13 if ($self->{in_subset}) {
1942     !!!cp (138.1);
1943     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1944     } else {
1945     !!!cp (138);
1946     $self->{state} = DATA_STATE;
1947     $self->{s_kwd} = '';
1948     }
1949 wakaba 1.1 !!!next-input-character;
1950    
1951     !!!emit ($self->{ct}); # comment
1952    
1953     redo A;
1954     } elsif ($self->{nc} == -1) {
1955     !!!parse-error (type => 'unclosed comment');
1956 wakaba 1.13 if ($self->{in_subset}) {
1957     !!!cp (139.1);
1958     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1959     } else {
1960     !!!cp (139);
1961     $self->{state} = DATA_STATE;
1962     $self->{s_kwd} = '';
1963     }
1964 wakaba 1.1 ## reconsume
1965    
1966     !!!emit ($self->{ct}); # comment
1967    
1968     redo A;
1969     } else {
1970     !!!cp (140);
1971     $self->{ct}->{data} # comment
1972     .= chr ($self->{nc});
1973     $self->{state} = COMMENT_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     }
1977     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1978     if ($self->{nc} == 0x002D) { # -
1979     !!!cp (141);
1980     $self->{state} = COMMENT_END_STATE;
1981     !!!next-input-character;
1982     redo A;
1983     } elsif ($self->{nc} == 0x003E) { # >
1984     !!!parse-error (type => 'bogus comment');
1985 wakaba 1.13 if ($self->{in_subset}) {
1986     !!!cp (142.1);
1987     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1988     } else {
1989     !!!cp (142);
1990     $self->{state} = DATA_STATE;
1991     $self->{s_kwd} = '';
1992     }
1993 wakaba 1.1 !!!next-input-character;
1994    
1995     !!!emit ($self->{ct}); # comment
1996    
1997     redo A;
1998     } elsif ($self->{nc} == -1) {
1999     !!!parse-error (type => 'unclosed comment');
2000 wakaba 1.13 if ($self->{in_subset}) {
2001     !!!cp (143.1);
2002     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2003     } else {
2004     !!!cp (143);
2005     $self->{state} = DATA_STATE;
2006     $self->{s_kwd} = '';
2007     }
2008 wakaba 1.1 ## reconsume
2009    
2010     !!!emit ($self->{ct}); # comment
2011    
2012     redo A;
2013     } else {
2014     !!!cp (144);
2015     $self->{ct}->{data} # comment
2016     .= '-' . chr ($self->{nc});
2017     $self->{state} = COMMENT_STATE;
2018     !!!next-input-character;
2019     redo A;
2020     }
2021     } elsif ($self->{state} == COMMENT_STATE) {
2022 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2023    
2024 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2025     !!!cp (145);
2026     $self->{state} = COMMENT_END_DASH_STATE;
2027     !!!next-input-character;
2028     redo A;
2029     } elsif ($self->{nc} == -1) {
2030     !!!parse-error (type => 'unclosed comment');
2031 wakaba 1.13 if ($self->{in_subset}) {
2032     !!!cp (146.1);
2033     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2034     } else {
2035     !!!cp (146);
2036     $self->{state} = DATA_STATE;
2037     $self->{s_kwd} = '';
2038     }
2039 wakaba 1.1 ## reconsume
2040    
2041     !!!emit ($self->{ct}); # comment
2042    
2043     redo A;
2044     } else {
2045     !!!cp (147);
2046     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2047     $self->{read_until}->($self->{ct}->{data},
2048     q[-],
2049     length $self->{ct}->{data});
2050    
2051     ## Stay in the state
2052     !!!next-input-character;
2053     redo A;
2054     }
2055     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2056 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2057 wakaba 1.10
2058 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2059     !!!cp (148);
2060     $self->{state} = COMMENT_END_STATE;
2061     !!!next-input-character;
2062     redo A;
2063     } elsif ($self->{nc} == -1) {
2064     !!!parse-error (type => 'unclosed comment');
2065 wakaba 1.13 if ($self->{in_subset}) {
2066     !!!cp (149.1);
2067     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2068     } else {
2069     !!!cp (149);
2070     $self->{state} = DATA_STATE;
2071     $self->{s_kwd} = '';
2072     }
2073 wakaba 1.1 ## reconsume
2074    
2075     !!!emit ($self->{ct}); # comment
2076    
2077     redo A;
2078     } else {
2079     !!!cp (150);
2080     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2081     $self->{state} = COMMENT_STATE;
2082     !!!next-input-character;
2083     redo A;
2084     }
2085     } elsif ($self->{state} == COMMENT_END_STATE) {
2086 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2087    
2088 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2089 wakaba 1.13 if ($self->{in_subset}) {
2090     !!!cp (151.1);
2091     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2092     } else {
2093     !!!cp (151);
2094     $self->{state} = DATA_STATE;
2095     $self->{s_kwd} = '';
2096     }
2097 wakaba 1.1 !!!next-input-character;
2098    
2099     !!!emit ($self->{ct}); # comment
2100    
2101     redo A;
2102     } elsif ($self->{nc} == 0x002D) { # -
2103     !!!cp (152);
2104 wakaba 1.10 ## XML5: Not a parse error.
2105 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2106     line => $self->{line_prev},
2107     column => $self->{column_prev});
2108     $self->{ct}->{data} .= '-'; # comment
2109     ## Stay in the state
2110     !!!next-input-character;
2111     redo A;
2112     } elsif ($self->{nc} == -1) {
2113     !!!parse-error (type => 'unclosed comment');
2114 wakaba 1.13 if ($self->{in_subset}) {
2115     !!!cp (153.1);
2116     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2117     } else {
2118     !!!cp (153);
2119     $self->{state} = DATA_STATE;
2120     $self->{s_kwd} = '';
2121     }
2122 wakaba 1.1 ## reconsume
2123    
2124     !!!emit ($self->{ct}); # comment
2125    
2126     redo A;
2127     } else {
2128     !!!cp (154);
2129     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2130     $self->{state} = COMMENT_STATE;
2131     !!!next-input-character;
2132     redo A;
2133     }
2134     } elsif ($self->{state} == DOCTYPE_STATE) {
2135     if ($is_space->{$self->{nc}}) {
2136     !!!cp (155);
2137     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2138     !!!next-input-character;
2139     redo A;
2140 wakaba 1.28 } elsif ($self->{nc} == -1) {
2141     !!!cp (155.1);
2142     !!!parse-error (type => 'unclosed DOCTYPE');
2143     $self->{ct}->{quirks} = 1;
2144    
2145     $self->{state} = DATA_STATE;
2146     ## Reconsume.
2147     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2148    
2149     redo A;
2150 wakaba 1.1 } else {
2151     !!!cp (156);
2152 wakaba 1.28 ## XML5: Switch to the bogus comment state.
2153 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2154     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2155     ## reconsume
2156     redo A;
2157     }
2158     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2159 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2160    
2161 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2162     !!!cp (157);
2163     ## Stay in the state
2164     !!!next-input-character;
2165     redo A;
2166     } elsif ($self->{nc} == 0x003E) { # >
2167     !!!cp (158);
2168 wakaba 1.12 ## XML5: No parse error.
2169 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2170     $self->{state} = DATA_STATE;
2171 wakaba 1.5 $self->{s_kwd} = '';
2172 wakaba 1.1 !!!next-input-character;
2173    
2174     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2175    
2176     redo A;
2177 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2178     !!!cp (158.1);
2179     $self->{ct}->{name} # DOCTYPE
2180     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2181     delete $self->{ct}->{quirks};
2182     $self->{state} = DOCTYPE_NAME_STATE;
2183     !!!next-input-character;
2184     redo A;
2185 wakaba 1.1 } elsif ($self->{nc} == -1) {
2186     !!!cp (159);
2187     !!!parse-error (type => 'no DOCTYPE name');
2188     $self->{state} = DATA_STATE;
2189 wakaba 1.5 $self->{s_kwd} = '';
2190 wakaba 1.1 ## reconsume
2191    
2192     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2193    
2194     redo A;
2195 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2196     !!!cp (159.1);
2197     !!!parse-error (type => 'no DOCTYPE name');
2198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2199 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2200     $self->{in_subset} = 1;
2201 wakaba 1.12 !!!next-input-character;
2202 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2203 wakaba 1.12 redo A;
2204 wakaba 1.1 } else {
2205     !!!cp (160);
2206     $self->{ct}->{name} = chr $self->{nc};
2207     delete $self->{ct}->{quirks};
2208     $self->{state} = DOCTYPE_NAME_STATE;
2209     !!!next-input-character;
2210     redo A;
2211     }
2212     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2213 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2214    
2215     ## ISSUE: Redundant "First," in the spec.
2216    
2217 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2218     !!!cp (161);
2219     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2220     !!!next-input-character;
2221     redo A;
2222     } elsif ($self->{nc} == 0x003E) { # >
2223     !!!cp (162);
2224     $self->{state} = DATA_STATE;
2225 wakaba 1.5 $self->{s_kwd} = '';
2226 wakaba 1.1 !!!next-input-character;
2227    
2228     !!!emit ($self->{ct}); # DOCTYPE
2229    
2230     redo A;
2231 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2232     !!!cp (162.1);
2233     $self->{ct}->{name} # DOCTYPE
2234     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2235     delete $self->{ct}->{quirks};
2236     ## Stay in the state.
2237     !!!next-input-character;
2238     redo A;
2239 wakaba 1.1 } elsif ($self->{nc} == -1) {
2240     !!!cp (163);
2241     !!!parse-error (type => 'unclosed DOCTYPE');
2242     $self->{state} = DATA_STATE;
2243 wakaba 1.5 $self->{s_kwd} = '';
2244 wakaba 1.1 ## reconsume
2245    
2246     $self->{ct}->{quirks} = 1;
2247     !!!emit ($self->{ct}); # DOCTYPE
2248    
2249     redo A;
2250 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2251     !!!cp (163.1);
2252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2253 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2254     $self->{in_subset} = 1;
2255 wakaba 1.12 !!!next-input-character;
2256 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2257 wakaba 1.12 redo A;
2258 wakaba 1.1 } else {
2259     !!!cp (164);
2260 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2261     ## Stay in the state.
2262 wakaba 1.1 !!!next-input-character;
2263     redo A;
2264     }
2265     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2266 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2267     ## state", but implemented differently.
2268    
2269 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2270     !!!cp (165);
2271     ## Stay in the state
2272     !!!next-input-character;
2273     redo A;
2274     } elsif ($self->{nc} == 0x003E) { # >
2275 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2276     !!!cp (166);
2277     $self->{state} = DATA_STATE;
2278     $self->{s_kwd} = '';
2279     } else {
2280     !!!cp (166.1);
2281     !!!parse-error (type => 'no md def'); ## TODO: type
2282     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2283     }
2284    
2285 wakaba 1.1 !!!next-input-character;
2286 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2287 wakaba 1.1 redo A;
2288     } elsif ($self->{nc} == -1) {
2289 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2290     !!!cp (167);
2291     !!!parse-error (type => 'unclosed DOCTYPE');
2292     $self->{state} = DATA_STATE;
2293     $self->{s_kwd} = '';
2294     $self->{ct}->{quirks} = 1;
2295     } else {
2296     !!!cp (167.12);
2297     !!!parse-error (type => 'unclosed md'); ## TODO: type
2298     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2299     }
2300    
2301     ## Reconsume.
2302     !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2303 wakaba 1.1 redo A;
2304     } elsif ($self->{nc} == 0x0050 or # P
2305     $self->{nc} == 0x0070) { # p
2306 wakaba 1.12 !!!cp (167.1);
2307 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2308 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2309 wakaba 1.1 !!!next-input-character;
2310     redo A;
2311     } elsif ($self->{nc} == 0x0053 or # S
2312     $self->{nc} == 0x0073) { # s
2313 wakaba 1.12 !!!cp (167.2);
2314 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2315 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2316     !!!next-input-character;
2317     redo A;
2318 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
2319     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2320     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2321     !!!cp (167.21);
2322     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2323     $self->{ct}->{value} = ''; # ENTITY
2324     !!!next-input-character;
2325     redo A;
2326     } elsif ($self->{nc} == 0x0027 and # '
2327     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2328     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2329     !!!cp (167.22);
2330     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2331     $self->{ct}->{value} = ''; # ENTITY
2332     !!!next-input-character;
2333     redo A;
2334 wakaba 1.16 } elsif ($self->{is_xml} and
2335     $self->{ct}->{type} == DOCTYPE_TOKEN and
2336     $self->{nc} == 0x005B) { # [
2337 wakaba 1.12 !!!cp (167.3);
2338     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2339     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2340 wakaba 1.13 $self->{in_subset} = 1;
2341 wakaba 1.1 !!!next-input-character;
2342 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2343 wakaba 1.1 redo A;
2344     } else {
2345 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2346    
2347     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2348     !!!cp (180);
2349     $self->{ct}->{quirks} = 1;
2350     $self->{state} = BOGUS_DOCTYPE_STATE;
2351     } else {
2352     !!!cp (180.1);
2353     $self->{state} = BOGUS_MD_STATE;
2354     }
2355 wakaba 1.1
2356     !!!next-input-character;
2357     redo A;
2358     }
2359     } elsif ($self->{state} == PUBLIC_STATE) {
2360     ## ASCII case-insensitive
2361     if ($self->{nc} == [
2362     undef,
2363     0x0055, # U
2364     0x0042, # B
2365     0x004C, # L
2366     0x0049, # I
2367 wakaba 1.12 ]->[length $self->{kwd}] or
2368 wakaba 1.1 $self->{nc} == [
2369     undef,
2370     0x0075, # u
2371     0x0062, # b
2372     0x006C, # l
2373     0x0069, # i
2374 wakaba 1.12 ]->[length $self->{kwd}]) {
2375 wakaba 1.1 !!!cp (175);
2376     ## Stay in the state.
2377 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2378 wakaba 1.1 !!!next-input-character;
2379     redo A;
2380 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2381 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2382     $self->{nc} == 0x0063)) { # c
2383 wakaba 1.12 if ($self->{is_xml} and
2384     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2385     !!!cp (168.1);
2386     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2387     text => 'PUBLIC',
2388     line => $self->{line_prev},
2389     column => $self->{column_prev} - 4);
2390     } else {
2391     !!!cp (168);
2392     }
2393 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2394     !!!next-input-character;
2395     redo A;
2396     } else {
2397 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2398 wakaba 1.1 line => $self->{line_prev},
2399 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2400 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2401     !!!cp (169);
2402     $self->{ct}->{quirks} = 1;
2403     $self->{state} = BOGUS_DOCTYPE_STATE;
2404     } else {
2405     !!!cp (169.1);
2406     $self->{state} = BOGUS_MD_STATE;
2407     }
2408 wakaba 1.1 ## Reconsume.
2409     redo A;
2410     }
2411     } elsif ($self->{state} == SYSTEM_STATE) {
2412     ## ASCII case-insensitive
2413     if ($self->{nc} == [
2414     undef,
2415     0x0059, # Y
2416     0x0053, # S
2417     0x0054, # T
2418     0x0045, # E
2419 wakaba 1.12 ]->[length $self->{kwd}] or
2420 wakaba 1.1 $self->{nc} == [
2421     undef,
2422     0x0079, # y
2423     0x0073, # s
2424     0x0074, # t
2425     0x0065, # e
2426 wakaba 1.12 ]->[length $self->{kwd}]) {
2427 wakaba 1.1 !!!cp (170);
2428     ## Stay in the state.
2429 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2430 wakaba 1.1 !!!next-input-character;
2431     redo A;
2432 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2433 wakaba 1.1 ($self->{nc} == 0x004D or # M
2434     $self->{nc} == 0x006D)) { # m
2435 wakaba 1.12 if ($self->{is_xml} and
2436     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2437     !!!cp (171.1);
2438     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2439     text => 'SYSTEM',
2440     line => $self->{line_prev},
2441     column => $self->{column_prev} - 4);
2442     } else {
2443     !!!cp (171);
2444     }
2445 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2446     !!!next-input-character;
2447     redo A;
2448     } else {
2449 wakaba 1.16 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2450 wakaba 1.1 line => $self->{line_prev},
2451 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2452 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2453     !!!cp (172);
2454     $self->{ct}->{quirks} = 1;
2455     $self->{state} = BOGUS_DOCTYPE_STATE;
2456     } else {
2457     !!!cp (172.1);
2458     $self->{state} = BOGUS_MD_STATE;
2459     }
2460 wakaba 1.1 ## Reconsume.
2461     redo A;
2462     }
2463     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2464     if ($is_space->{$self->{nc}}) {
2465     !!!cp (181);
2466     ## Stay in the state
2467     !!!next-input-character;
2468     redo A;
2469     } elsif ($self->{nc} eq 0x0022) { # "
2470     !!!cp (182);
2471     $self->{ct}->{pubid} = ''; # DOCTYPE
2472     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2473     !!!next-input-character;
2474     redo A;
2475     } elsif ($self->{nc} eq 0x0027) { # '
2476     !!!cp (183);
2477     $self->{ct}->{pubid} = ''; # DOCTYPE
2478     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2479     !!!next-input-character;
2480     redo A;
2481     } elsif ($self->{nc} eq 0x003E) { # >
2482     !!!parse-error (type => 'no PUBLIC literal');
2483 wakaba 1.16
2484     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2485     !!!cp (184);
2486     $self->{state} = DATA_STATE;
2487     $self->{s_kwd} = '';
2488     $self->{ct}->{quirks} = 1;
2489     } else {
2490     !!!cp (184.1);
2491     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2492     }
2493    
2494 wakaba 1.1 !!!next-input-character;
2495 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2496 wakaba 1.1 redo A;
2497     } elsif ($self->{nc} == -1) {
2498 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499     !!!cp (185);
2500     !!!parse-error (type => 'unclosed DOCTYPE');
2501     $self->{state} = DATA_STATE;
2502     $self->{s_kwd} = '';
2503     $self->{ct}->{quirks} = 1;
2504     } else {
2505     !!!cp (185.1);
2506     !!!parse-error (type => 'unclosed md'); ## TODO: type
2507     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2508     }
2509    
2510 wakaba 1.1 ## reconsume
2511     !!!emit ($self->{ct}); # DOCTYPE
2512     redo A;
2513 wakaba 1.16 } elsif ($self->{is_xml} and
2514     $self->{ct}->{type} == DOCTYPE_TOKEN and
2515     $self->{nc} == 0x005B) { # [
2516 wakaba 1.12 !!!cp (186.1);
2517     !!!parse-error (type => 'no PUBLIC literal');
2518     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2519     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2520 wakaba 1.13 $self->{in_subset} = 1;
2521 wakaba 1.12 !!!next-input-character;
2522 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2523 wakaba 1.12 redo A;
2524 wakaba 1.1 } else {
2525     !!!parse-error (type => 'string after PUBLIC');
2526    
2527 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2528     !!!cp (186);
2529     $self->{ct}->{quirks} = 1;
2530     $self->{state} = BOGUS_DOCTYPE_STATE;
2531     } else {
2532     !!!cp (186.2);
2533     $self->{state} = BOGUS_MD_STATE;
2534     }
2535    
2536 wakaba 1.1 !!!next-input-character;
2537     redo A;
2538     }
2539     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2540     if ($self->{nc} == 0x0022) { # "
2541     !!!cp (187);
2542     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2543     !!!next-input-character;
2544     redo A;
2545     } elsif ($self->{nc} == 0x003E) { # >
2546     !!!parse-error (type => 'unclosed PUBLIC literal');
2547    
2548 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2549     !!!cp (188);
2550     $self->{state} = DATA_STATE;
2551     $self->{s_kwd} = '';
2552     $self->{ct}->{quirks} = 1;
2553     } else {
2554     !!!cp (188.1);
2555     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2556     }
2557    
2558 wakaba 1.1 !!!next-input-character;
2559 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2560 wakaba 1.1 redo A;
2561     } elsif ($self->{nc} == -1) {
2562     !!!parse-error (type => 'unclosed PUBLIC literal');
2563    
2564 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2565     !!!cp (189);
2566     $self->{state} = DATA_STATE;
2567     $self->{s_kwd} = '';
2568     $self->{ct}->{quirks} = 1;
2569     } else {
2570     !!!cp (189.1);
2571     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2572     }
2573    
2574     ## Reconsume.
2575 wakaba 1.1 !!!emit ($self->{ct}); # DOCTYPE
2576     redo A;
2577     } else {
2578     !!!cp (190);
2579 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2580 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2581     length $self->{ct}->{pubid});
2582    
2583     ## Stay in the state
2584     !!!next-input-character;
2585     redo A;
2586     }
2587     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2588     if ($self->{nc} == 0x0027) { # '
2589     !!!cp (191);
2590     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2591     !!!next-input-character;
2592     redo A;
2593     } elsif ($self->{nc} == 0x003E) { # >
2594     !!!parse-error (type => 'unclosed PUBLIC literal');
2595    
2596 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2597     !!!cp (192);
2598     $self->{state} = DATA_STATE;
2599     $self->{s_kwd} = '';
2600     $self->{ct}->{quirks} = 1;
2601     } else {
2602     !!!cp (192.1);
2603     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2604     }
2605    
2606 wakaba 1.1 !!!next-input-character;
2607 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2608 wakaba 1.1 redo A;
2609     } elsif ($self->{nc} == -1) {
2610     !!!parse-error (type => 'unclosed PUBLIC literal');
2611    
2612 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2613     !!!cp (193);
2614     $self->{state} = DATA_STATE;
2615     $self->{s_kwd} = '';
2616     $self->{ct}->{quirks} = 1;
2617     } else {
2618     !!!cp (193.1);
2619     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2620     }
2621    
2622 wakaba 1.1 ## reconsume
2623 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2624 wakaba 1.1 redo A;
2625     } else {
2626     !!!cp (194);
2627 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2628 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2629     length $self->{ct}->{pubid});
2630    
2631     ## Stay in the state
2632     !!!next-input-character;
2633     redo A;
2634     }
2635     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2636     if ($is_space->{$self->{nc}}) {
2637     !!!cp (195);
2638     ## Stay in the state
2639     !!!next-input-character;
2640     redo A;
2641     } elsif ($self->{nc} == 0x0022) { # "
2642     !!!cp (196);
2643 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2644 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2645     !!!next-input-character;
2646     redo A;
2647     } elsif ($self->{nc} == 0x0027) { # '
2648     !!!cp (197);
2649 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2650 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2651     !!!next-input-character;
2652     redo A;
2653     } elsif ($self->{nc} == 0x003E) { # >
2654 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2655     if ($self->{is_xml}) {
2656     !!!cp (198.1);
2657     !!!parse-error (type => 'no SYSTEM literal');
2658     } else {
2659     !!!cp (198);
2660     }
2661     $self->{state} = DATA_STATE;
2662     $self->{s_kwd} = '';
2663 wakaba 1.12 } else {
2664 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2665     !!!cp (198.2);
2666     } else {
2667     !!!cp (198.3);
2668     !!!parse-error (type => 'no SYSTEM literal');
2669     }
2670     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2671 wakaba 1.12 }
2672 wakaba 1.16
2673 wakaba 1.1 !!!next-input-character;
2674 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2675 wakaba 1.1 redo A;
2676     } elsif ($self->{nc} == -1) {
2677 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2678     !!!cp (199);
2679     !!!parse-error (type => 'unclosed DOCTYPE');
2680    
2681     $self->{state} = DATA_STATE;
2682     $self->{s_kwd} = '';
2683     $self->{ct}->{quirks} = 1;
2684     } else {
2685     !!!parse-error (type => 'unclosed md'); ## TODO: type
2686     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2687     }
2688    
2689 wakaba 1.1 ## reconsume
2690 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2691 wakaba 1.1 redo A;
2692 wakaba 1.16 } elsif ($self->{is_xml} and
2693     $self->{ct}->{type} == DOCTYPE_TOKEN and
2694     $self->{nc} == 0x005B) { # [
2695 wakaba 1.12 !!!cp (200.1);
2696     !!!parse-error (type => 'no SYSTEM literal');
2697     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2698     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2699 wakaba 1.13 $self->{in_subset} = 1;
2700 wakaba 1.12 !!!next-input-character;
2701 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2702 wakaba 1.12 redo A;
2703 wakaba 1.1 } else {
2704     !!!parse-error (type => 'string after PUBLIC literal');
2705    
2706 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2707     !!!cp (200);
2708     $self->{ct}->{quirks} = 1;
2709     $self->{state} = BOGUS_DOCTYPE_STATE;
2710     } else {
2711     !!!cp (200.2);
2712     $self->{state} = BOGUS_MD_STATE;
2713     }
2714    
2715 wakaba 1.1 !!!next-input-character;
2716     redo A;
2717     }
2718     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2719     if ($is_space->{$self->{nc}}) {
2720     !!!cp (201);
2721     ## Stay in the state
2722     !!!next-input-character;
2723     redo A;
2724     } elsif ($self->{nc} == 0x0022) { # "
2725     !!!cp (202);
2726     $self->{ct}->{sysid} = ''; # DOCTYPE
2727     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2728     !!!next-input-character;
2729     redo A;
2730     } elsif ($self->{nc} == 0x0027) { # '
2731     !!!cp (203);
2732     $self->{ct}->{sysid} = ''; # DOCTYPE
2733     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2734     !!!next-input-character;
2735     redo A;
2736     } elsif ($self->{nc} == 0x003E) { # >
2737     !!!parse-error (type => 'no SYSTEM literal');
2738     !!!next-input-character;
2739    
2740 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2741     !!!cp (204);
2742     $self->{state} = DATA_STATE;
2743     $self->{s_kwd} = '';
2744     $self->{ct}->{quirks} = 1;
2745     } else {
2746     !!!cp (204.1);
2747     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2748     }
2749 wakaba 1.1
2750 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2751 wakaba 1.1 redo A;
2752     } elsif ($self->{nc} == -1) {
2753 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2754     !!!cp (205);
2755     !!!parse-error (type => 'unclosed DOCTYPE');
2756     $self->{state} = DATA_STATE;
2757     $self->{s_kwd} = '';
2758     $self->{ct}->{quirks} = 1;
2759     } else {
2760     !!!cp (205.1);
2761     !!!parse-error (type => 'unclosed md'); ## TODO: type
2762     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2763     }
2764    
2765 wakaba 1.1 ## reconsume
2766 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2767 wakaba 1.1 redo A;
2768 wakaba 1.16 } elsif ($self->{is_xml} and
2769     $self->{ct}->{type} == DOCTYPE_TOKEN and
2770     $self->{nc} == 0x005B) { # [
2771 wakaba 1.12 !!!cp (206.1);
2772     !!!parse-error (type => 'no SYSTEM literal');
2773    
2774     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2775     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2776 wakaba 1.13 $self->{in_subset} = 1;
2777 wakaba 1.12 !!!next-input-character;
2778 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2779 wakaba 1.12 redo A;
2780 wakaba 1.1 } else {
2781     !!!parse-error (type => 'string after SYSTEM');
2782    
2783 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2784     !!!cp (206);
2785     $self->{ct}->{quirks} = 1;
2786     $self->{state} = BOGUS_DOCTYPE_STATE;
2787     } else {
2788     !!!cp (206.2);
2789     $self->{state} = BOGUS_MD_STATE;
2790     }
2791    
2792 wakaba 1.1 !!!next-input-character;
2793     redo A;
2794     }
2795     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2796     if ($self->{nc} == 0x0022) { # "
2797     !!!cp (207);
2798     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2799     !!!next-input-character;
2800     redo A;
2801 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2802 wakaba 1.1 !!!parse-error (type => 'unclosed SYSTEM literal');
2803    
2804 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2805     !!!cp (208);
2806     $self->{state} = DATA_STATE;
2807     $self->{s_kwd} = '';
2808     $self->{ct}->{quirks} = 1;
2809     } else {
2810     !!!cp (208.1);
2811     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2812     }
2813    
2814 wakaba 1.1 !!!next-input-character;
2815 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2816 wakaba 1.1 redo A;
2817     } elsif ($self->{nc} == -1) {
2818     !!!parse-error (type => 'unclosed SYSTEM literal');
2819    
2820 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2821     !!!cp (209);
2822     $self->{state} = DATA_STATE;
2823     $self->{s_kwd} = '';
2824     $self->{ct}->{quirks} = 1;
2825     } else {
2826     !!!cp (209.1);
2827     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2828     }
2829    
2830 wakaba 1.1 ## reconsume
2831 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2832 wakaba 1.1 redo A;
2833     } else {
2834     !!!cp (210);
2835 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2836 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2837     length $self->{ct}->{sysid});
2838    
2839     ## Stay in the state
2840     !!!next-input-character;
2841     redo A;
2842     }
2843     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2844     if ($self->{nc} == 0x0027) { # '
2845     !!!cp (211);
2846     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2847     !!!next-input-character;
2848     redo A;
2849 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2850 wakaba 1.1 !!!cp (212);
2851     !!!parse-error (type => 'unclosed SYSTEM literal');
2852    
2853     $self->{state} = DATA_STATE;
2854 wakaba 1.5 $self->{s_kwd} = '';
2855 wakaba 1.1 !!!next-input-character;
2856    
2857     $self->{ct}->{quirks} = 1;
2858     !!!emit ($self->{ct}); # DOCTYPE
2859    
2860     redo A;
2861     } elsif ($self->{nc} == -1) {
2862     !!!parse-error (type => 'unclosed SYSTEM literal');
2863    
2864 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2865     !!!cp (213);
2866     $self->{state} = DATA_STATE;
2867     $self->{s_kwd} = '';
2868     $self->{ct}->{quirks} = 1;
2869     } else {
2870     !!!cp (213.1);
2871     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2872     }
2873    
2874 wakaba 1.1 ## reconsume
2875 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2876 wakaba 1.1 redo A;
2877     } else {
2878     !!!cp (214);
2879 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2880 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2881     length $self->{ct}->{sysid});
2882    
2883     ## Stay in the state
2884     !!!next-input-character;
2885     redo A;
2886     }
2887     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2888     if ($is_space->{$self->{nc}}) {
2889 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2890     !!!cp (215.1);
2891     $self->{state} = BEFORE_NDATA_STATE;
2892     } else {
2893     !!!cp (215);
2894     ## Stay in the state
2895     }
2896 wakaba 1.1 !!!next-input-character;
2897     redo A;
2898     } elsif ($self->{nc} == 0x003E) { # >
2899 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900     !!!cp (216);
2901     $self->{state} = DATA_STATE;
2902     $self->{s_kwd} = '';
2903     } else {
2904     !!!cp (216.1);
2905     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2906     }
2907    
2908 wakaba 1.1 !!!next-input-character;
2909 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2910 wakaba 1.1 redo A;
2911 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2912     ($self->{nc} == 0x004E or # N
2913     $self->{nc} == 0x006E)) { # n
2914     !!!cp (216.2);
2915     !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2916     $self->{state} = NDATA_STATE;
2917     $self->{kwd} = chr $self->{nc};
2918     !!!next-input-character;
2919     redo A;
2920 wakaba 1.1 } elsif ($self->{nc} == -1) {
2921 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2922     !!!cp (217);
2923     !!!parse-error (type => 'unclosed DOCTYPE');
2924     $self->{state} = DATA_STATE;
2925     $self->{s_kwd} = '';
2926     $self->{ct}->{quirks} = 1;
2927     } else {
2928     !!!cp (217.1);
2929     !!!parse-error (type => 'unclosed md'); ## TODO: type
2930     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2931     }
2932    
2933 wakaba 1.1 ## reconsume
2934 wakaba 1.16 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2935 wakaba 1.1 redo A;
2936 wakaba 1.16 } elsif ($self->{is_xml} and
2937     $self->{ct}->{type} == DOCTYPE_TOKEN and
2938     $self->{nc} == 0x005B) { # [
2939 wakaba 1.12 !!!cp (218.1);
2940     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2942 wakaba 1.13 $self->{in_subset} = 1;
2943 wakaba 1.12 !!!next-input-character;
2944 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2945 wakaba 1.12 redo A;
2946 wakaba 1.1 } else {
2947     !!!parse-error (type => 'string after SYSTEM literal');
2948    
2949 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2950     !!!cp (218);
2951     #$self->{ct}->{quirks} = 1;
2952     $self->{state} = BOGUS_DOCTYPE_STATE;
2953     } else {
2954     !!!cp (218.2);
2955     $self->{state} = BOGUS_MD_STATE;
2956     }
2957    
2958 wakaba 1.1 !!!next-input-character;
2959     redo A;
2960     }
2961 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2962     if ($is_space->{$self->{nc}}) {
2963     !!!cp (218.3);
2964     ## Stay in the state.
2965     !!!next-input-character;
2966     redo A;
2967     } elsif ($self->{nc} == 0x003E) { # >
2968     !!!cp (218.4);
2969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2970     !!!next-input-character;
2971     !!!emit ($self->{ct}); # ENTITY
2972     redo A;
2973     } elsif ($self->{nc} == 0x004E or # N
2974     $self->{nc} == 0x006E) { # n
2975     !!!cp (218.5);
2976     $self->{state} = NDATA_STATE;
2977     $self->{kwd} = chr $self->{nc};
2978     !!!next-input-character;
2979     redo A;
2980     } elsif ($self->{nc} == -1) {
2981     !!!cp (218.6);
2982     !!!parse-error (type => 'unclosed md'); ## TODO: type
2983     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2984     ## reconsume
2985     !!!emit ($self->{ct}); # ENTITY
2986     redo A;
2987     } else {
2988     !!!cp (218.7);
2989     !!!parse-error (type => 'string after SYSTEM literal');
2990     $self->{state} = BOGUS_MD_STATE;
2991     !!!next-input-character;
2992     redo A;
2993     }
2994 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2995     if ($self->{nc} == 0x003E) { # >
2996     !!!cp (219);
2997     $self->{state} = DATA_STATE;
2998 wakaba 1.5 $self->{s_kwd} = '';
2999 wakaba 1.1 !!!next-input-character;
3000    
3001     !!!emit ($self->{ct}); # DOCTYPE
3002    
3003     redo A;
3004 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3005 wakaba 1.13 !!!cp (220.1);
3006     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3007     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3008     $self->{in_subset} = 1;
3009     !!!next-input-character;
3010     !!!emit ($self->{ct}); # DOCTYPE
3011     redo A;
3012 wakaba 1.1 } elsif ($self->{nc} == -1) {
3013     !!!cp (220);
3014     $self->{state} = DATA_STATE;
3015 wakaba 1.5 $self->{s_kwd} = '';
3016 wakaba 1.1 ## reconsume
3017    
3018     !!!emit ($self->{ct}); # DOCTYPE
3019    
3020     redo A;
3021     } else {
3022     !!!cp (221);
3023     my $s = '';
3024 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
3025 wakaba 1.1
3026     ## Stay in the state
3027     !!!next-input-character;
3028     redo A;
3029     }
3030     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3031     ## NOTE: "CDATA section state" in the spec is jointly implemented
3032     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3033     ## and |CDATA_SECTION_MSE2_STATE|.
3034 wakaba 1.10
3035     ## XML5: "CDATA state".
3036 wakaba 1.1
3037     if ($self->{nc} == 0x005D) { # ]
3038     !!!cp (221.1);
3039     $self->{state} = CDATA_SECTION_MSE1_STATE;
3040     !!!next-input-character;
3041     redo A;
3042     } elsif ($self->{nc} == -1) {
3043 wakaba 1.6 if ($self->{is_xml}) {
3044 wakaba 1.8 !!!cp (221.11);
3045 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
3046 wakaba 1.8 } else {
3047     !!!cp (221.12);
3048 wakaba 1.6 }
3049    
3050 wakaba 1.1 $self->{state} = DATA_STATE;
3051 wakaba 1.5 $self->{s_kwd} = '';
3052 wakaba 1.10 ## Reconsume.
3053 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3054     !!!cp (221.2);
3055     !!!emit ($self->{ct}); # character
3056     } else {
3057     !!!cp (221.3);
3058     ## No token to emit. $self->{ct} is discarded.
3059     }
3060     redo A;
3061     } else {
3062     !!!cp (221.4);
3063     $self->{ct}->{data} .= chr $self->{nc};
3064     $self->{read_until}->($self->{ct}->{data},
3065     q<]>,
3066     length $self->{ct}->{data});
3067    
3068     ## Stay in the state.
3069     !!!next-input-character;
3070     redo A;
3071     }
3072    
3073     ## ISSUE: "text tokens" in spec.
3074     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3075 wakaba 1.10 ## XML5: "CDATA bracket state".
3076    
3077 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3078     !!!cp (221.5);
3079     $self->{state} = CDATA_SECTION_MSE2_STATE;
3080     !!!next-input-character;
3081     redo A;
3082     } else {
3083     !!!cp (221.6);
3084 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3085 wakaba 1.1 $self->{ct}->{data} .= ']';
3086 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3087 wakaba 1.1 ## Reconsume.
3088     redo A;
3089     }
3090     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3091 wakaba 1.10 ## XML5: "CDATA end state".
3092    
3093 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3094     $self->{state} = DATA_STATE;
3095 wakaba 1.5 $self->{s_kwd} = '';
3096 wakaba 1.1 !!!next-input-character;
3097     if (length $self->{ct}->{data}) { # character
3098     !!!cp (221.7);
3099     !!!emit ($self->{ct}); # character
3100     } else {
3101     !!!cp (221.8);
3102     ## No token to emit. $self->{ct} is discarded.
3103     }
3104     redo A;
3105     } elsif ($self->{nc} == 0x005D) { # ]
3106     !!!cp (221.9); # character
3107     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3108     ## Stay in the state.
3109     !!!next-input-character;
3110     redo A;
3111     } else {
3112     !!!cp (221.11);
3113     $self->{ct}->{data} .= ']]'; # character
3114     $self->{state} = CDATA_SECTION_STATE;
3115 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3116 wakaba 1.1 redo A;
3117     }
3118     } elsif ($self->{state} == ENTITY_STATE) {
3119     if ($is_space->{$self->{nc}} or
3120     {
3121     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3122     $self->{entity_add} => 1,
3123     }->{$self->{nc}}) {
3124 wakaba 1.22 if ($self->{is_xml}) {
3125     !!!cp (1001.1);
3126     !!!parse-error (type => 'bare ero',
3127     line => $self->{line_prev},
3128     column => $self->{column_prev}
3129     + ($self->{nc} == -1 ? 1 : 0));
3130     } else {
3131     !!!cp (1001);
3132     ## No error
3133     }
3134 wakaba 1.1 ## Don't consume
3135     ## Return nothing.
3136     #
3137     } elsif ($self->{nc} == 0x0023) { # #
3138     !!!cp (999);
3139     $self->{state} = ENTITY_HASH_STATE;
3140 wakaba 1.12 $self->{kwd} = '#';
3141 wakaba 1.1 !!!next-input-character;
3142     redo A;
3143 wakaba 1.22 } elsif ($self->{is_xml} or
3144     (0x0041 <= $self->{nc} and
3145 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
3146     (0x0061 <= $self->{nc} and
3147     $self->{nc} <= 0x007A)) { # a..z
3148     !!!cp (998);
3149     require Whatpm::_NamedEntityList;
3150     $self->{state} = ENTITY_NAME_STATE;
3151 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3152     $self->{entity__value} = $self->{kwd};
3153 wakaba 1.1 $self->{entity__match} = 0;
3154     !!!next-input-character;
3155     redo A;
3156     } else {
3157     !!!cp (1027);
3158     !!!parse-error (type => 'bare ero');
3159     ## Return nothing.
3160     #
3161     }
3162    
3163     ## NOTE: No character is consumed by the "consume a character
3164     ## reference" algorithm. In other words, there is an "&" character
3165     ## that does not introduce a character reference, which would be
3166     ## appended to the parent element or the attribute value in later
3167     ## process of the tokenizer.
3168    
3169     if ($self->{prev_state} == DATA_STATE) {
3170     !!!cp (997);
3171     $self->{state} = $self->{prev_state};
3172 wakaba 1.5 $self->{s_kwd} = '';
3173 wakaba 1.1 ## Reconsume.
3174     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3175     line => $self->{line_prev},
3176     column => $self->{column_prev},
3177     });
3178     redo A;
3179     } else {
3180     !!!cp (996);
3181     $self->{ca}->{value} .= '&';
3182     $self->{state} = $self->{prev_state};
3183 wakaba 1.5 $self->{s_kwd} = '';
3184 wakaba 1.1 ## Reconsume.
3185     redo A;
3186     }
3187     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3188 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
3189 wakaba 1.1 !!!cp (995);
3190     $self->{state} = HEXREF_X_STATE;
3191 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3192 wakaba 1.1 !!!next-input-character;
3193     redo A;
3194 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
3195     !!!cp (995.1);
3196     if ($self->{is_xml}) {
3197     !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3198     }
3199     $self->{state} = HEXREF_X_STATE;
3200     $self->{kwd} .= chr $self->{nc};
3201     !!!next-input-character;
3202     redo A;
3203 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
3204     $self->{nc} <= 0x0039) { # 0..9
3205     !!!cp (994);
3206     $self->{state} = NCR_NUM_STATE;
3207 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
3208 wakaba 1.1 !!!next-input-character;
3209     redo A;
3210     } else {
3211     !!!parse-error (type => 'bare nero',
3212     line => $self->{line_prev},
3213     column => $self->{column_prev} - 1);
3214    
3215     ## NOTE: According to the spec algorithm, nothing is returned,
3216     ## and then "&#" is appended to the parent element or the attribute
3217     ## value in the later processing.
3218    
3219     if ($self->{prev_state} == DATA_STATE) {
3220     !!!cp (1019);
3221     $self->{state} = $self->{prev_state};
3222 wakaba 1.5 $self->{s_kwd} = '';
3223 wakaba 1.1 ## Reconsume.
3224     !!!emit ({type => CHARACTER_TOKEN,
3225     data => '&#',
3226     line => $self->{line_prev},
3227     column => $self->{column_prev} - 1,
3228     });
3229     redo A;
3230     } else {
3231     !!!cp (993);
3232     $self->{ca}->{value} .= '&#';
3233     $self->{state} = $self->{prev_state};
3234 wakaba 1.5 $self->{s_kwd} = '';
3235 wakaba 1.1 ## Reconsume.
3236     redo A;
3237     }
3238     }
3239     } elsif ($self->{state} == NCR_NUM_STATE) {
3240     if (0x0030 <= $self->{nc} and
3241     $self->{nc} <= 0x0039) { # 0..9
3242     !!!cp (1012);
3243 wakaba 1.12 $self->{kwd} *= 10;
3244     $self->{kwd} += $self->{nc} - 0x0030;
3245 wakaba 1.1
3246     ## Stay in the state.
3247     !!!next-input-character;
3248     redo A;
3249     } elsif ($self->{nc} == 0x003B) { # ;
3250     !!!cp (1013);
3251     !!!next-input-character;
3252     #
3253     } else {
3254     !!!cp (1014);
3255     !!!parse-error (type => 'no refc');
3256     ## Reconsume.
3257     #
3258     }
3259    
3260 wakaba 1.12 my $code = $self->{kwd};
3261 wakaba 1.1 my $l = $self->{line_prev};
3262     my $c = $self->{column_prev};
3263 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3264     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3265     ($self->{is_xml} and $code == 0x0000)) {
3266 wakaba 1.1 !!!cp (1015);
3267     !!!parse-error (type => 'invalid character reference',
3268     text => (sprintf 'U+%04X', $code),
3269     line => $l, column => $c);
3270     $code = $charref_map->{$code};
3271     } elsif ($code > 0x10FFFF) {
3272     !!!cp (1016);
3273     !!!parse-error (type => 'invalid character reference',
3274     text => (sprintf 'U-%08X', $code),
3275     line => $l, column => $c);
3276     $code = 0xFFFD;
3277     }
3278    
3279     if ($self->{prev_state} == DATA_STATE) {
3280     !!!cp (992);
3281     $self->{state} = $self->{prev_state};
3282 wakaba 1.5 $self->{s_kwd} = '';
3283 wakaba 1.1 ## Reconsume.
3284     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3285 wakaba 1.7 has_reference => 1,
3286 wakaba 1.1 line => $l, column => $c,
3287     });
3288     redo A;
3289     } else {
3290     !!!cp (991);
3291     $self->{ca}->{value} .= chr $code;
3292     $self->{ca}->{has_reference} = 1;
3293     $self->{state} = $self->{prev_state};
3294 wakaba 1.5 $self->{s_kwd} = '';
3295 wakaba 1.1 ## Reconsume.
3296     redo A;
3297     }
3298     } elsif ($self->{state} == HEXREF_X_STATE) {
3299     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3300     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3301     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3302     # 0..9, A..F, a..f
3303     !!!cp (990);
3304     $self->{state} = HEXREF_HEX_STATE;
3305 wakaba 1.12 $self->{kwd} = 0;
3306 wakaba 1.1 ## Reconsume.
3307     redo A;
3308     } else {
3309     !!!parse-error (type => 'bare hcro',
3310     line => $self->{line_prev},
3311     column => $self->{column_prev} - 2);
3312    
3313     ## NOTE: According to the spec algorithm, nothing is returned,
3314     ## and then "&#" followed by "X" or "x" is appended to the parent
3315     ## element or the attribute value in the later processing.
3316    
3317     if ($self->{prev_state} == DATA_STATE) {
3318     !!!cp (1005);
3319     $self->{state} = $self->{prev_state};
3320 wakaba 1.5 $self->{s_kwd} = '';
3321 wakaba 1.1 ## Reconsume.
3322     !!!emit ({type => CHARACTER_TOKEN,
3323 wakaba 1.12 data => '&' . $self->{kwd},
3324 wakaba 1.1 line => $self->{line_prev},
3325 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
3326 wakaba 1.1 });
3327     redo A;
3328     } else {
3329     !!!cp (989);
3330 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
3331 wakaba 1.1 $self->{state} = $self->{prev_state};
3332 wakaba 1.5 $self->{s_kwd} = '';
3333 wakaba 1.1 ## Reconsume.
3334     redo A;
3335     }
3336     }
3337     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3338     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3339     # 0..9
3340     !!!cp (1002);
3341 wakaba 1.12 $self->{kwd} *= 0x10;
3342     $self->{kwd} += $self->{nc} - 0x0030;
3343 wakaba 1.1 ## Stay in the state.
3344     !!!next-input-character;
3345     redo A;
3346     } elsif (0x0061 <= $self->{nc} and
3347     $self->{nc} <= 0x0066) { # a..f
3348     !!!cp (1003);
3349 wakaba 1.12 $self->{kwd} *= 0x10;
3350     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3351 wakaba 1.1 ## Stay in the state.
3352     !!!next-input-character;
3353     redo A;
3354     } elsif (0x0041 <= $self->{nc} and
3355     $self->{nc} <= 0x0046) { # A..F
3356     !!!cp (1004);
3357 wakaba 1.12 $self->{kwd} *= 0x10;
3358     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3359 wakaba 1.1 ## Stay in the state.
3360     !!!next-input-character;
3361     redo A;
3362     } elsif ($self->{nc} == 0x003B) { # ;
3363     !!!cp (1006);
3364     !!!next-input-character;
3365     #
3366     } else {
3367     !!!cp (1007);
3368     !!!parse-error (type => 'no refc',
3369     line => $self->{line},
3370     column => $self->{column});
3371     ## Reconsume.
3372     #
3373     }
3374    
3375 wakaba 1.12 my $code = $self->{kwd};
3376 wakaba 1.1 my $l = $self->{line_prev};
3377     my $c = $self->{column_prev};
3378 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
3379     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3380     ($self->{is_xml} and $code == 0x0000)) {
3381 wakaba 1.1 !!!cp (1008);
3382     !!!parse-error (type => 'invalid character reference',
3383     text => (sprintf 'U+%04X', $code),
3384     line => $l, column => $c);
3385     $code = $charref_map->{$code};
3386     } elsif ($code > 0x10FFFF) {
3387     !!!cp (1009);
3388     !!!parse-error (type => 'invalid character reference',
3389     text => (sprintf 'U-%08X', $code),
3390     line => $l, column => $c);
3391     $code = 0xFFFD;
3392     }
3393    
3394     if ($self->{prev_state} == DATA_STATE) {
3395     !!!cp (988);
3396     $self->{state} = $self->{prev_state};
3397 wakaba 1.5 $self->{s_kwd} = '';
3398 wakaba 1.1 ## Reconsume.
3399     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3400 wakaba 1.7 has_reference => 1,
3401 wakaba 1.1 line => $l, column => $c,
3402     });
3403     redo A;
3404     } else {
3405     !!!cp (987);
3406     $self->{ca}->{value} .= chr $code;
3407     $self->{ca}->{has_reference} = 1;
3408     $self->{state} = $self->{prev_state};
3409 wakaba 1.5 $self->{s_kwd} = '';
3410 wakaba 1.1 ## Reconsume.
3411     redo A;
3412     }
3413     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3414 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
3415     $self->{nc} <= 0x005A) or # Z
3416     (0x0061 <= $self->{nc} and # a
3417     $self->{nc} <= 0x007A) or # z
3418     (0x0030 <= $self->{nc} and # 0
3419     $self->{nc} <= 0x0039) or # 9
3420 wakaba 1.22 $self->{nc} == 0x003B or # ;
3421     ($self->{is_xml} and
3422     not ($is_space->{$self->{nc}} or
3423     {
3424     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3425     $self->{entity_add} => 1,
3426     }->{$self->{nc}}))) {
3427 wakaba 1.1 our $EntityChar;
3428 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3429 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
3430     $self->{ge}->{$self->{kwd}}) {
3431 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3432 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
3433     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3434     !!!cp (1020.1);
3435     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3436     } else {
3437     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3438     !!!cp (1020.2);
3439     !!!parse-error (type => 'unparsed entity', ## TODO: type
3440     value => $self->{kwd});
3441     } else {
3442     !!!cp (1020.3);
3443     }
3444     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3445     }
3446     } else {
3447     if ($self->{is_xml}) {
3448     !!!cp (1020.4);
3449     !!!parse-error (type => 'entity not declared', ## TODO: type
3450     value => $self->{kwd},
3451     level => {
3452     'amp;' => $self->{level}->{warn},
3453     'quot;' => $self->{level}->{warn},
3454     'lt;' => $self->{level}->{warn},
3455     'gt;' => $self->{level}->{warn},
3456     'apos;' => $self->{level}->{warn},
3457     }->{$self->{kwd}} ||
3458     $self->{level}->{must});
3459     } else {
3460     !!!cp (1020);
3461     }
3462     $self->{entity__value} = $EntityChar->{$self->{kwd}};
3463     }
3464 wakaba 1.1 $self->{entity__match} = 1;
3465     !!!next-input-character;
3466     #
3467     } else {
3468     !!!cp (1021);
3469 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3470 wakaba 1.1 $self->{entity__match} = -1;
3471     ## Stay in the state.
3472     !!!next-input-character;
3473     redo A;
3474     }
3475     } else {
3476     !!!cp (1022);
3477     $self->{entity__value} .= chr $self->{nc};
3478     $self->{entity__match} *= 2;
3479     ## Stay in the state.
3480     !!!next-input-character;
3481     redo A;
3482     }
3483     }
3484    
3485     my $data;
3486     my $has_ref;
3487     if ($self->{entity__match} > 0) {
3488     !!!cp (1023);
3489     $data = $self->{entity__value};
3490     $has_ref = 1;
3491     #
3492     } elsif ($self->{entity__match} < 0) {
3493     !!!parse-error (type => 'no refc');
3494     if ($self->{prev_state} != DATA_STATE and # in attribute
3495     $self->{entity__match} < -1) {
3496     !!!cp (1024);
3497 wakaba 1.12 $data = '&' . $self->{kwd};
3498 wakaba 1.1 #
3499     } else {
3500     !!!cp (1025);
3501     $data = $self->{entity__value};
3502     $has_ref = 1;
3503     #
3504     }
3505     } else {
3506     !!!cp (1026);
3507     !!!parse-error (type => 'bare ero',
3508     line => $self->{line_prev},
3509 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3510     $data = '&' . $self->{kwd};
3511 wakaba 1.1 #
3512     }
3513    
3514     ## NOTE: In these cases, when a character reference is found,
3515     ## it is consumed and a character token is returned, or, otherwise,
3516     ## nothing is consumed and returned, according to the spec algorithm.
3517     ## In this implementation, anything that has been examined by the
3518     ## tokenizer is appended to the parent element or the attribute value
3519     ## as string, either literal string when no character reference or
3520     ## entity-replaced string otherwise, in this stage, since any characters
3521     ## that would not be consumed are appended in the data state or in an
3522     ## appropriate attribute value state anyway.
3523    
3524     if ($self->{prev_state} == DATA_STATE) {
3525     !!!cp (986);
3526     $self->{state} = $self->{prev_state};
3527 wakaba 1.5 $self->{s_kwd} = '';
3528 wakaba 1.1 ## Reconsume.
3529     !!!emit ({type => CHARACTER_TOKEN,
3530     data => $data,
3531 wakaba 1.7 has_reference => $has_ref,
3532 wakaba 1.1 line => $self->{line_prev},
3533 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3534 wakaba 1.1 });
3535     redo A;
3536     } else {
3537     !!!cp (985);
3538     $self->{ca}->{value} .= $data;
3539     $self->{ca}->{has_reference} = 1 if $has_ref;
3540     $self->{state} = $self->{prev_state};
3541 wakaba 1.5 $self->{s_kwd} = '';
3542 wakaba 1.1 ## Reconsume.
3543     redo A;
3544     }
3545 wakaba 1.8
3546     ## XML-only states
3547    
3548     } elsif ($self->{state} == PI_STATE) {
3549 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3550    
3551 wakaba 1.8 if ($is_space->{$self->{nc}} or
3552 wakaba 1.14 $self->{nc} == 0x003F or # ?
3553 wakaba 1.8 $self->{nc} == -1) {
3554 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3555     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3556     ## "DOCTYPE pi state": Parse error, switch to the "data
3557     ## state".
3558 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3559     line => $self->{line_prev},
3560     column => $self->{column_prev}
3561     - 1 * ($self->{nc} != -1));
3562     $self->{state} = BOGUS_COMMENT_STATE;
3563     ## Reconsume.
3564     $self->{ct} = {type => COMMENT_TOKEN,
3565     data => '?',
3566     line => $self->{line_prev},
3567     column => $self->{column_prev}
3568     - 1 * ($self->{nc} != -1),
3569     };
3570     redo A;
3571     } else {
3572 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3573 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3574     target => chr $self->{nc},
3575     data => '',
3576     line => $self->{line_prev},
3577     column => $self->{column_prev} - 1,
3578     };
3579     $self->{state} = PI_TARGET_STATE;
3580     !!!next-input-character;
3581     redo A;
3582     }
3583     } elsif ($self->{state} == PI_TARGET_STATE) {
3584     if ($is_space->{$self->{nc}}) {
3585     $self->{state} = PI_TARGET_AFTER_STATE;
3586     !!!next-input-character;
3587     redo A;
3588     } elsif ($self->{nc} == -1) {
3589     !!!parse-error (type => 'no pic'); ## TODO: type
3590 wakaba 1.13 if ($self->{in_subset}) {
3591     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3592     } else {
3593     $self->{state} = DATA_STATE;
3594     $self->{s_kwd} = '';
3595     }
3596 wakaba 1.8 ## Reconsume.
3597     !!!emit ($self->{ct}); # pi
3598     redo A;
3599     } elsif ($self->{nc} == 0x003F) { # ?
3600     $self->{state} = PI_AFTER_STATE;
3601     !!!next-input-character;
3602     redo A;
3603     } else {
3604     ## XML5: The draft says "tag name" here, a typo for "target".
3605     $self->{ct}->{target} .= chr $self->{nc}; # pi
3606     !!!next-input-character;
3607     redo A;
3608     }
3609     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3610     if ($is_space->{$self->{nc}}) {
3611     ## Stay in the state.
3612     !!!next-input-character;
3613     redo A;
3614     } else {
3615     $self->{state} = PI_DATA_STATE;
3616     ## Reprocess.
3617     redo A;
3618     }
3619     } elsif ($self->{state} == PI_DATA_STATE) {
3620     if ($self->{nc} == 0x003F) { # ?
3621     $self->{state} = PI_DATA_AFTER_STATE;
3622     !!!next-input-character;
3623     redo A;
3624     } elsif ($self->{nc} == -1) {
3625     !!!parse-error (type => 'no pic'); ## TODO: type
3626 wakaba 1.13 if ($self->{in_subset}) {
3627 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3628 wakaba 1.13 } else {
3629     $self->{state} = DATA_STATE;
3630     $self->{s_kwd} = '';
3631     }
3632 wakaba 1.8 ## Reprocess.
3633     !!!emit ($self->{ct}); # pi
3634     redo A;
3635     } else {
3636     $self->{ct}->{data} .= chr $self->{nc}; # pi
3637     $self->{read_until}->($self->{ct}->{data}, q[?],
3638     length $self->{ct}->{data});
3639     ## Stay in the state.
3640     !!!next-input-character;
3641     ## Continue with the next input character.
3642     redo A;
3643     }
3644     } elsif ($self->{state} == PI_AFTER_STATE) {
3645 wakaba 1.14 ## XML5: Part of "Pi after state".
3646    
3647 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3648 wakaba 1.13 if ($self->{in_subset}) {
3649     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3650     } else {
3651     $self->{state} = DATA_STATE;
3652     $self->{s_kwd} = '';
3653     }
3654 wakaba 1.8 !!!next-input-character;
3655     !!!emit ($self->{ct}); # pi
3656     redo A;
3657     } elsif ($self->{nc} == 0x003F) { # ?
3658     !!!parse-error (type => 'no s after target', ## TODO: type
3659     line => $self->{line_prev},
3660     column => $self->{column_prev}); ## XML5: no error
3661     $self->{ct}->{data} .= '?';
3662     $self->{state} = PI_DATA_AFTER_STATE;
3663     !!!next-input-character;
3664     redo A;
3665     } else {
3666     !!!parse-error (type => 'no s after target', ## TODO: type
3667     line => $self->{line_prev},
3668     column => $self->{column_prev}
3669     + 1 * ($self->{nc} == -1)); ## XML5: no error
3670     $self->{ct}->{data} .= '?'; ## XML5: not appended
3671     $self->{state} = PI_DATA_STATE;
3672     ## Reprocess.
3673     redo A;
3674     }
3675     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3676 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3677    
3678 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3679 wakaba 1.13 if ($self->{in_subset}) {
3680     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3681     } else {
3682     $self->{state} = DATA_STATE;
3683     $self->{s_kwd} = '';
3684     }
3685 wakaba 1.8 !!!next-input-character;
3686     !!!emit ($self->{ct}); # pi
3687     redo A;
3688     } elsif ($self->{nc} == 0x003F) { # ?
3689     $self->{ct}->{data} .= '?';
3690     ## Stay in the state.
3691     !!!next-input-character;
3692     redo A;
3693     } else {
3694     $self->{ct}->{data} .= '?'; ## XML5: not appended
3695     $self->{state} = PI_DATA_STATE;
3696     ## Reprocess.
3697     redo A;
3698     }
3699 wakaba 1.12
3700     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3701     if ($self->{nc} == 0x003C) { # <
3702 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3703 wakaba 1.12 !!!next-input-character;
3704     redo A;
3705     } elsif ($self->{nc} == 0x0025) { # %
3706     ## XML5: Not defined yet.
3707    
3708     ## TODO: Handle parameter entity references; only the "%" is consumed here.
3709 wakaba 1.24
3710     if (not $self->{stop_processing} and
3711     not $self->{document}->xml_standalone) {
3712     !!!parse-error (type => 'stop processing', ## TODO: type
3713     level => $self->{level}->{info});
3714     $self->{stop_processing} = 1;
3715     }
3716    
3717 wakaba 1.12 !!!next-input-character;
3718     redo A;
3719     } elsif ($self->{nc} == 0x005D) { # ]
3720 wakaba 1.13 delete $self->{in_subset};
3721 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3722     !!!next-input-character;
3723     redo A;
3724     } elsif ($is_space->{$self->{nc}}) {
3725     ## Stay in the state.
3726     !!!next-input-character;
3727     redo A;
3728     } elsif ($self->{nc} == -1) {
3729     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3730 wakaba 1.13 delete $self->{in_subset};
3731 wakaba 1.12 $self->{state} = DATA_STATE;
3732     $self->{s_kwd} = '';
3733     ## Reconsume.
3734 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3735 wakaba 1.12 redo A;
3736     } else {
3737     unless ($self->{internal_subset_tainted}) {
3738     ## XML5: No parse error.
3739     !!!parse-error (type => 'string in internal subset');
3740     $self->{internal_subset_tainted} = 1;
3741     }
3742     ## Stay in the state.
3743     !!!next-input-character;
3744     redo A;
3745     }
3746     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3747     if ($self->{nc} == 0x003E) { # >
3748     $self->{state} = DATA_STATE;
3749     $self->{s_kwd} = '';
3750     !!!next-input-character;
3751 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3752 wakaba 1.12 redo A;
3753     } elsif ($self->{nc} == -1) {
3754     !!!parse-error (type => 'unclosed DOCTYPE');
3755     $self->{state} = DATA_STATE;
3756     $self->{s_kwd} = '';
3757     ## Reconsume.
3758 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3759 wakaba 1.12 redo A;
3760     } else {
3761     ## XML5: No parse error and stay in the state.
3762     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3763    
3764 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3765     !!!next-input-character;
3766     redo A;
3767     }
3768     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3769     if ($self->{nc} == 0x003E) { # >
3770     $self->{state} = DATA_STATE;
3771     $self->{s_kwd} = '';
3772     !!!next-input-character;
3773     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3774     redo A;
3775     } elsif ($self->{nc} == -1) {
3776     $self->{state} = DATA_STATE;
3777     $self->{s_kwd} = '';
3778     ## Reconsume.
3779     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3780     redo A;
3781     } else {
3782     ## Stay in the state.
3783     !!!next-input-character;
3784     redo A;
3785     }
3786     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3787     if ($self->{nc} == 0x0021) { # !
3788 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3789 wakaba 1.13 !!!next-input-character;
3790     redo A;
3791     } elsif ($self->{nc} == 0x003F) { # ?
3792     $self->{state} = PI_STATE;
3793     !!!next-input-character;
3794     redo A;
3795     } elsif ($self->{nc} == -1) {
3796     !!!parse-error (type => 'bare stago');
3797     $self->{state} = DATA_STATE;
3798     $self->{s_kwd} = '';
3799     ## Reconsume.
3800     redo A;
3801     } else {
3802     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3803     line => $self->{line_prev},
3804     column => $self->{column_prev});
3805     $self->{state} = BOGUS_COMMENT_STATE;
3806     $self->{ct} = {type => COMMENT_TOKEN,
3807     data => '',
3808     }; ## NOTE: Will be discarded.
3809 wakaba 1.12 !!!next-input-character;
3810     redo A;
3811     }
3812 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3813     ## XML5: "DOCTYPE markup declaration state".
3814    
3815     if ($self->{nc} == 0x002D) { # -
3816     $self->{state} = MD_HYPHEN_STATE;
3817     !!!next-input-character;
3818     redo A;
3819 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
3820     $self->{nc} == 0x0065) { # e
3821 wakaba 1.14 $self->{state} = MD_E_STATE;
3822     $self->{kwd} = chr $self->{nc};
3823     !!!next-input-character;
3824     redo A;
3825 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
3826     $self->{nc} == 0x0061) { # a
3827 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
3828     $self->{kwd} = chr $self->{nc};
3829     !!!next-input-character;
3830     redo A;
3831 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
3832     $self->{nc} == 0x006E) { # n
3833 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
3834     $self->{kwd} = chr $self->{nc};
3835     !!!next-input-character;
3836     redo A;
3837     } else {
3838     #
3839     }
3840    
3841     ## XML5: No parse error.
3842     !!!parse-error (type => 'bogus comment',
3843     line => $self->{line_prev},
3844     column => $self->{column_prev} - 1);
3845     ## Reconsume.
3846     $self->{state} = BOGUS_COMMENT_STATE;
3847     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3848     redo A;
3849     } elsif ($self->{state} == MD_E_STATE) {
3850 wakaba 1.17 if ($self->{nc} == 0x004E or # N
3851     $self->{nc} == 0x006E) { # n
3852 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
3853     $self->{kwd} .= chr $self->{nc};
3854     !!!next-input-character;
3855     redo A;
3856 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
3857     $self->{nc} == 0x006C) { # l
3858 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
3859     $self->{state} = MD_ELEMENT_STATE;
3860     $self->{kwd} .= chr $self->{nc};
3861     !!!next-input-character;
3862     redo A;
3863     } else {
3864     ## XML5: No parse error.
3865     !!!parse-error (type => 'bogus comment',
3866     line => $self->{line_prev},
3867     column => $self->{column_prev} - 2
3868     + 1 * ($self->{nc} == -1));
3869     ## Reconsume.
3870     $self->{state} = BOGUS_COMMENT_STATE;
3871     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3872     redo A;
3873     }
3874     } elsif ($self->{state} == MD_ENTITY_STATE) {
3875 wakaba 1.17 if ($self->{nc} == [
3876     undef,
3877     undef,
3878     0x0054, # T
3879     0x0049, # I
3880     0x0054, # T
3881     ]->[length $self->{kwd}] or
3882     $self->{nc} == [
3883     undef,
3884     undef,
3885     0x0074, # t
3886     0x0069, # i
3887     0x0074, # t
3888     ]->[length $self->{kwd}]) {
3889 wakaba 1.14 ## Stay in the state.
3890     $self->{kwd} .= chr $self->{nc};
3891     !!!next-input-character;
3892     redo A;
3893 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
3894     ($self->{nc} == 0x0059 or # Y
3895     $self->{nc} == 0x0079)) { # y
3896     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3897     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3898     text => 'ENTITY',
3899     line => $self->{line_prev},
3900     column => $self->{column_prev} - 4);
3901     }
3902     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3903 wakaba 1.14 line => $self->{line_prev},
3904     column => $self->{column_prev} - 6};
3905     $self->{state} = DOCTYPE_MD_STATE;
3906     !!!next-input-character;
3907     redo A;
3908     } else {
3909     !!!parse-error (type => 'bogus comment',
3910     line => $self->{line_prev},
3911     column => $self->{column_prev} - 1
3912     - (length $self->{kwd})
3913     + 1 * ($self->{nc} == -1));
3914     $self->{state} = BOGUS_COMMENT_STATE;
3915     ## Reconsume.
3916     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3917     redo A;
3918     }
3919     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3920 wakaba 1.17 if ($self->{nc} == [
3921     undef,
3922     undef,
3923     0x0045, # E
3924     0x004D, # M
3925     0x0045, # E
3926     0x004E, # N
3927     ]->[length $self->{kwd}] or
3928     $self->{nc} == [
3929     undef,
3930     undef,
3931     0x0065, # e
3932     0x006D, # m
3933     0x0065, # e
3934     0x006E, # n
3935     ]->[length $self->{kwd}]) {
3936 wakaba 1.14 ## Stay in the state.
3937     $self->{kwd} .= chr $self->{nc};
3938     !!!next-input-character;
3939     redo A;
3940 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3941     ($self->{nc} == 0x0054 or # T
3942     $self->{nc} == 0x0074)) { # t
3943     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3944     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3945     text => 'ELEMENT',
3946     line => $self->{line_prev},
3947     column => $self->{column_prev} - 5);
3948     }
3949 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3950     line => $self->{line_prev},
3951 wakaba 1.23 column => $self->{column_prev} - 7};
3952 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
3953     !!!next-input-character;
3954     redo A;
3955     } else {
3956     !!!parse-error (type => 'bogus comment',
3957     line => $self->{line_prev},
3958     column => $self->{column_prev} - 1
3959     - (length $self->{kwd})
3960     + 1 * ($self->{nc} == -1));
3961     $self->{state} = BOGUS_COMMENT_STATE;
3962     ## Reconsume.
3963     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3964     redo A;
3965     }
3966     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3967 wakaba 1.17 if ($self->{nc} == [
3968     undef,
3969     0x0054, # T
3970     0x0054, # T
3971     0x004C, # L
3972     0x0049, # I
3973     0x0053, # S
3974     ]->[length $self->{kwd}] or
3975     $self->{nc} == [
3976     undef,
3977     0x0074, # t
3978     0x0074, # t
3979     0x006C, # l
3980     0x0069, # i
3981     0x0073, # s
3982     ]->[length $self->{kwd}]) {
3983 wakaba 1.14 ## Stay in the state.
3984     $self->{kwd} .= chr $self->{nc};
3985     !!!next-input-character;
3986     redo A;
3987 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
3988     ($self->{nc} == 0x0054 or # T
3989     $self->{nc} == 0x0074)) { # t
3990     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3991     !!!parse-error (type => 'lowercase keyword', ## TODO: type
3992     text => 'ATTLIST',
3993     line => $self->{line_prev},
3994     column => $self->{column_prev} - 5);
3995     }
3996 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3997 wakaba 1.15 attrdefs => [],
3998 wakaba 1.14 line => $self->{line_prev},
3999 wakaba 1.23 column => $self->{column_prev} - 7};
4000 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4001     !!!next-input-character;
4002     redo A;
4003     } else {
4004     !!!parse-error (type => 'bogus comment',
4005     line => $self->{line_prev},
4006     column => $self->{column_prev} - 1
4007     - (length $self->{kwd})
4008     + 1 * ($self->{nc} == -1));
4009     $self->{state} = BOGUS_COMMENT_STATE;
4010     ## Reconsume.
4011     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4012     redo A;
4013     }
4014     } elsif ($self->{state} == MD_NOTATION_STATE) {
4015 wakaba 1.17 if ($self->{nc} == [
4016     undef,
4017     0x004F, # O
4018     0x0054, # T
4019     0x0041, # A
4020     0x0054, # T
4021     0x0049, # I
4022     0x004F, # O
4023     ]->[length $self->{kwd}] or
4024     $self->{nc} == [
4025     undef,
4026     0x006F, # o
4027     0x0074, # t
4028     0x0061, # a
4029     0x0074, # t
4030     0x0069, # i
4031     0x006F, # o
4032     ]->[length $self->{kwd}]) {
4033 wakaba 1.14 ## Stay in the state.
4034     $self->{kwd} .= chr $self->{nc};
4035     !!!next-input-character;
4036     redo A;
4037 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
4038     ($self->{nc} == 0x004E or # N
4039     $self->{nc} == 0x006E)) { # n
4040     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4041     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4042     text => 'NOTATION',
4043     line => $self->{line_prev},
4044     column => $self->{column_prev} - 6);
4045     }
4046 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4047     line => $self->{line_prev},
4048 wakaba 1.23 column => $self->{column_prev} - 8};
4049 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
4050     !!!next-input-character;
4051     redo A;
4052     } else {
4053     !!!parse-error (type => 'bogus comment',
4054     line => $self->{line_prev},
4055     column => $self->{column_prev} - 1
4056     - (length $self->{kwd})
4057     + 1 * ($self->{nc} == -1));
4058     $self->{state} = BOGUS_COMMENT_STATE;
4059     ## Reconsume.
4060     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4061     redo A;
4062     }
4063     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4064     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4065     ## "DOCTYPE NOTATION state".
4066    
4067     if ($is_space->{$self->{nc}}) {
4068     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4069     $self->{state} = BEFORE_MD_NAME_STATE;
4070     !!!next-input-character;
4071     redo A;
4072     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4073     $self->{nc} == 0x0025) { # %
4074     ## XML5: Switch to the "DOCTYPE bogus comment state".
4075     !!!parse-error (type => 'no space before md name'); ## TODO: type
4076     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4077     !!!next-input-character;
4078     redo A;
4079     } elsif ($self->{nc} == -1) {
4080     !!!parse-error (type => 'unclosed md'); ## TODO: type
4081     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4082     ## Reconsume.
4083     redo A;
4084     } elsif ($self->{nc} == 0x003E) { # >
4085     ## XML5: Switch to the "DOCTYPE bogus comment state".
4086     !!!parse-error (type => 'no md name'); ## TODO: type
4087     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4088     !!!next-input-character;
4089     redo A;
4090     } else {
4091     ## XML5: Switch to the "DOCTYPE bogus comment state".
4092     !!!parse-error (type => 'no space before md name'); ## TODO: type
4093     $self->{state} = BEFORE_MD_NAME_STATE;
4094     redo A;
4095     }
4096     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4097     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4098     ## before state", "DOCTYPE ATTLIST name before state".
4099    
4100     if ($is_space->{$self->{nc}}) {
4101     ## Stay in the state.
4102     !!!next-input-character;
4103     redo A;
4104     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4105     $self->{nc} == 0x0025) { # %
4106     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4107     !!!next-input-character;
4108     redo A;
4109     } elsif ($self->{nc} == 0x003E) { # >
4110     ## XML5: Same as "Anything else".
4111     !!!parse-error (type => 'no md name'); ## TODO: type
4112     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4113     !!!next-input-character;
4114     redo A;
4115     } elsif ($self->{nc} == -1) {
4116     !!!parse-error (type => 'unclosed md'); ## TODO: type
4117     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4118     ## Reconsume.
4119     redo A;
4120     } else {
4121     ## XML5: [ATTLIST] Not defined yet.
4122     $self->{ct}->{name} .= chr $self->{nc};
4123     $self->{state} = MD_NAME_STATE;
4124     !!!next-input-character;
4125     redo A;
4126     }
4127     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4128     if ($is_space->{$self->{nc}}) {
4129     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4130     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4131     $self->{state} = BEFORE_MD_NAME_STATE;
4132     !!!next-input-character;
4133     redo A;
4134     } elsif ($self->{nc} == 0x003E) { # >
4135     ## XML5: Same as "Anything else".
4136     !!!parse-error (type => 'no md name'); ## TODO: type
4137     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4138     !!!next-input-character;
4139     redo A;
4140     } elsif ($self->{nc} == -1) {
4141     !!!parse-error (type => 'unclosed md');
4142     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4143     ## Reconsume.
4144     redo A;
4145     } else {
4146     ## XML5: No parse error.
4147     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4148     $self->{state} = BOGUS_COMMENT_STATE;
4149     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4150     ## Reconsume.
4151     redo A;
4152     }
4153     } elsif ($self->{state} == MD_NAME_STATE) {
4154     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4155    
4156     if ($is_space->{$self->{nc}}) {
4157 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4158     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4159     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4160 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4161 wakaba 1.16 } else { # ENTITY/NOTATION
4162     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4163     }
4164 wakaba 1.14 !!!next-input-character;
4165     redo A;
4166     } elsif ($self->{nc} == 0x003E) { # >
4167     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4168     #
4169     } else {
4170 wakaba 1.16 !!!parse-error (type => 'no md def'); ## TODO: type
4171 wakaba 1.14 }
4172     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4173     !!!next-input-character;
4174     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4175     redo A;
4176     } elsif ($self->{nc} == -1) {
4177     ## XML5: [ATTLIST] No parse error.
4178     !!!parse-error (type => 'unclosed md');
4179     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4180     ## Reconsume.
4181     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4182     redo A;
4183     } else {
4184     ## XML5: [ATTLIST] Not defined yet.
4185     $self->{ct}->{name} .= chr $self->{nc};
4186     ## Stay in the state.
4187     !!!next-input-character;
4188     redo A;
4189     }
4190     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4191     if ($is_space->{$self->{nc}}) {
4192     ## Stay in the state.
4193     !!!next-input-character;
4194     redo A;
4195     } elsif ($self->{nc} == 0x003E) { # >
4196     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4197     !!!next-input-character;
4198     !!!emit ($self->{ct}); # ATTLIST
4199     redo A;
4200     } elsif ($self->{nc} == -1) {
4201     ## XML5: No parse error.
4202     !!!parse-error (type => 'unclosed md'); ## TODO: type
4203     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4204 wakaba 1.15 !!!emit ($self->{ct});
4205     redo A;
4206     } else {
4207     ## XML5: Not defined yet.
4208     $self->{ca} = {name => chr ($self->{nc}), # attrdef
4209     tokens => [],
4210     line => $self->{line}, column => $self->{column}};
4211     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4212     !!!next-input-character;
4213     redo A;
4214     }
4215     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4216     if ($is_space->{$self->{nc}}) {
4217     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4218     !!!next-input-character;
4219     redo A;
4220     } elsif ($self->{nc} == 0x003E) { # >
4221     ## XML5: Same as "anything else".
4222     !!!parse-error (type => 'no attr type'); ## TODO: type
4223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224     !!!next-input-character;
4225     !!!emit ($self->{ct}); # ATTLIST
4226     redo A;
4227     } elsif ($self->{nc} == 0x0028) { # (
4228     ## XML5: Same as "anything else".
4229     !!!parse-error (type => 'no space before paren'); ## TODO: type
4230     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4231     !!!next-input-character;
4232     redo A;
4233     } elsif ($self->{nc} == -1) {
4234     ## XML5: No parse error.
4235     !!!parse-error (type => 'unclosed md'); ## TODO: type
4236     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4237     !!!next-input-character;
4238     !!!emit ($self->{ct}); # ATTLIST
4239     redo A;
4240     } else {
4241     ## XML5: Not defined yet.
4242     $self->{ca}->{name} .= chr $self->{nc};
4243     ## Stay in the state.
4244     !!!next-input-character;
4245     redo A;
4246     }
4247     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4248     if ($is_space->{$self->{nc}}) {
4249     ## Stay in the state.
4250     !!!next-input-character;
4251     redo A;
4252     } elsif ($self->{nc} == 0x003E) { # >
4253     ## XML5: Same as "anything else".
4254     !!!parse-error (type => 'no attr type'); ## TODO: type
4255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4256     !!!next-input-character;
4257     !!!emit ($self->{ct}); # ATTLIST
4258     redo A;
4259     } elsif ($self->{nc} == 0x0028) { # (
4260     ## XML5: Same as "anything else".
4261     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4262     !!!next-input-character;
4263     redo A;
4264     } elsif ($self->{nc} == -1) {
4265     ## XML5: No parse error.
4266     !!!parse-error (type => 'unclosed md'); ## TODO: type
4267     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4268     !!!next-input-character;
4269     !!!emit ($self->{ct});
4270 wakaba 1.14 redo A;
4271     } else {
4272     ## XML5: Not defined yet.
4273 wakaba 1.15 $self->{ca}->{type} = chr $self->{nc};
4274     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4275     !!!next-input-character;
4276     redo A;
4277     }
4278     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4279     if ($is_space->{$self->{nc}}) {
4280     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4281     !!!next-input-character;
4282     redo A;
4283     } elsif ($self->{nc} == 0x0023) { # #
4284     ## XML5: Same as "anything else".
4285     !!!parse-error (type => 'no space before default value'); ## TODO: type
4286     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4287     !!!next-input-character;
4288     redo A;
4289     } elsif ($self->{nc} == 0x0022) { # "
4290     ## XML5: Same as "anything else".
4291     !!!parse-error (type => 'no space before default value'); ## TODO: type
4292     $self->{ca}->{value} = '';
4293     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4294     !!!next-input-character;
4295     redo A;
4296     } elsif ($self->{nc} == 0x0027) { # '
4297     ## XML5: Same as "anything else".
4298     !!!parse-error (type => 'no space before default value'); ## TODO: type
4299     $self->{ca}->{value} = '';
4300     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4301     !!!next-input-character;
4302     redo A;
4303     } elsif ($self->{nc} == 0x003E) { # >
4304     ## XML5: Same as "anything else".
4305     !!!parse-error (type => 'no attr default'); ## TODO: type
4306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4307     !!!next-input-character;
4308     !!!emit ($self->{ct}); # ATTLIST
4309     redo A;
4310     } elsif ($self->{nc} == 0x0028) { # (
4311     ## XML5: Same as "anything else".
4312     !!!parse-error (type => 'no space before paren'); ## TODO: type
4313     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4314     !!!next-input-character;
4315     redo A;
4316     } elsif ($self->{nc} == -1) {
4317     ## XML5: No parse error.
4318     !!!parse-error (type => 'unclosed md'); ## TODO: type
4319     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4320     !!!next-input-character;
4321     !!!emit ($self->{ct});
4322     redo A;
4323     } else {
4324     ## XML5: Not defined yet.
4325     $self->{ca}->{type} .= chr $self->{nc};
4326     ## Stay in the state.
4327     !!!next-input-character;
4328     redo A;
4329     }
4330     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4331     if ($is_space->{$self->{nc}}) {
4332     ## Stay in the state.
4333     !!!next-input-character;
4334     redo A;
4335     } elsif ($self->{nc} == 0x0028) { # (
4336     ## XML5: Same as "anything else".
4337     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4338     !!!next-input-character;
4339     redo A;
4340     } elsif ($self->{nc} == 0x0023) { # #
4341     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4342     !!!next-input-character;
4343     redo A;
4344     } elsif ($self->{nc} == 0x0022) { # "
4345     ## XML5: Same as "anything else".
4346     $self->{ca}->{value} = '';
4347     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4348     !!!next-input-character;
4349     redo A;
4350     } elsif ($self->{nc} == 0x0027) { # '
4351     ## XML5: Same as "anything else".
4352     $self->{ca}->{value} = '';
4353     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4354     !!!next-input-character;
4355     redo A;
4356     } elsif ($self->{nc} == 0x003E) { # >
4357     ## XML5: Same as "anything else".
4358     !!!parse-error (type => 'no attr default'); ## TODO: type
4359     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4360     !!!next-input-character;
4361     !!!emit ($self->{ct}); # ATTLIST
4362     redo A;
4363     } elsif ($self->{nc} == -1) {
4364     ## XML5: No parse error.
4365     !!!parse-error (type => 'unclosed md'); ## TODO: type
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4367     !!!next-input-character;
4368     !!!emit ($self->{ct});
4369     redo A;
4370     } else {
4371     ## XML5: Switch to the "DOCTYPE bogus comment state".
4372     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4373     $self->{ca}->{value} = '';
4374     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4375     ## Reconsume.
4376     redo A;
4377     }
4378     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4379     if ($is_space->{$self->{nc}}) {
4380     ## Stay in the state.
4381     !!!next-input-character;
4382     redo A;
4383     } elsif ($self->{nc} == 0x007C) { # |
4384     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4385     ## Stay in the state.
4386     !!!next-input-character;
4387     redo A;
4388     } elsif ($self->{nc} == 0x0029) { # )
4389     !!!parse-error (type => 'empty allowed token'); ## TODO: type
4390     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4391     !!!next-input-character;
4392     redo A;
4393     } elsif ($self->{nc} == 0x003E) { # >
4394     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4395     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4396     !!!next-input-character;
4397     !!!emit ($self->{ct}); # ATTLIST
4398     redo A;
4399     } elsif ($self->{nc} == -1) {
4400     ## XML5: No parse error.
4401     !!!parse-error (type => 'unclosed md'); ## TODO: type
4402     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4403     !!!next-input-character;
4404     !!!emit ($self->{ct});
4405     redo A;
4406     } else {
4407     push @{$self->{ca}->{tokens}}, chr $self->{nc};
4408     $self->{state} = ALLOWED_TOKEN_STATE;
4409     !!!next-input-character;
4410     redo A;
4411     }
4412     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4413     if ($is_space->{$self->{nc}}) {
4414     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4415     !!!next-input-character;
4416     redo A;
4417     } elsif ($self->{nc} == 0x007C) { # |
4418     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4419     !!!next-input-character;
4420     redo A;
4421     } elsif ($self->{nc} == 0x0029) { # )
4422     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4423     !!!next-input-character;
4424     redo A;
4425     } elsif ($self->{nc} == 0x003E) { # >
4426     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4427     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428     !!!next-input-character;
4429     !!!emit ($self->{ct}); # ATTLIST
4430     redo A;
4431     } elsif ($self->{nc} == -1) {
4432     ## XML5: No parse error.
4433     !!!parse-error (type => 'unclosed md'); ## TODO: type
4434     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4435     !!!next-input-character;
4436     !!!emit ($self->{ct});
4437     redo A;
4438     } else {
4439     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4440     ## Stay in the state.
4441     !!!next-input-character;
4442     redo A;
4443     }
4444     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4445     if ($is_space->{$self->{nc}}) {
4446     ## Stay in the state.
4447     !!!next-input-character;
4448     redo A;
4449     } elsif ($self->{nc} == 0x007C) { # |
4450     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4451     !!!next-input-character;
4452     redo A;
4453     } elsif ($self->{nc} == 0x0029) { # )
4454     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4455     !!!next-input-character;
4456     redo A;
4457     } elsif ($self->{nc} == 0x003E) { # >
4458     !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4459     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4460     !!!next-input-character;
4461     !!!emit ($self->{ct}); # ATTLIST
4462     redo A;
4463     } elsif ($self->{nc} == -1) {
4464     ## XML5: No parse error.
4465     !!!parse-error (type => 'unclosed md'); ## TODO: type
4466     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4467     !!!next-input-character;
4468     !!!emit ($self->{ct});
4469     redo A;
4470     } else {
4471     !!!parse-error (type => 'space in allowed token', ## TODO: type
4472     line => $self->{line_prev},
4473     column => $self->{column_prev});
4474     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4475     $self->{state} = ALLOWED_TOKEN_STATE;
4476     !!!next-input-character;
4477     redo A;
4478     }
4479     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4480     if ($is_space->{$self->{nc}}) {
4481     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4482     !!!next-input-character;
4483     redo A;
4484     } elsif ($self->{nc} == 0x0023) { # #
4485     !!!parse-error (type => 'no space before default value'); ## TODO: type
4486     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4487     !!!next-input-character;
4488     redo A;
4489     } elsif ($self->{nc} == 0x0022) { # "
4490     !!!parse-error (type => 'no space before default value'); ## TODO: type
4491     $self->{ca}->{value} = '';
4492     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4493     !!!next-input-character;
4494     redo A;
4495     } elsif ($self->{nc} == 0x0027) { # '
4496     !!!parse-error (type => 'no space before default value'); ## TODO: type
4497     $self->{ca}->{value} = '';
4498     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4499     !!!next-input-character;
4500     redo A;
4501     } elsif ($self->{nc} == 0x003E) { # >
4502     !!!parse-error (type => 'no attr default'); ## TODO: type
4503     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4504     !!!next-input-character;
4505     !!!emit ($self->{ct}); # ATTLIST
4506     redo A;
4507     } elsif ($self->{nc} == -1) {
4508     !!!parse-error (type => 'unclosed md'); ## TODO: type
4509     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4510     !!!next-input-character;
4511     !!!emit ($self->{ct});
4512     redo A;
4513     } else {
4514     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4515     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4516     ## Reconsume.
4517     redo A;
4518     }
4519     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4520     if ($is_space->{$self->{nc}}) {
4521     ## Stay in the state.
4522     !!!next-input-character;
4523     redo A;
4524     } elsif ($self->{nc} == 0x0023) { # #
4525     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4526     !!!next-input-character;
4527     redo A;
4528     } elsif ($self->{nc} == 0x0022) { # "
4529     $self->{ca}->{value} = '';
4530     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4531     !!!next-input-character;
4532     redo A;
4533     } elsif ($self->{nc} == 0x0027) { # '
4534     $self->{ca}->{value} = '';
4535     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4536     !!!next-input-character;
4537     redo A;
4538     } elsif ($self->{nc} == 0x003E) { # >
4539     !!!parse-error (type => 'no attr default'); ## TODO: type
4540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4541     !!!next-input-character;
4542     !!!emit ($self->{ct}); # ATTLIST
4543     redo A;
4544     } elsif ($self->{nc} == -1) {
4545     !!!parse-error (type => 'unclosed md'); ## TODO: type
4546     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4547     !!!next-input-character;
4548     !!!emit ($self->{ct});
4549     redo A;
4550     } else {
4551     !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4552     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4553     ## Reconsume.
4554     redo A;
4555     }
4556     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4557     if ($is_space->{$self->{nc}}) {
4558     ## XML5: No parse error.
4559     !!!parse-error (type => 'no default type'); ## TODO: type
4560 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
4561 wakaba 1.14 ## Reconsume.
4562     redo A;
4563 wakaba 1.15 } elsif ($self->{nc} == 0x0022) { # "
4564     ## XML5: Same as "anything else".
4565     $self->{ca}->{value} = '';
4566     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4567     !!!next-input-character;
4568     redo A;
4569     } elsif ($self->{nc} == 0x0027) { # '
4570     ## XML5: Same as "anything else".
4571     $self->{ca}->{value} = '';
4572     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4573     !!!next-input-character;
4574     redo A;
4575     } elsif ($self->{nc} == 0x003E) { # >
4576     ## XML5: Same as "anything else".
4577     !!!parse-error (type => 'no attr default'); ## TODO: type
4578     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4579     !!!next-input-character;
4580     !!!emit ($self->{ct}); # ATTLIST
4581     redo A;
4582     } elsif ($self->{nc} == -1) {
4583     ## XML5: No parse error.
4584     !!!parse-error (type => 'unclosed md'); ## TODO: type
4585     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4586     !!!next-input-character;
4587     !!!emit ($self->{ct});
4588     redo A;
4589     } else {
4590     $self->{ca}->{default} = chr $self->{nc};
4591     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4592     !!!next-input-character;
4593     redo A;
4594 wakaba 1.14 }
4595 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4596     if ($is_space->{$self->{nc}}) {
4597     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4598     !!!next-input-character;
4599     redo A;
4600     } elsif ($self->{nc} == 0x0022) { # "
4601     ## XML5: Same as "anything else".
4602     !!!parse-error (type => 'no space before default value'); ## TODO: type
4603     $self->{ca}->{value} = '';
4604     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4605     !!!next-input-character;
4606     redo A;
4607     } elsif ($self->{nc} == 0x0027) { # '
4608     ## XML5: Same as "anything else".
4609     !!!parse-error (type => 'no space before default value'); ## TODO: type
4610     $self->{ca}->{value} = '';
4611     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4612     !!!next-input-character;
4613     redo A;
4614     } elsif ($self->{nc} == 0x003E) { # >
4615     ## XML5: Same as "anything else".
4616     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4617     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4618     !!!next-input-character;
4619     !!!emit ($self->{ct}); # ATTLIST
4620     redo A;
4621     } elsif ($self->{nc} == -1) {
4622     ## XML5: No parse error.
4623     !!!parse-error (type => 'unclosed md'); ## TODO: type
4624     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4625     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4626     !!!next-input-character;
4627     !!!emit ($self->{ct});
4628     redo A;
4629     } else {
4630     $self->{ca}->{default} .= chr $self->{nc};
4631     ## Stay in the state.
4632     !!!next-input-character;
4633     redo A;
4634     }
4635     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4636     if ($is_space->{$self->{nc}}) {
4637     ## Stay in the state.
4638     !!!next-input-character;
4639     redo A;
4640     } elsif ($self->{nc} == 0x0022) { # "
4641     $self->{ca}->{value} = '';
4642     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4643     !!!next-input-character;
4644     redo A;
4645     } elsif ($self->{nc} == 0x0027) { # '
4646     $self->{ca}->{value} = '';
4647     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4648     !!!next-input-character;
4649     redo A;
4650     } elsif ($self->{nc} == 0x003E) { # >
4651     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4652     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653     !!!next-input-character;
4654     !!!emit ($self->{ct}); # ATTLIST
4655     redo A;
4656     } elsif ($self->{nc} == -1) {
4657     ## XML5: No parse error.
4658     !!!parse-error (type => 'unclosed md'); ## TODO: type
4659     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4660     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4661     !!!next-input-character;
4662     !!!emit ($self->{ct});
4663     redo A;
4664     } else {
4665     ## XML5: Not defined yet.
4666     if ($self->{ca}->{default} eq 'FIXED') {
4667     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4668     } else {
4669     push @{$self->{ct}->{attrdefs}}, $self->{ca};
4670     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4671     }
4672     ## Reconsume.
4673     redo A;
4674     }
4675     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4676     if ($is_space->{$self->{nc}} or
4677     $self->{nc} == -1 or
4678     $self->{nc} == 0x003E) { # >
4679     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4680     ## Reconsume.
4681     redo A;
4682     } else {
4683     !!!parse-error (type => 'no space before attr name'); ## TODO: type
4684     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4685     ## Reconsume.
4686     redo A;
4687 wakaba 1.16 }
4688 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
4689     ## ASCII case-insensitive
4690     if ($self->{nc} == [
4691     undef,
4692     0x0044, # D
4693     0x0041, # A
4694     0x0054, # T
4695     ]->[length $self->{kwd}] or
4696     $self->{nc} == [
4697     undef,
4698     0x0064, # d
4699     0x0061, # a
4700     0x0074, # t
4701     ]->[length $self->{kwd}]) {
4702     !!!cp (172.2);
4703     ## Stay in the state.
4704     $self->{kwd} .= chr $self->{nc};
4705     !!!next-input-character;
4706     redo A;
4707     } elsif ((length $self->{kwd}) == 4 and
4708     ($self->{nc} == 0x0041 or # A
4709     $self->{nc} == 0x0061)) { # a
4710     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4711     !!!cp (172.3);
4712     !!!parse-error (type => 'lowercase keyword', ## TODO: type
4713     text => 'NDATA',
4714     line => $self->{line_prev},
4715     column => $self->{column_prev} - 4);
4716     } else {
4717     !!!cp (172.4);
4718     }
4719     $self->{state} = AFTER_NDATA_STATE;
4720     !!!next-input-character;
4721     redo A;
4722     } else {
4723     !!!parse-error (type => 'string after literal', ## TODO: type
4724     line => $self->{line_prev},
4725     column => $self->{column_prev} + 1
4726     - length $self->{kwd});
4727     !!!cp (172.5);
4728     $self->{state} = BOGUS_MD_STATE;
4729     ## Reconsume.
4730     redo A;
4731     }
4732     } elsif ($self->{state} == AFTER_NDATA_STATE) {
4733     if ($is_space->{$self->{nc}}) {
4734     $self->{state} = BEFORE_NOTATION_NAME_STATE;
4735     !!!next-input-character;
4736     redo A;
4737     } elsif ($self->{nc} == 0x003E) { # >
4738     !!!parse-error (type => 'no notation name'); ## TODO: type
4739     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4740     !!!next-input-character;
4741     !!!emit ($self->{ct}); # ENTITY
4742     redo A;
4743     } elsif ($self->{nc} == -1) {
4744     !!!parse-error (type => 'unclosed md'); ## TODO: type
4745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4746     !!!next-input-character;
4747     !!!emit ($self->{ct}); # ENTITY
4748     redo A;
4749     } else {
4750     !!!parse-error (type => 'string after literal', ## TODO: type
4751     line => $self->{line_prev},
4752     column => $self->{column_prev} + 1
4753     - length $self->{kwd});
4754     $self->{state} = BOGUS_MD_STATE;
4755     ## Reconsume.
4756     redo A;
4757     }
4758     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4759     if ($is_space->{$self->{nc}}) {
4760     ## Stay in the state.
4761     !!!next-input-character;
4762     redo A;
4763     } elsif ($self->{nc} == 0x003E) { # >
4764     !!!parse-error (type => 'no notation name'); ## TODO: type
4765     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4766     !!!next-input-character;
4767     !!!emit ($self->{ct}); # ENTITY
4768     redo A;
4769     } elsif ($self->{nc} == -1) {
4770     !!!parse-error (type => 'unclosed md'); ## TODO: type
4771     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4772     !!!next-input-character;
4773     !!!emit ($self->{ct}); # ENTITY
4774     redo A;
4775     } else {
4776     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4777     $self->{state} = NOTATION_NAME_STATE;
4778     !!!next-input-character;
4779     redo A;
4780     }
4781     } elsif ($self->{state} == NOTATION_NAME_STATE) {
4782     if ($is_space->{$self->{nc}}) {
4783 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4784 wakaba 1.18 !!!next-input-character;
4785     redo A;
4786     } elsif ($self->{nc} == 0x003E) { # >
4787     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4788     !!!next-input-character;
4789     !!!emit ($self->{ct}); # ENTITY
4790     redo A;
4791     } elsif ($self->{nc} == -1) {
4792     !!!parse-error (type => 'unclosed md'); ## TODO: type
4793     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794     !!!next-input-character;
4795     !!!emit ($self->{ct}); # ENTITY
4796     redo A;
4797     } else {
4798     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4799     ## Stay in the state.
4800     !!!next-input-character;
4801     redo A;
4802     }
4803 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4804     if ($self->{nc} == 0x0022) { # "
4805 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4806 wakaba 1.19 !!!next-input-character;
4807     redo A;
4808     } elsif ($self->{nc} == 0x0026) { # &
4809     $self->{prev_state} = $self->{state};
4810     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4811     $self->{entity_add} = 0x0022; # "
4812     !!!next-input-character;
4813     redo A;
4814     ## TODO: %
4815     } elsif ($self->{nc} == -1) {
4816     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4817     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4818     ## Reconsume.
4819     !!!emit ($self->{ct}); # ENTITY
4820     redo A;
4821     } else {
4822     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4823     !!!next-input-character;
4824     redo A;
4825     }
4826     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4827     if ($self->{nc} == 0x0027) { # '
4828 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
4829 wakaba 1.19 !!!next-input-character;
4830     redo A;
4831     } elsif ($self->{nc} == 0x0026) { # &
4832     $self->{prev_state} = $self->{state};
4833     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4834     $self->{entity_add} = 0x0027; # '
4835     !!!next-input-character;
4836     redo A;
4837     ## TODO: %
4838     } elsif ($self->{nc} == -1) {
4839     !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4840     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4841     ## Reconsume.
4842     !!!emit ($self->{ct}); # ENTITY
4843     redo A;
4844     } else {
4845     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4846     !!!next-input-character;
4847     redo A;
4848     }
4849     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4850     if ($is_space->{$self->{nc}} or
4851     {
4852     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4853     $self->{entity_add} => 1,
4854     }->{$self->{nc}}) {
4855 wakaba 1.22 !!!parse-error (type => 'bare ero',
4856     line => $self->{line_prev},
4857     column => $self->{column_prev}
4858     + ($self->{nc} == -1 ? 1 : 0));
4859 wakaba 1.19 ## Don't consume
4860     ## Return nothing.
4861     #
4862     } elsif ($self->{nc} == 0x0023) { # #
4863     $self->{ca} = $self->{ct};
4864     $self->{state} = ENTITY_HASH_STATE;
4865     $self->{kwd} = '#';
4866     !!!next-input-character;
4867     redo A;
4868     } else {
4869     #
4870     }
4871    
4872     $self->{ct}->{value} .= '&';
4873     $self->{state} = $self->{prev_state};
4874     ## Reconsume.
4875     redo A;
4876 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4877     if ($is_space->{$self->{nc}}) {
4878     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4879     !!!next-input-character;
4880     redo A;
4881     } elsif ($self->{nc} == 0x0028) { # (
4882     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4883     $self->{ct}->{content} = ['('];
4884     $self->{group_depth} = 1;
4885     !!!next-input-character;
4886     redo A;
4887     } elsif ($self->{nc} == 0x003E) { # >
4888     !!!parse-error (type => 'no md def'); ## TODO: type
4889     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4890     !!!next-input-character;
4891     !!!emit ($self->{ct}); # ELEMENT
4892     redo A;
4893     } elsif ($self->{nc} == -1) {
4894     !!!parse-error (type => 'unclosed md'); ## TODO: type
4895     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4896     !!!next-input-character;
4897     !!!emit ($self->{ct}); # ELEMENT
4898     redo A;
4899     } else {
4900     $self->{ct}->{content} = [chr $self->{nc}];
4901     $self->{state} = CONTENT_KEYWORD_STATE;
4902     !!!next-input-character;
4903     redo A;
4904     }
4905     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4906     if ($is_space->{$self->{nc}}) {
4907     $self->{state} = AFTER_MD_DEF_STATE;
4908     !!!next-input-character;
4909     redo A;
4910     } elsif ($self->{nc} == 0x003E) { # >
4911     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4912     !!!next-input-character;
4913     !!!emit ($self->{ct}); # ELEMENT
4914     redo A;
4915     } elsif ($self->{nc} == -1) {
4916     !!!parse-error (type => 'unclosed md'); ## TODO: type
4917     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4918     !!!next-input-character;
4919     !!!emit ($self->{ct}); # ELEMENT
4920     redo A;
4921     } else {
4922     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4923     ## Stay in the state.
4924     !!!next-input-character;
4925     redo A;
4926     }
4927     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4928     if ($is_space->{$self->{nc}}) {
4929     ## Stay in the state.
4930     !!!next-input-character;
4931     redo A;
4932     } elsif ($self->{nc} == 0x0028) { # (
4933     $self->{group_depth}++;
4934     push @{$self->{ct}->{content}}, chr $self->{nc};
4935     ## Stay in the state.
4936     !!!next-input-character;
4937     redo A;
4938     } elsif ($self->{nc} == 0x007C or # |
4939     $self->{nc} == 0x002C) { # ,
4940     !!!parse-error (type => 'empty element name'); ## TODO: type
4941     ## Stay in the state.
4942     !!!next-input-character;
4943     redo A;
4944     } elsif ($self->{nc} == 0x0029) { # )
4945     !!!parse-error (type => 'empty element name'); ## TODO: type
4946     push @{$self->{ct}->{content}}, chr $self->{nc};
4947     $self->{group_depth}--;
4948     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4949     !!!next-input-character;
4950     redo A;
4951     } elsif ($self->{nc} == 0x003E) { # >
4952     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4953     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4955     !!!next-input-character;
4956     !!!emit ($self->{ct}); # ELEMENT
4957     redo A;
4958     } elsif ($self->{nc} == -1) {
4959     !!!parse-error (type => 'unclosed md'); ## TODO: type
4960     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4961     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4962     !!!next-input-character;
4963     !!!emit ($self->{ct}); # ELEMENT
4964     redo A;
4965     } else {
4966     push @{$self->{ct}->{content}}, chr $self->{nc};
4967     $self->{state} = CM_ELEMENT_NAME_STATE;
4968     !!!next-input-character;
4969     redo A;
4970     }
4971     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4972     if ($is_space->{$self->{nc}}) {
4973     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4974     !!!next-input-character;
4975     redo A;
4976     } elsif ($self->{nc} == 0x002A or # *
4977     $self->{nc} == 0x002B or # +
4978     $self->{nc} == 0x003F) { # ?
4979     push @{$self->{ct}->{content}}, chr $self->{nc};
4980     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4981     !!!next-input-character;
4982     redo A;
4983     } elsif ($self->{nc} == 0x007C or # |
4984     $self->{nc} == 0x002C) { # ,
4985     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4986     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4987     !!!next-input-character;
4988     redo A;
4989     } elsif ($self->{nc} == 0x0029) { # )
4990     $self->{group_depth}--;
4991     push @{$self->{ct}->{content}}, chr $self->{nc};
4992     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4993     !!!next-input-character;
4994     redo A;
4995     } elsif ($self->{nc} == 0x003E) { # >
4996     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4997     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4999     !!!next-input-character;
5000     !!!emit ($self->{ct}); # ELEMENT
5001     redo A;
5002     } elsif ($self->{nc} == -1) {
5003     !!!parse-error (type => 'unclosed md'); ## TODO: type
5004     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5005     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5006     !!!next-input-character;
5007     !!!emit ($self->{ct}); # ELEMENT
5008     redo A;
5009     } else {
5010     $self->{ct}->{content}->[-1] .= chr $self->{nc};
5011     ## Stay in the state.
5012     !!!next-input-character;
5013     redo A;
5014     }
5015     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5016     if ($is_space->{$self->{nc}}) {
5017     ## Stay in the state.
5018     !!!next-input-character;
5019     redo A;
5020     } elsif ($self->{nc} == 0x007C or # |
5021     $self->{nc} == 0x002C) { # ,
5022     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5023     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5024     !!!next-input-character;
5025     redo A;
5026     } elsif ($self->{nc} == 0x0029) { # )
5027     $self->{group_depth}--;
5028     push @{$self->{ct}->{content}}, chr $self->{nc};
5029     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5030     !!!next-input-character;
5031     redo A;
5032     } elsif ($self->{nc} == 0x003E) { # >
5033     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5034     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5035     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5036     !!!next-input-character;
5037     !!!emit ($self->{ct}); # ELEMENT
5038     redo A;
5039     } elsif ($self->{nc} == -1) {
5040     !!!parse-error (type => 'unclosed md'); ## TODO: type
5041     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5042     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5043     !!!next-input-character;
5044     !!!emit ($self->{ct}); # ELEMENT
5045     redo A;
5046     } else {
5047     !!!parse-error (type => 'after element name'); ## TODO: type
5048     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5049     $self->{state} = BOGUS_MD_STATE;
5050     !!!next-input-character;
5051     redo A;
5052     }
5053     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5054     if ($is_space->{$self->{nc}}) {
5055     if ($self->{group_depth}) {
5056     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5057     } else {
5058     $self->{state} = AFTER_MD_DEF_STATE;
5059     }
5060     !!!next-input-character;
5061     redo A;
5062     } elsif ($self->{nc} == 0x002A or # *
5063     $self->{nc} == 0x002B or # +
5064     $self->{nc} == 0x003F) { # ?
5065     push @{$self->{ct}->{content}}, chr $self->{nc};
5066     if ($self->{group_depth}) {
5067     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5068     } else {
5069     $self->{state} = AFTER_MD_DEF_STATE;
5070     }
5071     !!!next-input-character;
5072     redo A;
5073     } elsif ($self->{nc} == 0x0029) { # )
5074     if ($self->{group_depth}) {
5075     $self->{group_depth}--;
5076     push @{$self->{ct}->{content}}, chr $self->{nc};
5077     ## Stay in the state.
5078     !!!next-input-character;
5079     redo A;
5080     } else {
5081     !!!parse-error (type => 'string after md def'); ## TODO: type
5082     $self->{state} = BOGUS_MD_STATE;
5083     ## Reconsume.
5084     redo A;
5085     }
5086     } elsif ($self->{nc} == 0x003E) { # >
5087     if ($self->{group_depth}) {
5088     !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5089     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5090     }
5091     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5092     !!!next-input-character;
5093     !!!emit ($self->{ct}); # ELEMENT
5094     redo A;
5095     } elsif ($self->{nc} == -1) {
5096     !!!parse-error (type => 'unclosed md'); ## TODO: type
5097     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5099     !!!next-input-character;
5100     !!!emit ($self->{ct}); # ELEMENT
5101     redo A;
5102     } else {
5103     if ($self->{group_depth}) {
5104     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5105     } else {
5106     !!!parse-error (type => 'string after md def'); ## TODO: type
5107     $self->{state} = BOGUS_MD_STATE;
5108     }
5109     ## Reconsume.
5110     redo A;
5111     }
5112     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5113 wakaba 1.18 if ($is_space->{$self->{nc}}) {
5114     ## Stay in the state.
5115     !!!next-input-character;
5116     redo A;
5117     } elsif ($self->{nc} == 0x003E) { # >
5118     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5119     !!!next-input-character;
5120 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5121 wakaba 1.18 redo A;
5122     } elsif ($self->{nc} == -1) {
5123     !!!parse-error (type => 'unclosed md'); ## TODO: type
5124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5125     !!!next-input-character;
5126 wakaba 1.20 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5127 wakaba 1.18 redo A;
5128     } else {
5129 wakaba 1.20 !!!parse-error (type => 'string after md def'); ## TODO: type
5130 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
5131     ## Reconsume.
5132     redo A;
5133     }
5134 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
5135     if ($self->{nc} == 0x003E) { # >
5136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5137     !!!next-input-character;
5138     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5139     redo A;
5140     } elsif ($self->{nc} == -1) {
5141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5142     ## Reconsume.
5143     !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5144     redo A;
5145     } else {
5146     ## Stay in the state.
5147     !!!next-input-character;
5148     redo A;
5149     }
5150 wakaba 1.1 } else {
5151     die "$0: $self->{state}: Unknown state";
5152     }
5153     } # A
5154    
5155     die "$0: _get_next_token: unexpected case";
5156     } # _get_next_token
5157    
5158     1;
5159 wakaba 1.30 ## $Date: 2009/08/16 04:06:34 $
5160 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24