/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download) (as text)
Fri Oct 17 07:14:29 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.13: +380 -13 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	17 Oct 2008 07:14:01 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/attlists-1.dat" added.

++ whatpm/t/xml/ChangeLog	17 Oct 2008 07:14:24 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat: New test data file.

	* doctypes-2.dat: New tests added.

++ whatpm/Whatpm/ChangeLog	17 Oct 2008 07:11:25 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (node_name): New attribute.
	(ELEMENT_TYPE_DEFINITION_NODE, ATTRIBUTE_DEFINITION_NODE): New
	constants.
	(create_element_type_definition_node, create_attribute_definition,
	create_notation, create_general_entity,
	get_element_type_definition_node,
	set_element_type_definition_node, get_general_entity_node,
	set_general_entity_node, get_notation_node, set_notation_node,
	get_attribute_definition_node, set_attribute_definition_node): New
	methods.
	(element_types, entities, notations, attribute_definitions): New
	attributes.
	(DocumentType): Support for child nodes, entities, notations, and
	element types.
	(Entity, Notation, ElementTypeDefinition, AttributeDefinition):
	New classes.

	* Dumper.pm: Support for general entities, notations, element type
	definitions, and attribute definitions.

++ whatpm/Whatpm/HTML/ChangeLog	17 Oct 2008 07:12:26 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token types ATTLIST_TOKEN, ELEMENT_TOKEN,
	GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and NOTATION_TOKEN
	are added.  New insertion modes for markup declarations are added.

++ whatpm/Whatpm/XML/ChangeLog	17 Oct 2008 07:13:47 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): Support for ELEMENT_TOKEN,
	ATTLIST_TOKEN, GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and
	NOTATION_TOKEN.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.14 our $VERSION=do{my @r=(q$Revision: 1.13 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.8
168 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
169     ## list and descriptions)
170    
171     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
172     sub FOREIGN_EL () { 0b1_00000000000 }
173    
174     ## Character reference mappings
175    
176     my $charref_map = {
177     0x0D => 0x000A,
178     0x80 => 0x20AC,
179     0x81 => 0xFFFD,
180     0x82 => 0x201A,
181     0x83 => 0x0192,
182     0x84 => 0x201E,
183     0x85 => 0x2026,
184     0x86 => 0x2020,
185     0x87 => 0x2021,
186     0x88 => 0x02C6,
187     0x89 => 0x2030,
188     0x8A => 0x0160,
189     0x8B => 0x2039,
190     0x8C => 0x0152,
191     0x8D => 0xFFFD,
192     0x8E => 0x017D,
193     0x8F => 0xFFFD,
194     0x90 => 0xFFFD,
195     0x91 => 0x2018,
196     0x92 => 0x2019,
197     0x93 => 0x201C,
198     0x94 => 0x201D,
199     0x95 => 0x2022,
200     0x96 => 0x2013,
201     0x97 => 0x2014,
202     0x98 => 0x02DC,
203     0x99 => 0x2122,
204     0x9A => 0x0161,
205     0x9B => 0x203A,
206     0x9C => 0x0153,
207     0x9D => 0xFFFD,
208     0x9E => 0x017E,
209     0x9F => 0x0178,
210     }; # $charref_map
211     $charref_map->{$_} = 0xFFFD
212     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
213     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
214     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
215     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
216     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
217     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
218     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
219    
220     ## Implementations MUST act as if they used the state machine in the spec.
221    
222     sub _initialize_tokenizer ($) {
        ## Puts the tokenizer-related fields of |$self| into their initial
        ## values: data state, PCDATA content model, no current token or
        ## attribute, empty character buffer, and an empty pending-token
        ## queue.  Takes the object as its only argument.
223     my $self = shift;
224    
225     ## NOTE: Fields set by |new| constructor:
226     #$self->{level}
227     #$self->{set_nc}
228     #$self->{parse_error}
229 wakaba 1.3 #$self->{is_xml} (if XML)
230 wakaba 1.1
231     $self->{state} = DATA_STATE; # MUST
232 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
233     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
234 wakaba 1.1 #$self->{entity__value}; # initialized when used
235     #$self->{entity__match}; # initialized when used
236     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
237     undef $self->{ct}; # current token
238     undef $self->{ca}; # current attribute
239     undef $self->{last_stag_name}; # last emitted start tag name
240     #$self->{prev_state}; # initialized when used
241     delete $self->{self_closing}; # the "self-closing flag" starts out unset
242     $self->{char_buffer} = '';
243     $self->{char_buffer_pos} = 0;
244     $self->{nc} = -1; # next input character (-1 is what the states test for end of input)
245     #$self->{next_nc}
        ## NOTE(review): The statement below is a macro of this |.src|
        ## template; presumably it is expanded at build time into code that
        ## loads the first input character into |$self->{nc}| -- confirm
        ## against the preprocessor.
246     !!!next-input-character;
247     $self->{token} = []; # queue of already-built tokens; |_get_next_token| returns these first
248     # $self->{escape}
249     } # _initialize_tokenizer
250    
251     ## A token has:
252     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
253 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
254 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
255     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
256 wakaba 1.11 ## ->{target} (PI_TOKEN)
257 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
258     ## ->{sysid} (DOCTYPE_TOKEN)
259     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
260     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
261     ## ->{name}
262     ## ->{value}
263     ## ->{has_reference} == 1 or 0
264 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
265     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
266 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
267 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
268 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
269    
270 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
271     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
272     ## while the token is pushed back to the stack.
273    
274     ## Emitted token MUST immediately be handled by the tree construction state.
275    
276     ## Before each step, UA MAY check to see if either one of the scripts in
277     ## "list of scripts that will execute as soon as possible" or the first
278     ## script in the "list of scripts that will execute asynchronously",
279     ## has completed loading. If one has, then it MUST be executed
280     ## and removed from the list.
281    
282     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
283     ## (This requirement was dropped from HTML5 spec, unfortunately.)
284    
285     my $is_space = {
        ## Lookup table of the code points treated as space characters.
        ## Keys are numeric code points, so it is consulted with the numeric
        ## next input character, e.g. |$is_space->{$self->{nc}}|.  The VT and
        ## CR entries are commented out, so lookups for them yield false.
286     0x0009 => 1, # CHARACTER TABULATION (HT)
287     0x000A => 1, # LINE FEED (LF)
288     #0x000B => 0, # LINE TABULATION (VT)
289 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
290 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
291     0x0020 => 1, # SPACE (SP)
292     };
293    
294     sub _get_next_token ($) {
295     my $self = shift;
296    
297     if ($self->{self_closing}) {
298     !!!parse-error (type => 'nestc', token => $self->{ct});
299     ## NOTE: The |self_closing| flag is only set by start tag token.
300     ## In addition, when a start tag token is emitted, it is always set to
301     ## |ct|.
302     delete $self->{self_closing};
303     }
304    
305     if (@{$self->{token}}) {
306     $self->{self_closing} = $self->{token}->[0]->{self_closing};
307     return shift @{$self->{token}};
308     }
309    
310     A: {
311     if ($self->{state} == PCDATA_STATE) {
312     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
313    
314     if ($self->{nc} == 0x0026) { # &
315     !!!cp (0.1);
316     ## NOTE: In the spec, the tokenizer is switched to the
317     ## "entity data state". In this implementation, the tokenizer
318     ## is switched to the |ENTITY_STATE|, which is an implementation
319     ## of the "consume a character reference" algorithm.
320     $self->{entity_add} = -1;
321     $self->{prev_state} = DATA_STATE;
322     $self->{state} = ENTITY_STATE;
323     !!!next-input-character;
324     redo A;
325     } elsif ($self->{nc} == 0x003C) { # <
326     !!!cp (0.2);
327     $self->{state} = TAG_OPEN_STATE;
328     !!!next-input-character;
329     redo A;
330     } elsif ($self->{nc} == -1) {
331     !!!cp (0.3);
332     !!!emit ({type => END_OF_FILE_TOKEN,
333     line => $self->{line}, column => $self->{column}});
334     last A; ## TODO: ok?
335     } else {
336     !!!cp (0.4);
337     #
338     }
339    
340     # Anything else
341     my $token = {type => CHARACTER_TOKEN,
342     data => chr $self->{nc},
343     line => $self->{line}, column => $self->{column},
344     };
345     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
346    
347     ## Stay in the state.
348     !!!next-input-character;
349     !!!emit ($token);
350     redo A;
351     } elsif ($self->{state} == DATA_STATE) {
352     $self->{s_kwd} = '' unless defined $self->{s_kwd};
353     if ($self->{nc} == 0x0026) { # &
354     $self->{s_kwd} = '';
355     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
356     not $self->{escape}) {
357     !!!cp (1);
358     ## NOTE: In the spec, the tokenizer is switched to the
359     ## "entity data state". In this implementation, the tokenizer
360     ## is switched to the |ENTITY_STATE|, which is an implementation
361     ## of the "consume a character reference" algorithm.
362     $self->{entity_add} = -1;
363     $self->{prev_state} = DATA_STATE;
364     $self->{state} = ENTITY_STATE;
365     !!!next-input-character;
366     redo A;
367     } else {
368     !!!cp (2);
369     #
370     }
371     } elsif ($self->{nc} == 0x002D) { # -
372     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
373 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
374 wakaba 1.1 !!!cp (3);
375     $self->{escape} = 1; # unless $self->{escape};
376     $self->{s_kwd} = '--';
377     #
378 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
379 wakaba 1.1 !!!cp (4);
380     $self->{s_kwd} = '--';
381     #
382 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
383     !!!cp (4.1);
384     $self->{s_kwd} .= '-';
385     #
386 wakaba 1.1 } else {
387     !!!cp (5);
388 wakaba 1.5 $self->{s_kwd} = '-';
389 wakaba 1.1 #
390     }
391     }
392    
393     #
394     } elsif ($self->{nc} == 0x0021) { # !
395     if (length $self->{s_kwd}) {
396     !!!cp (5.1);
397     $self->{s_kwd} .= '!';
398     #
399     } else {
400     !!!cp (5.2);
401     #$self->{s_kwd} = '';
402     #
403     }
404     #
405     } elsif ($self->{nc} == 0x003C) { # <
406     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
407     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
408     not $self->{escape})) {
409     !!!cp (6);
410     $self->{state} = TAG_OPEN_STATE;
411     !!!next-input-character;
412     redo A;
413     } else {
414     !!!cp (7);
415     $self->{s_kwd} = '';
416     #
417     }
418     } elsif ($self->{nc} == 0x003E) { # >
419     if ($self->{escape} and
420     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
421     if ($self->{s_kwd} eq '--') {
422     !!!cp (8);
423     delete $self->{escape};
424 wakaba 1.5 #
425 wakaba 1.1 } else {
426     !!!cp (9);
427 wakaba 1.5 #
428 wakaba 1.1 }
429 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
430     !!!cp (9.1);
431     !!!parse-error (type => 'unmatched mse', ## TODO: type
432     line => $self->{line_prev},
433     column => $self->{column_prev} - 1);
434     #
435 wakaba 1.1 } else {
436     !!!cp (10);
437 wakaba 1.5 #
438 wakaba 1.1 }
439    
440     $self->{s_kwd} = '';
441     #
442 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
443     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
444     !!!cp (10.1);
445     $self->{s_kwd} .= ']';
446     } elsif ($self->{s_kwd} eq ']]') {
447     !!!cp (10.2);
448     #
449     } else {
450     !!!cp (10.3);
451     $self->{s_kwd} = '';
452     }
453     #
454 wakaba 1.1 } elsif ($self->{nc} == -1) {
455     !!!cp (11);
456     $self->{s_kwd} = '';
457     !!!emit ({type => END_OF_FILE_TOKEN,
458     line => $self->{line}, column => $self->{column}});
459     last A; ## TODO: ok?
460     } else {
461     !!!cp (12);
462     $self->{s_kwd} = '';
463     #
464     }
465    
466     # Anything else
467     my $token = {type => CHARACTER_TOKEN,
468     data => chr $self->{nc},
469     line => $self->{line}, column => $self->{column},
470     };
471 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
472 wakaba 1.1 length $token->{data})) {
473     $self->{s_kwd} = '';
474     }
475    
476     ## Stay in the data state.
477 wakaba 1.5 if (not $self->{is_xml} and
478     $self->{content_model} == PCDATA_CONTENT_MODEL) {
479 wakaba 1.1 !!!cp (13);
480     $self->{state} = PCDATA_STATE;
481     } else {
482     !!!cp (14);
483     ## Stay in the state.
484     }
485     !!!next-input-character;
486     !!!emit ($token);
487     redo A;
488     } elsif ($self->{state} == TAG_OPEN_STATE) {
489 wakaba 1.10 ## XML5: "tag state".
490    
491 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
492     if ($self->{nc} == 0x002F) { # /
493     !!!cp (15);
494     !!!next-input-character;
495     $self->{state} = CLOSE_TAG_OPEN_STATE;
496     redo A;
497     } elsif ($self->{nc} == 0x0021) { # !
498     !!!cp (15.1);
499 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
500 wakaba 1.1 #
501     } else {
502     !!!cp (16);
503 wakaba 1.12 $self->{s_kwd} = '';
504 wakaba 1.1 #
505     }
506    
507     ## reconsume
508     $self->{state} = DATA_STATE;
509     !!!emit ({type => CHARACTER_TOKEN, data => '<',
510     line => $self->{line_prev},
511     column => $self->{column_prev},
512     });
513     redo A;
514     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
515     if ($self->{nc} == 0x0021) { # !
516     !!!cp (17);
517     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
518     !!!next-input-character;
519     redo A;
520     } elsif ($self->{nc} == 0x002F) { # /
521     !!!cp (18);
522     $self->{state} = CLOSE_TAG_OPEN_STATE;
523     !!!next-input-character;
524     redo A;
525     } elsif (0x0041 <= $self->{nc} and
526     $self->{nc} <= 0x005A) { # A..Z
527     !!!cp (19);
528     $self->{ct}
529     = {type => START_TAG_TOKEN,
530 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
531 wakaba 1.1 line => $self->{line_prev},
532     column => $self->{column_prev}};
533     $self->{state} = TAG_NAME_STATE;
534     !!!next-input-character;
535     redo A;
536     } elsif (0x0061 <= $self->{nc} and
537     $self->{nc} <= 0x007A) { # a..z
538     !!!cp (20);
539     $self->{ct} = {type => START_TAG_TOKEN,
540     tag_name => chr ($self->{nc}),
541     line => $self->{line_prev},
542     column => $self->{column_prev}};
543     $self->{state} = TAG_NAME_STATE;
544     !!!next-input-character;
545     redo A;
546     } elsif ($self->{nc} == 0x003E) { # >
547     !!!cp (21);
548     !!!parse-error (type => 'empty start tag',
549     line => $self->{line_prev},
550     column => $self->{column_prev});
551     $self->{state} = DATA_STATE;
552 wakaba 1.5 $self->{s_kwd} = '';
553 wakaba 1.1 !!!next-input-character;
554    
555     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
556     line => $self->{line_prev},
557     column => $self->{column_prev},
558     });
559    
560     redo A;
561     } elsif ($self->{nc} == 0x003F) { # ?
562 wakaba 1.8 if ($self->{is_xml}) {
563     !!!cp (22.1);
564     $self->{state} = PI_STATE;
565     !!!next-input-character;
566     redo A;
567     } else {
568     !!!cp (22);
569     !!!parse-error (type => 'pio',
570     line => $self->{line_prev},
571     column => $self->{column_prev});
572     $self->{state} = BOGUS_COMMENT_STATE;
573     $self->{ct} = {type => COMMENT_TOKEN, data => '',
574     line => $self->{line_prev},
575     column => $self->{column_prev},
576     };
577     ## $self->{nc} is intentionally left as is
578     redo A;
579     }
580 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
581 wakaba 1.1 !!!cp (23);
582     !!!parse-error (type => 'bare stago',
583     line => $self->{line_prev},
584     column => $self->{column_prev});
585     $self->{state} = DATA_STATE;
586 wakaba 1.5 $self->{s_kwd} = '';
587 wakaba 1.1 ## reconsume
588    
589     !!!emit ({type => CHARACTER_TOKEN, data => '<',
590     line => $self->{line_prev},
591     column => $self->{column_prev},
592     });
593    
594     redo A;
595 wakaba 1.9 } else {
596     ## XML5: "<:" is a parse error.
597     !!!cp (23.1);
598     $self->{ct} = {type => START_TAG_TOKEN,
599     tag_name => chr ($self->{nc}),
600     line => $self->{line_prev},
601     column => $self->{column_prev}};
602     $self->{state} = TAG_NAME_STATE;
603     !!!next-input-character;
604     redo A;
605 wakaba 1.1 }
606     } else {
607     die "$0: $self->{content_model} in tag open";
608     }
609     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
610     ## NOTE: The "close tag open state" in the spec is implemented as
611     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
612    
613 wakaba 1.10 ## XML5: "end tag state".
614    
615 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
616     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
617     if (defined $self->{last_stag_name}) {
618     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
619 wakaba 1.12 $self->{kwd} = '';
620 wakaba 1.1 ## Reconsume.
621     redo A;
622     } else {
623     ## No start tag token has ever been emitted
624     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
625     !!!cp (28);
626     $self->{state} = DATA_STATE;
627 wakaba 1.5 $self->{s_kwd} = '';
628 wakaba 1.1 ## Reconsume.
629     !!!emit ({type => CHARACTER_TOKEN, data => '</',
630     line => $l, column => $c,
631     });
632     redo A;
633     }
634     }
635    
636     if (0x0041 <= $self->{nc} and
637     $self->{nc} <= 0x005A) { # A..Z
638     !!!cp (29);
639     $self->{ct}
640     = {type => END_TAG_TOKEN,
641 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
642 wakaba 1.1 line => $l, column => $c};
643     $self->{state} = TAG_NAME_STATE;
644     !!!next-input-character;
645     redo A;
646     } elsif (0x0061 <= $self->{nc} and
647     $self->{nc} <= 0x007A) { # a..z
648     !!!cp (30);
649     $self->{ct} = {type => END_TAG_TOKEN,
650     tag_name => chr ($self->{nc}),
651     line => $l, column => $c};
652     $self->{state} = TAG_NAME_STATE;
653     !!!next-input-character;
654     redo A;
655     } elsif ($self->{nc} == 0x003E) { # >
656     !!!parse-error (type => 'empty end tag',
657     line => $self->{line_prev}, ## "<" in "</>"
658     column => $self->{column_prev} - 1);
659     $self->{state} = DATA_STATE;
660 wakaba 1.5 $self->{s_kwd} = '';
661 wakaba 1.10 if ($self->{is_xml}) {
662     !!!cp (31);
663     ## XML5: No parse error.
664    
665     ## NOTE: This parser raises a parse error, since it supports
666     ## XML1, not XML5.
667    
668     ## NOTE: A short end tag token.
669     my $ct = {type => END_TAG_TOKEN,
670     tag_name => '',
671     line => $self->{line_prev},
672     column => $self->{column_prev} - 1,
673     };
674     !!!next-input-character;
675     !!!emit ($ct);
676     } else {
677     !!!cp (31.1);
678     !!!next-input-character;
679     }
680 wakaba 1.1 redo A;
681     } elsif ($self->{nc} == -1) {
682     !!!cp (32);
683     !!!parse-error (type => 'bare etago');
684 wakaba 1.5 $self->{s_kwd} = '';
685 wakaba 1.1 $self->{state} = DATA_STATE;
686     # reconsume
687    
688     !!!emit ({type => CHARACTER_TOKEN, data => '</',
689     line => $l, column => $c,
690     });
691    
692     redo A;
693 wakaba 1.10 } elsif (not $self->{is_xml} or
694     $is_space->{$self->{nc}}) {
695 wakaba 1.1 !!!cp (33);
696 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
697     line => $self->{line_prev}, # "<" of "</"
698     column => $self->{column_prev} - 1);
699 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
700     $self->{ct} = {type => COMMENT_TOKEN, data => '',
701     line => $self->{line_prev}, # "<" of "</"
702     column => $self->{column_prev} - 1,
703     };
704     ## NOTE: $self->{nc} is intentionally left as is.
705     ## Although the "anything else" case of the spec does not explicitly
706     ## state that the next input character is to be reconsumed,
707     ## it will be included to the |data| of the comment token
708     ## generated from the bogus end tag, as defined in the
709     ## "bogus comment state" entry.
710     redo A;
711 wakaba 1.10 } else {
712     ## XML5: "</:" is a parse error.
713     !!!cp (30.1);
714     $self->{ct} = {type => END_TAG_TOKEN,
715     tag_name => chr ($self->{nc}),
716     line => $l, column => $c};
717     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
718     !!!next-input-character;
719     redo A;
720 wakaba 1.1 }
721     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
722 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
723 wakaba 1.1 if (length $ch) {
724     my $CH = $ch;
725     $ch =~ tr/a-z/A-Z/;
726     my $nch = chr $self->{nc};
727     if ($nch eq $ch or $nch eq $CH) {
728     !!!cp (24);
729     ## Stay in the state.
730 wakaba 1.12 $self->{kwd} .= $nch;
731 wakaba 1.1 !!!next-input-character;
732     redo A;
733     } else {
734     !!!cp (25);
735     $self->{state} = DATA_STATE;
736 wakaba 1.5 $self->{s_kwd} = '';
737 wakaba 1.1 ## Reconsume.
738     !!!emit ({type => CHARACTER_TOKEN,
739 wakaba 1.12 data => '</' . $self->{kwd},
740 wakaba 1.1 line => $self->{line_prev},
741 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
742 wakaba 1.1 });
743     redo A;
744     }
745     } else { # after "<{tag-name}"
746     unless ($is_space->{$self->{nc}} or
747     {
748     0x003E => 1, # >
749     0x002F => 1, # /
750     -1 => 1, # EOF
751     }->{$self->{nc}}) {
752     !!!cp (26);
753     ## Reconsume.
754     $self->{state} = DATA_STATE;
755 wakaba 1.5 $self->{s_kwd} = '';
756 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
757 wakaba 1.12 data => '</' . $self->{kwd},
758 wakaba 1.1 line => $self->{line_prev},
759 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
760 wakaba 1.1 });
761     redo A;
762     } else {
763     !!!cp (27);
764     $self->{ct}
765     = {type => END_TAG_TOKEN,
766     tag_name => $self->{last_stag_name},
767     line => $self->{line_prev},
768 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
769 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
770     ## Reconsume.
771     redo A;
772     }
773     }
774     } elsif ($self->{state} == TAG_NAME_STATE) {
775     if ($is_space->{$self->{nc}}) {
776     !!!cp (34);
777     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
778     !!!next-input-character;
779     redo A;
780     } elsif ($self->{nc} == 0x003E) { # >
781     if ($self->{ct}->{type} == START_TAG_TOKEN) {
782     !!!cp (35);
783     $self->{last_stag_name} = $self->{ct}->{tag_name};
784     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
785     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
786     #if ($self->{ct}->{attributes}) {
787     # ## NOTE: This should never be reached.
788     # !!! cp (36);
789     # !!! parse-error (type => 'end tag attribute');
790     #} else {
791     !!!cp (37);
792     #}
793     } else {
794     die "$0: $self->{ct}->{type}: Unknown token type";
795     }
796     $self->{state} = DATA_STATE;
797 wakaba 1.5 $self->{s_kwd} = '';
798 wakaba 1.1 !!!next-input-character;
799    
800     !!!emit ($self->{ct}); # start tag or end tag
801    
802     redo A;
803     } elsif (0x0041 <= $self->{nc} and
804     $self->{nc} <= 0x005A) { # A..Z
805     !!!cp (38);
806 wakaba 1.4 $self->{ct}->{tag_name}
807     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
808 wakaba 1.1 # start tag or end tag
809     ## Stay in this state
810     !!!next-input-character;
811     redo A;
812     } elsif ($self->{nc} == -1) {
813     !!!parse-error (type => 'unclosed tag');
814     if ($self->{ct}->{type} == START_TAG_TOKEN) {
815     !!!cp (39);
816     $self->{last_stag_name} = $self->{ct}->{tag_name};
817     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
818     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
819     #if ($self->{ct}->{attributes}) {
820     # ## NOTE: This state should never be reached.
821     # !!! cp (40);
822     # !!! parse-error (type => 'end tag attribute');
823     #} else {
824     !!!cp (41);
825     #}
826     } else {
827     die "$0: $self->{ct}->{type}: Unknown token type";
828     }
829     $self->{state} = DATA_STATE;
830 wakaba 1.5 $self->{s_kwd} = '';
831 wakaba 1.1 # reconsume
832    
833     !!!emit ($self->{ct}); # start tag or end tag
834    
835     redo A;
836     } elsif ($self->{nc} == 0x002F) { # /
837     !!!cp (42);
838     $self->{state} = SELF_CLOSING_START_TAG_STATE;
839     !!!next-input-character;
840     redo A;
841     } else {
842     !!!cp (44);
843     $self->{ct}->{tag_name} .= chr $self->{nc};
844     # start tag or end tag
845     ## Stay in the state
846     !!!next-input-character;
847     redo A;
848     }
849     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
850 wakaba 1.11 ## XML5: "Tag attribute name before state".
851    
852 wakaba 1.1 if ($is_space->{$self->{nc}}) {
853     !!!cp (45);
854     ## Stay in the state
855     !!!next-input-character;
856     redo A;
857     } elsif ($self->{nc} == 0x003E) { # >
858     if ($self->{ct}->{type} == START_TAG_TOKEN) {
859     !!!cp (46);
860     $self->{last_stag_name} = $self->{ct}->{tag_name};
861     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
862     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
863     if ($self->{ct}->{attributes}) {
864     !!!cp (47);
865     !!!parse-error (type => 'end tag attribute');
866     } else {
867     !!!cp (48);
868     }
869     } else {
870     die "$0: $self->{ct}->{type}: Unknown token type";
871     }
872     $self->{state} = DATA_STATE;
873 wakaba 1.5 $self->{s_kwd} = '';
874 wakaba 1.1 !!!next-input-character;
875    
876     !!!emit ($self->{ct}); # start tag or end tag
877    
878     redo A;
879     } elsif (0x0041 <= $self->{nc} and
880     $self->{nc} <= 0x005A) { # A..Z
881     !!!cp (49);
882     $self->{ca}
883 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
884 wakaba 1.1 value => '',
885     line => $self->{line}, column => $self->{column}};
886     $self->{state} = ATTRIBUTE_NAME_STATE;
887     !!!next-input-character;
888     redo A;
889     } elsif ($self->{nc} == 0x002F) { # /
890     !!!cp (50);
891     $self->{state} = SELF_CLOSING_START_TAG_STATE;
892     !!!next-input-character;
893     redo A;
894     } elsif ($self->{nc} == -1) {
895     !!!parse-error (type => 'unclosed tag');
896     if ($self->{ct}->{type} == START_TAG_TOKEN) {
897     !!!cp (52);
898     $self->{last_stag_name} = $self->{ct}->{tag_name};
899     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
900     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
901     if ($self->{ct}->{attributes}) {
902     !!!cp (53);
903     !!!parse-error (type => 'end tag attribute');
904     } else {
905     !!!cp (54);
906     }
907     } else {
908     die "$0: $self->{ct}->{type}: Unknown token type";
909     }
910     $self->{state} = DATA_STATE;
911 wakaba 1.5 $self->{s_kwd} = '';
912 wakaba 1.1 # reconsume
913    
914     !!!emit ($self->{ct}); # start tag or end tag
915    
916     redo A;
917     } else {
918     if ({
919     0x0022 => 1, # "
920     0x0027 => 1, # '
921     0x003D => 1, # =
922     }->{$self->{nc}}) {
923     !!!cp (55);
924 wakaba 1.11 ## XML5: Not a parse error.
925 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
926     } else {
927     !!!cp (56);
928 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
929 wakaba 1.1 }
930     $self->{ca}
931     = {name => chr ($self->{nc}),
932     value => '',
933     line => $self->{line}, column => $self->{column}};
934     $self->{state} = ATTRIBUTE_NAME_STATE;
935     !!!next-input-character;
936     redo A;
937     }
938     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
939 wakaba 1.11 ## XML5: "Tag attribute name state".
940    
941 wakaba 1.1 my $before_leave = sub {
942     if (exists $self->{ct}->{attributes} # start tag or end tag
943     ->{$self->{ca}->{name}}) { # MUST
944     !!!cp (57);
945     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
946     ## Discard $self->{ca} # MUST
947     } else {
948     !!!cp (58);
949     $self->{ct}->{attributes}->{$self->{ca}->{name}}
950     = $self->{ca};
951 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
952 wakaba 1.1 }
953     }; # $before_leave
954    
955     if ($is_space->{$self->{nc}}) {
956     !!!cp (59);
957     $before_leave->();
958     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
959     !!!next-input-character;
960     redo A;
961     } elsif ($self->{nc} == 0x003D) { # =
962     !!!cp (60);
963     $before_leave->();
964     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
965     !!!next-input-character;
966     redo A;
967     } elsif ($self->{nc} == 0x003E) { # >
968 wakaba 1.11 if ($self->{is_xml}) {
969     !!!cp (60.1);
970     ## XML5: Not a parse error.
971     !!!parse-error (type => 'no attr value'); ## TODO: type
972     } else {
973     !!!cp (60.2);
974     }
975    
976 wakaba 1.1 $before_leave->();
977     if ($self->{ct}->{type} == START_TAG_TOKEN) {
978     !!!cp (61);
979     $self->{last_stag_name} = $self->{ct}->{tag_name};
980     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
981     !!!cp (62);
982     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
983     if ($self->{ct}->{attributes}) {
984     !!!parse-error (type => 'end tag attribute');
985     }
986     } else {
987     die "$0: $self->{ct}->{type}: Unknown token type";
988     }
989     $self->{state} = DATA_STATE;
990 wakaba 1.5 $self->{s_kwd} = '';
991 wakaba 1.1 !!!next-input-character;
992    
993     !!!emit ($self->{ct}); # start tag or end tag
994    
995     redo A;
996     } elsif (0x0041 <= $self->{nc} and
997     $self->{nc} <= 0x005A) { # A..Z
998     !!!cp (63);
999 wakaba 1.4 $self->{ca}->{name}
1000     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1001 wakaba 1.1 ## Stay in the state
1002     !!!next-input-character;
1003     redo A;
1004     } elsif ($self->{nc} == 0x002F) { # /
1005 wakaba 1.11 if ($self->{is_xml}) {
1006     !!!cp (64);
1007     ## XML5: Not a parse error.
1008     !!!parse-error (type => 'no attr value'); ## TODO: type
1009     } else {
1010     !!!cp (64.1);
1011     }
1012    
1013 wakaba 1.1 $before_leave->();
1014     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1015     !!!next-input-character;
1016     redo A;
1017     } elsif ($self->{nc} == -1) {
1018     !!!parse-error (type => 'unclosed tag');
1019     $before_leave->();
1020     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1021     !!!cp (66);
1022     $self->{last_stag_name} = $self->{ct}->{tag_name};
1023     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1024     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1025     if ($self->{ct}->{attributes}) {
1026     !!!cp (67);
1027     !!!parse-error (type => 'end tag attribute');
1028     } else {
1029     ## NOTE: This state should never be reached.
1030     !!!cp (68);
1031     }
1032     } else {
1033     die "$0: $self->{ct}->{type}: Unknown token type";
1034     }
1035     $self->{state} = DATA_STATE;
1036 wakaba 1.5 $self->{s_kwd} = '';
1037 wakaba 1.1 # reconsume
1038    
1039     !!!emit ($self->{ct}); # start tag or end tag
1040    
1041     redo A;
1042     } else {
1043     if ($self->{nc} == 0x0022 or # "
1044     $self->{nc} == 0x0027) { # '
1045     !!!cp (69);
1046 wakaba 1.11 ## XML5: Not a parse error.
1047 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1048     } else {
1049     !!!cp (70);
1050     }
1051     $self->{ca}->{name} .= chr ($self->{nc});
1052     ## Stay in the state
1053     !!!next-input-character;
1054     redo A;
1055     }
1056     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1057 wakaba 1.11 ## XML5: "Tag attribute name after state".
1058    
1059 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1060     !!!cp (71);
1061     ## Stay in the state
1062     !!!next-input-character;
1063     redo A;
1064     } elsif ($self->{nc} == 0x003D) { # =
1065     !!!cp (72);
1066     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1067     !!!next-input-character;
1068     redo A;
1069     } elsif ($self->{nc} == 0x003E) { # >
1070 wakaba 1.11 if ($self->{is_xml}) {
1071     !!!cp (72.1);
1072     ## XML5: Not a parse error.
1073     !!!parse-error (type => 'no attr value'); ## TODO: type
1074     } else {
1075     !!!cp (72.2);
1076     }
1077    
1078 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1079     !!!cp (73);
1080     $self->{last_stag_name} = $self->{ct}->{tag_name};
1081     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1082     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1083     if ($self->{ct}->{attributes}) {
1084     !!!cp (74);
1085     !!!parse-error (type => 'end tag attribute');
1086     } else {
1087     ## NOTE: This state should never be reached.
1088     !!!cp (75);
1089     }
1090     } else {
1091     die "$0: $self->{ct}->{type}: Unknown token type";
1092     }
1093     $self->{state} = DATA_STATE;
1094 wakaba 1.5 $self->{s_kwd} = '';
1095 wakaba 1.1 !!!next-input-character;
1096    
1097     !!!emit ($self->{ct}); # start tag or end tag
1098    
1099     redo A;
1100     } elsif (0x0041 <= $self->{nc} and
1101     $self->{nc} <= 0x005A) { # A..Z
1102     !!!cp (76);
1103     $self->{ca}
1104 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1105 wakaba 1.1 value => '',
1106     line => $self->{line}, column => $self->{column}};
1107     $self->{state} = ATTRIBUTE_NAME_STATE;
1108     !!!next-input-character;
1109     redo A;
1110     } elsif ($self->{nc} == 0x002F) { # /
1111 wakaba 1.11 if ($self->{is_xml}) {
1112     !!!cp (77);
1113     ## XML5: Not a parse error.
1114     !!!parse-error (type => 'no attr value'); ## TODO: type
1115     } else {
1116     !!!cp (77.1);
1117     }
1118    
1119 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1120     !!!next-input-character;
1121     redo A;
1122     } elsif ($self->{nc} == -1) {
1123     !!!parse-error (type => 'unclosed tag');
1124     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1125     !!!cp (79);
1126     $self->{last_stag_name} = $self->{ct}->{tag_name};
1127     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1128     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1129     if ($self->{ct}->{attributes}) {
1130     !!!cp (80);
1131     !!!parse-error (type => 'end tag attribute');
1132     } else {
1133     ## NOTE: This state should never be reached.
1134     !!!cp (81);
1135     }
1136     } else {
1137     die "$0: $self->{ct}->{type}: Unknown token type";
1138     }
1139 wakaba 1.5 $self->{s_kwd} = '';
1140 wakaba 1.1 $self->{state} = DATA_STATE;
1141     # reconsume
1142    
1143     !!!emit ($self->{ct}); # start tag or end tag
1144    
1145     redo A;
1146     } else {
1147 wakaba 1.11 if ($self->{is_xml}) {
1148     !!!cp (78.1);
1149     ## XML5: Not a parse error.
1150     !!!parse-error (type => 'no attr value'); ## TODO: type
1151     } else {
1152     !!!cp (78.2);
1153     }
1154    
1155 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1156     $self->{nc} == 0x0027) { # '
1157     !!!cp (78);
1158 wakaba 1.11 ## XML5: Not a parse error.
1159 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1160     } else {
1161     !!!cp (82);
1162     }
1163     $self->{ca}
1164     = {name => chr ($self->{nc}),
1165     value => '',
1166     line => $self->{line}, column => $self->{column}};
1167     $self->{state} = ATTRIBUTE_NAME_STATE;
1168     !!!next-input-character;
1169     redo A;
1170     }
1171     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1172 wakaba 1.11 ## XML5: "Tag attribute value before state".
1173    
1174 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1175     !!!cp (83);
1176     ## Stay in the state
1177     !!!next-input-character;
1178     redo A;
1179     } elsif ($self->{nc} == 0x0022) { # "
1180     !!!cp (84);
1181     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1182     !!!next-input-character;
1183     redo A;
1184     } elsif ($self->{nc} == 0x0026) { # &
1185     !!!cp (85);
1186     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1187     ## reconsume
1188     redo A;
1189     } elsif ($self->{nc} == 0x0027) { # '
1190     !!!cp (86);
1191     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1192     !!!next-input-character;
1193     redo A;
1194     } elsif ($self->{nc} == 0x003E) { # >
1195     !!!parse-error (type => 'empty unquoted attribute value');
1196     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1197     !!!cp (87);
1198     $self->{last_stag_name} = $self->{ct}->{tag_name};
1199     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1200     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1201     if ($self->{ct}->{attributes}) {
1202     !!!cp (88);
1203     !!!parse-error (type => 'end tag attribute');
1204     } else {
1205     ## NOTE: This state should never be reached.
1206     !!!cp (89);
1207     }
1208     } else {
1209     die "$0: $self->{ct}->{type}: Unknown token type";
1210     }
1211     $self->{state} = DATA_STATE;
1212 wakaba 1.5 $self->{s_kwd} = '';
1213 wakaba 1.1 !!!next-input-character;
1214    
1215     !!!emit ($self->{ct}); # start tag or end tag
1216    
1217     redo A;
1218     } elsif ($self->{nc} == -1) {
1219     !!!parse-error (type => 'unclosed tag');
1220     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1221     !!!cp (90);
1222     $self->{last_stag_name} = $self->{ct}->{tag_name};
1223     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1224     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1225     if ($self->{ct}->{attributes}) {
1226     !!!cp (91);
1227     !!!parse-error (type => 'end tag attribute');
1228     } else {
1229     ## NOTE: This state should never be reached.
1230     !!!cp (92);
1231     }
1232     } else {
1233     die "$0: $self->{ct}->{type}: Unknown token type";
1234     }
1235     $self->{state} = DATA_STATE;
1236 wakaba 1.5 $self->{s_kwd} = '';
1237 wakaba 1.1 ## reconsume
1238    
1239     !!!emit ($self->{ct}); # start tag or end tag
1240    
1241     redo A;
1242     } else {
1243     if ($self->{nc} == 0x003D) { # =
1244     !!!cp (93);
1245 wakaba 1.11 ## XML5: Not a parse error.
1246 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1247 wakaba 1.11 } elsif ($self->{is_xml}) {
1248     !!!cp (93.1);
1249     ## XML5: No parse error.
1250     !!!parse-error (type => 'unquoted attr value'); ## TODO
1251 wakaba 1.1 } else {
1252     !!!cp (94);
1253     }
1254     $self->{ca}->{value} .= chr ($self->{nc});
1255     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1256     !!!next-input-character;
1257     redo A;
1258     }
1259     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1260 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1261    
1262 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1263     !!!cp (95);
1264 wakaba 1.11 ## XML5: "Tag attribute name before state".
1265 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1266     !!!next-input-character;
1267     redo A;
1268     } elsif ($self->{nc} == 0x0026) { # &
1269     !!!cp (96);
1270 wakaba 1.11 ## XML5: Not defined yet.
1271    
1272 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1273     ## "entity in attribute value state". In this implementation, the
1274     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1275     ## implementation of the "consume a character reference" algorithm.
1276     $self->{prev_state} = $self->{state};
1277     $self->{entity_add} = 0x0022; # "
1278     $self->{state} = ENTITY_STATE;
1279     !!!next-input-character;
1280     redo A;
1281     } elsif ($self->{nc} == -1) {
1282     !!!parse-error (type => 'unclosed attribute value');
1283     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1284     !!!cp (97);
1285     $self->{last_stag_name} = $self->{ct}->{tag_name};
1286     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1287     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1288     if ($self->{ct}->{attributes}) {
1289     !!!cp (98);
1290     !!!parse-error (type => 'end tag attribute');
1291     } else {
1292     ## NOTE: This state should never be reached.
1293     !!!cp (99);
1294     }
1295     } else {
1296     die "$0: $self->{ct}->{type}: Unknown token type";
1297     }
1298     $self->{state} = DATA_STATE;
1299 wakaba 1.5 $self->{s_kwd} = '';
1300 wakaba 1.1 ## reconsume
1301    
1302     !!!emit ($self->{ct}); # start tag or end tag
1303    
1304     redo A;
1305     } else {
1306 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1307     !!!cp (100);
1308     ## XML5: Not a parse error.
1309     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1310     } else {
1311     !!!cp (100.1);
1312     }
1313 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1314     $self->{read_until}->($self->{ca}->{value},
1315 wakaba 1.11 q["&<],
1316 wakaba 1.1 length $self->{ca}->{value});
1317    
1318     ## Stay in the state
1319     !!!next-input-character;
1320     redo A;
1321     }
1322     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1323 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1324    
1325 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1326     !!!cp (101);
1327 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1328 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1329     !!!next-input-character;
1330     redo A;
1331     } elsif ($self->{nc} == 0x0026) { # &
1332     !!!cp (102);
1333 wakaba 1.11 ## XML5: Not defined yet.
1334    
1335 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1336     ## "entity in attribute value state". In this implementation, the
1337     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1338     ## implementation of the "consume a character reference" algorithm.
1339     $self->{entity_add} = 0x0027; # '
1340     $self->{prev_state} = $self->{state};
1341     $self->{state} = ENTITY_STATE;
1342     !!!next-input-character;
1343     redo A;
1344     } elsif ($self->{nc} == -1) {
1345     !!!parse-error (type => 'unclosed attribute value');
1346     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1347     !!!cp (103);
1348     $self->{last_stag_name} = $self->{ct}->{tag_name};
1349     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1350     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1351     if ($self->{ct}->{attributes}) {
1352     !!!cp (104);
1353     !!!parse-error (type => 'end tag attribute');
1354     } else {
1355     ## NOTE: This state should never be reached.
1356     !!!cp (105);
1357     }
1358     } else {
1359     die "$0: $self->{ct}->{type}: Unknown token type";
1360     }
1361     $self->{state} = DATA_STATE;
1362 wakaba 1.5 $self->{s_kwd} = '';
1363 wakaba 1.1 ## reconsume
1364    
1365     !!!emit ($self->{ct}); # start tag or end tag
1366    
1367     redo A;
1368     } else {
1369 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1370     !!!cp (106);
1371     ## XML5: Not a parse error.
1372     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1373     } else {
1374     !!!cp (106.1);
1375     }
1376 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1377     $self->{read_until}->($self->{ca}->{value},
1378 wakaba 1.11 q['&<],
1379 wakaba 1.1 length $self->{ca}->{value});
1380    
1381     ## Stay in the state
1382     !!!next-input-character;
1383     redo A;
1384     }
1385     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1386 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1387    
1388 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1389     !!!cp (107);
1390 wakaba 1.11 ## XML5: "Tag attribute name before state".
1391 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1392     !!!next-input-character;
1393     redo A;
1394     } elsif ($self->{nc} == 0x0026) { # &
1395     !!!cp (108);
1396 wakaba 1.11
1397     ## XML5: Not defined yet.
1398    
1399 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1400     ## "entity in attribute value state". In this implementation, the
1401     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1402     ## implementation of the "consume a character reference" algorithm.
1403     $self->{entity_add} = -1;
1404     $self->{prev_state} = $self->{state};
1405     $self->{state} = ENTITY_STATE;
1406     !!!next-input-character;
1407     redo A;
1408     } elsif ($self->{nc} == 0x003E) { # >
1409     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1410     !!!cp (109);
1411     $self->{last_stag_name} = $self->{ct}->{tag_name};
1412     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1413     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1414     if ($self->{ct}->{attributes}) {
1415     !!!cp (110);
1416     !!!parse-error (type => 'end tag attribute');
1417     } else {
1418     ## NOTE: This state should never be reached.
1419     !!!cp (111);
1420     }
1421     } else {
1422     die "$0: $self->{ct}->{type}: Unknown token type";
1423     }
1424     $self->{state} = DATA_STATE;
1425 wakaba 1.5 $self->{s_kwd} = '';
1426 wakaba 1.1 !!!next-input-character;
1427    
1428     !!!emit ($self->{ct}); # start tag or end tag
1429    
1430     redo A;
1431     } elsif ($self->{nc} == -1) {
1432     !!!parse-error (type => 'unclosed tag');
1433     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1434     !!!cp (112);
1435     $self->{last_stag_name} = $self->{ct}->{tag_name};
1436     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1437     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1438     if ($self->{ct}->{attributes}) {
1439     !!!cp (113);
1440     !!!parse-error (type => 'end tag attribute');
1441     } else {
1442     ## NOTE: This state should never be reached.
1443     !!!cp (114);
1444     }
1445     } else {
1446     die "$0: $self->{ct}->{type}: Unknown token type";
1447     }
1448     $self->{state} = DATA_STATE;
1449 wakaba 1.5 $self->{s_kwd} = '';
1450 wakaba 1.1 ## reconsume
1451    
1452     !!!emit ($self->{ct}); # start tag or end tag
1453    
1454     redo A;
1455     } else {
1456     if ({
1457     0x0022 => 1, # "
1458     0x0027 => 1, # '
1459     0x003D => 1, # =
1460     }->{$self->{nc}}) {
1461     !!!cp (115);
1462 wakaba 1.11 ## XML5: Not a parse error.
1463 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1464     } else {
1465     !!!cp (116);
1466     }
1467     $self->{ca}->{value} .= chr ($self->{nc});
1468     $self->{read_until}->($self->{ca}->{value},
1469     q["'=& >],
1470     length $self->{ca}->{value});
1471    
1472     ## Stay in the state
1473     !!!next-input-character;
1474     redo A;
1475     }
1476     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1477     if ($is_space->{$self->{nc}}) {
1478     !!!cp (118);
1479     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1480     !!!next-input-character;
1481     redo A;
1482     } elsif ($self->{nc} == 0x003E) { # >
1483     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1484     !!!cp (119);
1485     $self->{last_stag_name} = $self->{ct}->{tag_name};
1486     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1487     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488     if ($self->{ct}->{attributes}) {
1489     !!!cp (120);
1490     !!!parse-error (type => 'end tag attribute');
1491     } else {
1492     ## NOTE: This state should never be reached.
1493     !!!cp (121);
1494     }
1495     } else {
1496     die "$0: $self->{ct}->{type}: Unknown token type";
1497     }
1498     $self->{state} = DATA_STATE;
1499 wakaba 1.5 $self->{s_kwd} = '';
1500 wakaba 1.1 !!!next-input-character;
1501    
1502     !!!emit ($self->{ct}); # start tag or end tag
1503    
1504     redo A;
1505     } elsif ($self->{nc} == 0x002F) { # /
1506     !!!cp (122);
1507     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1508     !!!next-input-character;
1509     redo A;
1510     } elsif ($self->{nc} == -1) {
1511     !!!parse-error (type => 'unclosed tag');
1512     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1513     !!!cp (122.3);
1514     $self->{last_stag_name} = $self->{ct}->{tag_name};
1515     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1516     if ($self->{ct}->{attributes}) {
1517     !!!cp (122.1);
1518     !!!parse-error (type => 'end tag attribute');
1519     } else {
1520     ## NOTE: This state should never be reached.
1521     !!!cp (122.2);
1522     }
1523     } else {
1524     die "$0: $self->{ct}->{type}: Unknown token type";
1525     }
1526     $self->{state} = DATA_STATE;
1527 wakaba 1.5 $self->{s_kwd} = '';
1528 wakaba 1.1 ## Reconsume.
1529     !!!emit ($self->{ct}); # start tag or end tag
1530     redo A;
1531     } else {
1532     !!!cp ('124.1');
1533     !!!parse-error (type => 'no space between attributes');
1534     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1535     ## reconsume
1536     redo A;
1537     }
1538     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1539 wakaba 1.11 ## XML5: "Empty tag state".
1540    
1541 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1542     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1543     !!!cp ('124.2');
1544     !!!parse-error (type => 'nestc', token => $self->{ct});
1545     ## TODO: Different type than slash in start tag
1546     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1547     if ($self->{ct}->{attributes}) {
1548     !!!cp ('124.4');
1549     !!!parse-error (type => 'end tag attribute');
1550     } else {
1551     !!!cp ('124.5');
1552     }
1553     ## TODO: Test |<title></title/>|
1554     } else {
1555     !!!cp ('124.3');
1556     $self->{self_closing} = 1;
1557     }
1558    
1559     $self->{state} = DATA_STATE;
1560 wakaba 1.5 $self->{s_kwd} = '';
1561 wakaba 1.1 !!!next-input-character;
1562    
1563     !!!emit ($self->{ct}); # start tag or end tag
1564    
1565     redo A;
1566     } elsif ($self->{nc} == -1) {
1567     !!!parse-error (type => 'unclosed tag');
1568     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1569     !!!cp (124.7);
1570     $self->{last_stag_name} = $self->{ct}->{tag_name};
1571     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1572     if ($self->{ct}->{attributes}) {
1573     !!!cp (124.5);
1574     !!!parse-error (type => 'end tag attribute');
1575     } else {
1576     ## NOTE: This state should never be reached.
1577     !!!cp (124.6);
1578     }
1579     } else {
1580     die "$0: $self->{ct}->{type}: Unknown token type";
1581     }
1582 wakaba 1.11 ## XML5: "Tag attribute name before state".
1583 wakaba 1.1 $self->{state} = DATA_STATE;
1584 wakaba 1.5 $self->{s_kwd} = '';
1585 wakaba 1.1 ## Reconsume.
1586     !!!emit ($self->{ct}); # start tag or end tag
1587     redo A;
1588     } else {
1589     !!!cp ('124.4');
1590     !!!parse-error (type => 'nestc');
1591     ## TODO: This error type is wrong.
1592     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1593     ## Reconsume.
1594     redo A;
1595     }
1596     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1597 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1598    
1599 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1600     ## consumes characters one by one.
1601    
1602     if ($self->{nc} == 0x003E) { # >
1603 wakaba 1.13 if ($self->{in_subset}) {
1604     !!!cp (123);
1605     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1606     } else {
1607     !!!cp (124);
1608     $self->{state} = DATA_STATE;
1609     $self->{s_kwd} = '';
1610     }
1611 wakaba 1.1 !!!next-input-character;
1612    
1613     !!!emit ($self->{ct}); # comment
1614     redo A;
1615     } elsif ($self->{nc} == -1) {
1616 wakaba 1.13 if ($self->{in_subset}) {
1617     !!!cp (125.1);
1618     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1619     } else {
1620     !!!cp (125);
1621     $self->{state} = DATA_STATE;
1622     $self->{s_kwd} = '';
1623     }
1624 wakaba 1.1 ## reconsume
1625    
1626     !!!emit ($self->{ct}); # comment
1627     redo A;
1628     } else {
1629     !!!cp (126);
1630     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1631     $self->{read_until}->($self->{ct}->{data},
1632     q[>],
1633     length $self->{ct}->{data});
1634    
1635     ## Stay in the state.
1636     !!!next-input-character;
1637     redo A;
1638     }
1639     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1640 wakaba 1.14 ## XML5: "Markup declaration state".
1641 wakaba 1.1
1642     if ($self->{nc} == 0x002D) { # -
1643     !!!cp (133);
1644     $self->{state} = MD_HYPHEN_STATE;
1645     !!!next-input-character;
1646     redo A;
1647     } elsif ($self->{nc} == 0x0044 or # D
1648     $self->{nc} == 0x0064) { # d
1649     ## ASCII case-insensitive.
1650     !!!cp (130);
1651     $self->{state} = MD_DOCTYPE_STATE;
1652 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1653 wakaba 1.1 !!!next-input-character;
1654     redo A;
1655 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1656     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1657     $self->{is_xml}) and
1658 wakaba 1.1 $self->{nc} == 0x005B) { # [
1659     !!!cp (135.4);
1660     $self->{state} = MD_CDATA_STATE;
1661 wakaba 1.12 $self->{kwd} = '[';
1662 wakaba 1.1 !!!next-input-character;
1663     redo A;
1664     } else {
1665     !!!cp (136);
1666     }
1667    
1668     !!!parse-error (type => 'bogus comment',
1669     line => $self->{line_prev},
1670     column => $self->{column_prev} - 1);
1671     ## Reconsume.
1672     $self->{state} = BOGUS_COMMENT_STATE;
1673     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1674     line => $self->{line_prev},
1675     column => $self->{column_prev} - 1,
1676     };
1677     redo A;
1678     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1679     if ($self->{nc} == 0x002D) { # -
1680     !!!cp (127);
1681     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1682     line => $self->{line_prev},
1683     column => $self->{column_prev} - 2,
1684     };
1685 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1686 wakaba 1.1 !!!next-input-character;
1687     redo A;
1688     } else {
1689     !!!cp (128);
1690     !!!parse-error (type => 'bogus comment',
1691     line => $self->{line_prev},
1692     column => $self->{column_prev} - 2);
1693     $self->{state} = BOGUS_COMMENT_STATE;
1694     ## Reconsume.
1695     $self->{ct} = {type => COMMENT_TOKEN,
1696     data => '-',
1697     line => $self->{line_prev},
1698     column => $self->{column_prev} - 2,
1699     };
1700     redo A;
1701     }
1702     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1703     ## ASCII case-insensitive.
1704     if ($self->{nc} == [
1705     undef,
1706     0x004F, # O
1707     0x0043, # C
1708     0x0054, # T
1709     0x0059, # Y
1710     0x0050, # P
1711 wakaba 1.12 ]->[length $self->{kwd}] or
1712 wakaba 1.1 $self->{nc} == [
1713     undef,
1714     0x006F, # o
1715     0x0063, # c
1716     0x0074, # t
1717     0x0079, # y
1718     0x0070, # p
1719 wakaba 1.12 ]->[length $self->{kwd}]) {
1720 wakaba 1.1 !!!cp (131);
1721     ## Stay in the state.
1722 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1723 wakaba 1.1 !!!next-input-character;
1724     redo A;
1725 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1726 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1727     $self->{nc} == 0x0065)) { # e
1728 wakaba 1.12 if ($self->{is_xml} and
1729     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1730 wakaba 1.10 !!!cp (129);
1731     ## XML5: case-sensitive.
1732     !!!parse-error (type => 'lowercase keyword', ## TODO
1733     text => 'DOCTYPE',
1734     line => $self->{line_prev},
1735     column => $self->{column_prev} - 5);
1736     } else {
1737     !!!cp (129.1);
1738     }
1739 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1740     $self->{ct} = {type => DOCTYPE_TOKEN,
1741     quirks => 1,
1742     line => $self->{line_prev},
1743     column => $self->{column_prev} - 7,
1744     };
1745     !!!next-input-character;
1746     redo A;
1747     } else {
1748     !!!cp (132);
1749     !!!parse-error (type => 'bogus comment',
1750     line => $self->{line_prev},
1751 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1752 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1753     ## Reconsume.
1754     $self->{ct} = {type => COMMENT_TOKEN,
1755 wakaba 1.12 data => $self->{kwd},
1756 wakaba 1.1 line => $self->{line_prev},
1757 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1758 wakaba 1.1 };
1759     redo A;
1760     }
1761     } elsif ($self->{state} == MD_CDATA_STATE) {
1762     if ($self->{nc} == {
1763     '[' => 0x0043, # C
1764     '[C' => 0x0044, # D
1765     '[CD' => 0x0041, # A
1766     '[CDA' => 0x0054, # T
1767     '[CDAT' => 0x0041, # A
1768 wakaba 1.12 }->{$self->{kwd}}) {
1769 wakaba 1.1 !!!cp (135.1);
1770     ## Stay in the state.
1771 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1772 wakaba 1.1 !!!next-input-character;
1773     redo A;
1774 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1775 wakaba 1.1 $self->{nc} == 0x005B) { # [
1776 wakaba 1.6 if ($self->{is_xml} and
1777     not $self->{tainted} and
1778     @{$self->{open_elements} or []} == 0) {
1779 wakaba 1.8 !!!cp (135.2);
1780 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1781     line => $self->{line_prev},
1782     column => $self->{column_prev} - 7);
1783     $self->{tainted} = 1;
1784 wakaba 1.8 } else {
1785     !!!cp (135.21);
1786 wakaba 1.6 }
1787    
1788 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1789     data => '',
1790     line => $self->{line_prev},
1791     column => $self->{column_prev} - 7};
1792     $self->{state} = CDATA_SECTION_STATE;
1793     !!!next-input-character;
1794     redo A;
1795     } else {
1796     !!!cp (135.3);
1797     !!!parse-error (type => 'bogus comment',
1798     line => $self->{line_prev},
1799 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1800 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1801     ## Reconsume.
1802     $self->{ct} = {type => COMMENT_TOKEN,
1803 wakaba 1.12 data => $self->{kwd},
1804 wakaba 1.1 line => $self->{line_prev},
1805 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1806 wakaba 1.1 };
1807     redo A;
1808     }
1809     } elsif ($self->{state} == COMMENT_START_STATE) {
1810     if ($self->{nc} == 0x002D) { # -
1811     !!!cp (137);
1812     $self->{state} = COMMENT_START_DASH_STATE;
1813     !!!next-input-character;
1814     redo A;
1815     } elsif ($self->{nc} == 0x003E) { # >
1816     !!!parse-error (type => 'bogus comment');
1817 wakaba 1.13 if ($self->{in_subset}) {
1818     !!!cp (138.1);
1819     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1820     } else {
1821     !!!cp (138);
1822     $self->{state} = DATA_STATE;
1823     $self->{s_kwd} = '';
1824     }
1825 wakaba 1.1 !!!next-input-character;
1826    
1827     !!!emit ($self->{ct}); # comment
1828    
1829     redo A;
1830     } elsif ($self->{nc} == -1) {
1831     !!!parse-error (type => 'unclosed comment');
1832 wakaba 1.13 if ($self->{in_subset}) {
1833     !!!cp (139.1);
1834     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1835     } else {
1836     !!!cp (139);
1837     $self->{state} = DATA_STATE;
1838     $self->{s_kwd} = '';
1839     }
1840 wakaba 1.1 ## reconsume
1841    
1842     !!!emit ($self->{ct}); # comment
1843    
1844     redo A;
1845     } else {
1846     !!!cp (140);
1847     $self->{ct}->{data} # comment
1848     .= chr ($self->{nc});
1849     $self->{state} = COMMENT_STATE;
1850     !!!next-input-character;
1851     redo A;
1852     }
1853     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1854     if ($self->{nc} == 0x002D) { # -
1855     !!!cp (141);
1856     $self->{state} = COMMENT_END_STATE;
1857     !!!next-input-character;
1858     redo A;
1859     } elsif ($self->{nc} == 0x003E) { # >
1860     !!!parse-error (type => 'bogus comment');
1861 wakaba 1.13 if ($self->{in_subset}) {
1862     !!!cp (142.1);
1863     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1864     } else {
1865     !!!cp (142);
1866     $self->{state} = DATA_STATE;
1867     $self->{s_kwd} = '';
1868     }
1869 wakaba 1.1 !!!next-input-character;
1870    
1871     !!!emit ($self->{ct}); # comment
1872    
1873     redo A;
1874     } elsif ($self->{nc} == -1) {
1875     !!!parse-error (type => 'unclosed comment');
1876 wakaba 1.13 if ($self->{in_subset}) {
1877     !!!cp (143.1);
1878     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1879     } else {
1880     !!!cp (143);
1881     $self->{state} = DATA_STATE;
1882     $self->{s_kwd} = '';
1883     }
1884 wakaba 1.1 ## reconsume
1885    
1886     !!!emit ($self->{ct}); # comment
1887    
1888     redo A;
1889     } else {
1890     !!!cp (144);
1891     $self->{ct}->{data} # comment
1892     .= '-' . chr ($self->{nc});
1893     $self->{state} = COMMENT_STATE;
1894     !!!next-input-character;
1895     redo A;
1896     }
1897     } elsif ($self->{state} == COMMENT_STATE) {
1898 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
1899    
1900 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1901     !!!cp (145);
1902     $self->{state} = COMMENT_END_DASH_STATE;
1903     !!!next-input-character;
1904     redo A;
1905     } elsif ($self->{nc} == -1) {
1906     !!!parse-error (type => 'unclosed comment');
1907 wakaba 1.13 if ($self->{in_subset}) {
1908     !!!cp (146.1);
1909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1910     } else {
1911     !!!cp (146);
1912     $self->{state} = DATA_STATE;
1913     $self->{s_kwd} = '';
1914     }
1915 wakaba 1.1 ## reconsume
1916    
1917     !!!emit ($self->{ct}); # comment
1918    
1919     redo A;
1920     } else {
1921     !!!cp (147);
1922     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1923     $self->{read_until}->($self->{ct}->{data},
1924     q[-],
1925     length $self->{ct}->{data});
1926    
1927     ## Stay in the state
1928     !!!next-input-character;
1929     redo A;
1930     }
1931     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1932 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
1933 wakaba 1.10
1934 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1935     !!!cp (148);
1936     $self->{state} = COMMENT_END_STATE;
1937     !!!next-input-character;
1938     redo A;
1939     } elsif ($self->{nc} == -1) {
1940     !!!parse-error (type => 'unclosed comment');
1941 wakaba 1.13 if ($self->{in_subset}) {
1942     !!!cp (149.1);
1943     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1944     } else {
1945     !!!cp (149);
1946     $self->{state} = DATA_STATE;
1947     $self->{s_kwd} = '';
1948     }
1949 wakaba 1.1 ## reconsume
1950    
1951     !!!emit ($self->{ct}); # comment
1952    
1953     redo A;
1954     } else {
1955     !!!cp (150);
1956     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1957     $self->{state} = COMMENT_STATE;
1958     !!!next-input-character;
1959     redo A;
1960     }
1961     } elsif ($self->{state} == COMMENT_END_STATE) {
1962 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
1963    
1964 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1965 wakaba 1.13 if ($self->{in_subset}) {
1966     !!!cp (151.1);
1967     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1968     } else {
1969     !!!cp (151);
1970     $self->{state} = DATA_STATE;
1971     $self->{s_kwd} = '';
1972     }
1973 wakaba 1.1 !!!next-input-character;
1974    
1975     !!!emit ($self->{ct}); # comment
1976    
1977     redo A;
1978     } elsif ($self->{nc} == 0x002D) { # -
1979     !!!cp (152);
1980 wakaba 1.10 ## XML5: Not a parse error.
1981 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1982     line => $self->{line_prev},
1983     column => $self->{column_prev});
1984     $self->{ct}->{data} .= '-'; # comment
1985     ## Stay in the state
1986     !!!next-input-character;
1987     redo A;
1988     } elsif ($self->{nc} == -1) {
1989     !!!parse-error (type => 'unclosed comment');
1990 wakaba 1.13 if ($self->{in_subset}) {
1991     !!!cp (153.1);
1992     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1993     } else {
1994     !!!cp (153);
1995     $self->{state} = DATA_STATE;
1996     $self->{s_kwd} = '';
1997     }
1998 wakaba 1.1 ## reconsume
1999    
2000     !!!emit ($self->{ct}); # comment
2001    
2002     redo A;
2003     } else {
2004     !!!cp (154);
2005 wakaba 1.10 ## XML5: Not a parse error.
2006 wakaba 1.1 !!!parse-error (type => 'dash in comment',
2007     line => $self->{line_prev},
2008     column => $self->{column_prev});
2009     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2010     $self->{state} = COMMENT_STATE;
2011     !!!next-input-character;
2012     redo A;
2013     }
2014     } elsif ($self->{state} == DOCTYPE_STATE) {
2015     if ($is_space->{$self->{nc}}) {
2016     !!!cp (155);
2017     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2018     !!!next-input-character;
2019     redo A;
2020     } else {
2021     !!!cp (156);
2022 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2023 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
2024     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2025     ## reconsume
2026     redo A;
2027     }
2028     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2029 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2030    
2031 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2032     !!!cp (157);
2033     ## Stay in the state
2034     !!!next-input-character;
2035     redo A;
2036     } elsif ($self->{nc} == 0x003E) { # >
2037     !!!cp (158);
2038 wakaba 1.12 ## XML5: No parse error.
2039 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
2040     $self->{state} = DATA_STATE;
2041 wakaba 1.5 $self->{s_kwd} = '';
2042 wakaba 1.1 !!!next-input-character;
2043    
2044     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2045    
2046     redo A;
2047     } elsif ($self->{nc} == -1) {
2048     !!!cp (159);
2049     !!!parse-error (type => 'no DOCTYPE name');
2050     $self->{state} = DATA_STATE;
2051 wakaba 1.5 $self->{s_kwd} = '';
2052 wakaba 1.1 ## reconsume
2053    
2054     !!!emit ($self->{ct}); # DOCTYPE (quirks)
2055    
2056     redo A;
2057 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2058     !!!cp (159.1);
2059     !!!parse-error (type => 'no DOCTYPE name');
2060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2062     $self->{in_subset} = 1;
2063 wakaba 1.12 !!!next-input-character;
2064 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2065 wakaba 1.12 redo A;
2066 wakaba 1.1 } else {
2067     !!!cp (160);
2068     $self->{ct}->{name} = chr $self->{nc};
2069     delete $self->{ct}->{quirks};
2070     $self->{state} = DOCTYPE_NAME_STATE;
2071     !!!next-input-character;
2072     redo A;
2073     }
2074     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2075 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2076    
2077     ## ISSUE: Redundant "First," in the spec.
2078    
2079 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2080     !!!cp (161);
2081     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2082     !!!next-input-character;
2083     redo A;
2084     } elsif ($self->{nc} == 0x003E) { # >
2085     !!!cp (162);
2086     $self->{state} = DATA_STATE;
2087 wakaba 1.5 $self->{s_kwd} = '';
2088 wakaba 1.1 !!!next-input-character;
2089    
2090     !!!emit ($self->{ct}); # DOCTYPE
2091    
2092     redo A;
2093     } elsif ($self->{nc} == -1) {
2094     !!!cp (163);
2095     !!!parse-error (type => 'unclosed DOCTYPE');
2096     $self->{state} = DATA_STATE;
2097 wakaba 1.5 $self->{s_kwd} = '';
2098 wakaba 1.1 ## reconsume
2099    
2100     $self->{ct}->{quirks} = 1;
2101     !!!emit ($self->{ct}); # DOCTYPE
2102    
2103     redo A;
2104 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2105     !!!cp (163.1);
2106     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2107 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2108     $self->{in_subset} = 1;
2109 wakaba 1.12 !!!next-input-character;
2110 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2111 wakaba 1.12 redo A;
2112 wakaba 1.1 } else {
2113     !!!cp (164);
2114     $self->{ct}->{name}
2115     .= chr ($self->{nc}); # DOCTYPE
2116     ## Stay in the state
2117     !!!next-input-character;
2118     redo A;
2119     }
2120     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2121 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2122     ## state", but implemented differently.
2123    
2124 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2125     !!!cp (165);
2126     ## Stay in the state
2127     !!!next-input-character;
2128     redo A;
2129     } elsif ($self->{nc} == 0x003E) { # >
2130     !!!cp (166);
2131     $self->{state} = DATA_STATE;
2132 wakaba 1.5 $self->{s_kwd} = '';
2133 wakaba 1.1 !!!next-input-character;
2134    
2135     !!!emit ($self->{ct}); # DOCTYPE
2136    
2137     redo A;
2138     } elsif ($self->{nc} == -1) {
2139     !!!cp (167);
2140     !!!parse-error (type => 'unclosed DOCTYPE');
2141     $self->{state} = DATA_STATE;
2142 wakaba 1.5 $self->{s_kwd} = '';
2143 wakaba 1.1 ## reconsume
2144    
2145     $self->{ct}->{quirks} = 1;
2146     !!!emit ($self->{ct}); # DOCTYPE
2147    
2148     redo A;
2149     } elsif ($self->{nc} == 0x0050 or # P
2150     $self->{nc} == 0x0070) { # p
2151 wakaba 1.12 !!!cp (167.1);
2152 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2153 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2154 wakaba 1.1 !!!next-input-character;
2155     redo A;
2156     } elsif ($self->{nc} == 0x0053 or # S
2157     $self->{nc} == 0x0073) { # s
2158 wakaba 1.12 !!!cp (167.2);
2159 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2160 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2161     !!!next-input-character;
2162     redo A;
2163     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2164     !!!cp (167.3);
2165     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2166     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2167 wakaba 1.13 $self->{in_subset} = 1;
2168 wakaba 1.1 !!!next-input-character;
2169 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2170 wakaba 1.1 redo A;
2171     } else {
2172     !!!cp (180);
2173     !!!parse-error (type => 'string after DOCTYPE name');
2174     $self->{ct}->{quirks} = 1;
2175    
2176     $self->{state} = BOGUS_DOCTYPE_STATE;
2177     !!!next-input-character;
2178     redo A;
2179     }
2180     } elsif ($self->{state} == PUBLIC_STATE) {
2181     ## ASCII case-insensitive
2182     if ($self->{nc} == [
2183     undef,
2184     0x0055, # U
2185     0x0042, # B
2186     0x004C, # L
2187     0x0049, # I
2188 wakaba 1.12 ]->[length $self->{kwd}] or
2189 wakaba 1.1 $self->{nc} == [
2190     undef,
2191     0x0075, # u
2192     0x0062, # b
2193     0x006C, # l
2194     0x0069, # i
2195 wakaba 1.12 ]->[length $self->{kwd}]) {
2196 wakaba 1.1 !!!cp (175);
2197     ## Stay in the state.
2198 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2199 wakaba 1.1 !!!next-input-character;
2200     redo A;
2201 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2202 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2203     $self->{nc} == 0x0063)) { # c
2204 wakaba 1.12 if ($self->{is_xml} and
2205     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2206     !!!cp (168.1);
2207     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2208     text => 'PUBLIC',
2209     line => $self->{line_prev},
2210     column => $self->{column_prev} - 4);
2211     } else {
2212     !!!cp (168);
2213     }
2214 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2215     !!!next-input-character;
2216     redo A;
2217     } else {
2218     !!!cp (169);
2219     !!!parse-error (type => 'string after DOCTYPE name',
2220     line => $self->{line_prev},
2221 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2222 wakaba 1.1 $self->{ct}->{quirks} = 1;
2223    
2224     $self->{state} = BOGUS_DOCTYPE_STATE;
2225     ## Reconsume.
2226     redo A;
2227     }
2228     } elsif ($self->{state} == SYSTEM_STATE) {
2229     ## ASCII case-insensitive
2230     if ($self->{nc} == [
2231     undef,
2232     0x0059, # Y
2233     0x0053, # S
2234     0x0054, # T
2235     0x0045, # E
2236 wakaba 1.12 ]->[length $self->{kwd}] or
2237 wakaba 1.1 $self->{nc} == [
2238     undef,
2239     0x0079, # y
2240     0x0073, # s
2241     0x0074, # t
2242     0x0065, # e
2243 wakaba 1.12 ]->[length $self->{kwd}]) {
2244 wakaba 1.1 !!!cp (170);
2245     ## Stay in the state.
2246 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2247 wakaba 1.1 !!!next-input-character;
2248     redo A;
2249 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2250 wakaba 1.1 ($self->{nc} == 0x004D or # M
2251     $self->{nc} == 0x006D)) { # m
2252 wakaba 1.12 if ($self->{is_xml} and
2253     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2254     !!!cp (171.1);
2255     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2256     text => 'SYSTEM',
2257     line => $self->{line_prev},
2258     column => $self->{column_prev} - 4);
2259     } else {
2260     !!!cp (171);
2261     }
2262 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2263     !!!next-input-character;
2264     redo A;
2265     } else {
2266     !!!cp (172);
2267     !!!parse-error (type => 'string after DOCTYPE name',
2268     line => $self->{line_prev},
2269 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2270 wakaba 1.1 $self->{ct}->{quirks} = 1;
2271    
2272     $self->{state} = BOGUS_DOCTYPE_STATE;
2273     ## Reconsume.
2274     redo A;
2275     }
2276     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2277     if ($is_space->{$self->{nc}}) {
2278     !!!cp (181);
2279     ## Stay in the state
2280     !!!next-input-character;
2281     redo A;
2282     } elsif ($self->{nc} eq 0x0022) { # "
2283     !!!cp (182);
2284     $self->{ct}->{pubid} = ''; # DOCTYPE
2285     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2286     !!!next-input-character;
2287     redo A;
2288     } elsif ($self->{nc} eq 0x0027) { # '
2289     !!!cp (183);
2290     $self->{ct}->{pubid} = ''; # DOCTYPE
2291     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2292     !!!next-input-character;
2293     redo A;
2294     } elsif ($self->{nc} eq 0x003E) { # >
2295     !!!cp (184);
2296     !!!parse-error (type => 'no PUBLIC literal');
2297    
2298     $self->{state} = DATA_STATE;
2299 wakaba 1.5 $self->{s_kwd} = '';
2300 wakaba 1.1 !!!next-input-character;
2301    
2302     $self->{ct}->{quirks} = 1;
2303     !!!emit ($self->{ct}); # DOCTYPE
2304    
2305     redo A;
2306     } elsif ($self->{nc} == -1) {
2307     !!!cp (185);
2308     !!!parse-error (type => 'unclosed DOCTYPE');
2309    
2310     $self->{state} = DATA_STATE;
2311 wakaba 1.5 $self->{s_kwd} = '';
2312 wakaba 1.1 ## reconsume
2313    
2314     $self->{ct}->{quirks} = 1;
2315     !!!emit ($self->{ct}); # DOCTYPE
2316    
2317     redo A;
2318 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2319     !!!cp (186.1);
2320     !!!parse-error (type => 'no PUBLIC literal');
2321     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2322     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2323 wakaba 1.13 $self->{in_subset} = 1;
2324 wakaba 1.12 !!!next-input-character;
2325 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2326 wakaba 1.12 redo A;
2327 wakaba 1.1 } else {
2328     !!!cp (186);
2329     !!!parse-error (type => 'string after PUBLIC');
2330     $self->{ct}->{quirks} = 1;
2331    
2332     $self->{state} = BOGUS_DOCTYPE_STATE;
2333     !!!next-input-character;
2334     redo A;
2335     }
2336     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2337     if ($self->{nc} == 0x0022) { # "
2338     !!!cp (187);
2339     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2340     !!!next-input-character;
2341     redo A;
2342     } elsif ($self->{nc} == 0x003E) { # >
2343     !!!cp (188);
2344     !!!parse-error (type => 'unclosed PUBLIC literal');
2345    
2346     $self->{state} = DATA_STATE;
2347 wakaba 1.5 $self->{s_kwd} = '';
2348 wakaba 1.1 !!!next-input-character;
2349    
2350     $self->{ct}->{quirks} = 1;
2351     !!!emit ($self->{ct}); # DOCTYPE
2352    
2353     redo A;
2354     } elsif ($self->{nc} == -1) {
2355     !!!cp (189);
2356     !!!parse-error (type => 'unclosed PUBLIC literal');
2357    
2358     $self->{state} = DATA_STATE;
2359 wakaba 1.5 $self->{s_kwd} = '';
2360 wakaba 1.1 ## reconsume
2361    
2362     $self->{ct}->{quirks} = 1;
2363     !!!emit ($self->{ct}); # DOCTYPE
2364    
2365     redo A;
2366     } else {
2367     !!!cp (190);
2368     $self->{ct}->{pubid} # DOCTYPE
2369     .= chr $self->{nc};
2370     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2371     length $self->{ct}->{pubid});
2372    
2373     ## Stay in the state
2374     !!!next-input-character;
2375     redo A;
2376     }
2377     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2378     if ($self->{nc} == 0x0027) { # '
2379     !!!cp (191);
2380     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2381     !!!next-input-character;
2382     redo A;
2383     } elsif ($self->{nc} == 0x003E) { # >
2384     !!!cp (192);
2385     !!!parse-error (type => 'unclosed PUBLIC literal');
2386    
2387     $self->{state} = DATA_STATE;
2388 wakaba 1.5 $self->{s_kwd} = '';
2389 wakaba 1.1 !!!next-input-character;
2390    
2391     $self->{ct}->{quirks} = 1;
2392     !!!emit ($self->{ct}); # DOCTYPE
2393    
2394     redo A;
2395     } elsif ($self->{nc} == -1) {
2396     !!!cp (193);
2397     !!!parse-error (type => 'unclosed PUBLIC literal');
2398    
2399     $self->{state} = DATA_STATE;
2400 wakaba 1.5 $self->{s_kwd} = '';
2401 wakaba 1.1 ## reconsume
2402    
2403     $self->{ct}->{quirks} = 1;
2404     !!!emit ($self->{ct}); # DOCTYPE
2405    
2406     redo A;
2407     } else {
2408     !!!cp (194);
2409     $self->{ct}->{pubid} # DOCTYPE
2410     .= chr $self->{nc};
2411     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2412     length $self->{ct}->{pubid});
2413    
2414     ## Stay in the state
2415     !!!next-input-character;
2416     redo A;
2417     }
2418     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2419     if ($is_space->{$self->{nc}}) {
2420     !!!cp (195);
2421     ## Stay in the state
2422     !!!next-input-character;
2423     redo A;
2424     } elsif ($self->{nc} == 0x0022) { # "
2425     !!!cp (196);
2426     $self->{ct}->{sysid} = ''; # DOCTYPE
2427     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2428     !!!next-input-character;
2429     redo A;
2430     } elsif ($self->{nc} == 0x0027) { # '
2431     !!!cp (197);
2432     $self->{ct}->{sysid} = ''; # DOCTYPE
2433     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2434     !!!next-input-character;
2435     redo A;
2436     } elsif ($self->{nc} == 0x003E) { # >
2437 wakaba 1.12 if ($self->{is_xml}) {
2438     !!!cp (198.1);
2439     !!!parse-error (type => 'no SYSTEM literal');
2440     } else {
2441     !!!cp (198);
2442     }
2443 wakaba 1.1 $self->{state} = DATA_STATE;
2444 wakaba 1.5 $self->{s_kwd} = '';
2445 wakaba 1.1 !!!next-input-character;
2446    
2447     !!!emit ($self->{ct}); # DOCTYPE
2448    
2449     redo A;
2450     } elsif ($self->{nc} == -1) {
2451     !!!cp (199);
2452     !!!parse-error (type => 'unclosed DOCTYPE');
2453    
2454     $self->{state} = DATA_STATE;
2455 wakaba 1.5 $self->{s_kwd} = '';
2456 wakaba 1.1 ## reconsume
2457    
2458     $self->{ct}->{quirks} = 1;
2459     !!!emit ($self->{ct}); # DOCTYPE
2460    
2461     redo A;
2462 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2463     !!!cp (200.1);
2464     !!!parse-error (type => 'no SYSTEM literal');
2465     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2466     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2467 wakaba 1.13 $self->{in_subset} = 1;
2468 wakaba 1.12 !!!next-input-character;
2469 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2470 wakaba 1.12 redo A;
2471 wakaba 1.1 } else {
2472     !!!cp (200);
2473     !!!parse-error (type => 'string after PUBLIC literal');
2474     $self->{ct}->{quirks} = 1;
2475    
2476     $self->{state} = BOGUS_DOCTYPE_STATE;
2477     !!!next-input-character;
2478     redo A;
2479     }
2480     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2481     if ($is_space->{$self->{nc}}) {
2482     !!!cp (201);
2483     ## Stay in the state
2484     !!!next-input-character;
2485     redo A;
2486     } elsif ($self->{nc} == 0x0022) { # "
2487     !!!cp (202);
2488     $self->{ct}->{sysid} = ''; # DOCTYPE
2489     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2490     !!!next-input-character;
2491     redo A;
2492     } elsif ($self->{nc} == 0x0027) { # '
2493     !!!cp (203);
2494     $self->{ct}->{sysid} = ''; # DOCTYPE
2495     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2496     !!!next-input-character;
2497     redo A;
2498     } elsif ($self->{nc} == 0x003E) { # >
2499     !!!cp (204);
2500     !!!parse-error (type => 'no SYSTEM literal');
2501     $self->{state} = DATA_STATE;
2502 wakaba 1.5 $self->{s_kwd} = '';
2503 wakaba 1.1 !!!next-input-character;
2504    
2505     $self->{ct}->{quirks} = 1;
2506     !!!emit ($self->{ct}); # DOCTYPE
2507    
2508     redo A;
2509     } elsif ($self->{nc} == -1) {
2510     !!!cp (205);
2511     !!!parse-error (type => 'unclosed DOCTYPE');
2512    
2513     $self->{state} = DATA_STATE;
2514 wakaba 1.5 $self->{s_kwd} = '';
2515 wakaba 1.1 ## reconsume
2516    
2517     $self->{ct}->{quirks} = 1;
2518     !!!emit ($self->{ct}); # DOCTYPE
2519    
2520     redo A;
2521 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2522     !!!cp (206.1);
2523     !!!parse-error (type => 'no SYSTEM literal');
2524    
2525     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2526     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2527 wakaba 1.13 $self->{in_subset} = 1;
2528 wakaba 1.12 !!!next-input-character;
2529 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2530 wakaba 1.12 redo A;
2531 wakaba 1.1 } else {
2532     !!!cp (206);
2533     !!!parse-error (type => 'string after SYSTEM');
2534     $self->{ct}->{quirks} = 1;
2535    
2536     $self->{state} = BOGUS_DOCTYPE_STATE;
2537     !!!next-input-character;
2538     redo A;
2539     }
2540     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2541     if ($self->{nc} == 0x0022) { # "
2542     !!!cp (207);
2543     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2544     !!!next-input-character;
2545     redo A;
2546 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2547 wakaba 1.1 !!!cp (208);
2548     !!!parse-error (type => 'unclosed SYSTEM literal');
2549    
2550     $self->{state} = DATA_STATE;
2551 wakaba 1.5 $self->{s_kwd} = '';
2552 wakaba 1.1 !!!next-input-character;
2553    
2554     $self->{ct}->{quirks} = 1;
2555     !!!emit ($self->{ct}); # DOCTYPE
2556    
2557     redo A;
2558     } elsif ($self->{nc} == -1) {
2559     !!!cp (209);
2560     !!!parse-error (type => 'unclosed SYSTEM literal');
2561    
2562     $self->{state} = DATA_STATE;
2563 wakaba 1.5 $self->{s_kwd} = '';
2564 wakaba 1.1 ## reconsume
2565    
2566     $self->{ct}->{quirks} = 1;
2567     !!!emit ($self->{ct}); # DOCTYPE
2568    
2569     redo A;
2570     } else {
2571     !!!cp (210);
2572     $self->{ct}->{sysid} # DOCTYPE
2573     .= chr $self->{nc};
2574     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2575     length $self->{ct}->{sysid});
2576    
2577     ## Stay in the state
2578     !!!next-input-character;
2579     redo A;
2580     }
2581     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2582     if ($self->{nc} == 0x0027) { # '
2583     !!!cp (211);
2584     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2585     !!!next-input-character;
2586     redo A;
2587 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2588 wakaba 1.1 !!!cp (212);
2589     !!!parse-error (type => 'unclosed SYSTEM literal');
2590    
2591     $self->{state} = DATA_STATE;
2592 wakaba 1.5 $self->{s_kwd} = '';
2593 wakaba 1.1 !!!next-input-character;
2594    
2595     $self->{ct}->{quirks} = 1;
2596     !!!emit ($self->{ct}); # DOCTYPE
2597    
2598     redo A;
2599     } elsif ($self->{nc} == -1) {
2600     !!!cp (213);
2601     !!!parse-error (type => 'unclosed SYSTEM literal');
2602    
2603     $self->{state} = DATA_STATE;
2604 wakaba 1.5 $self->{s_kwd} = '';
2605 wakaba 1.1 ## reconsume
2606    
2607     $self->{ct}->{quirks} = 1;
2608     !!!emit ($self->{ct}); # DOCTYPE
2609    
2610     redo A;
2611     } else {
2612     !!!cp (214);
2613     $self->{ct}->{sysid} # DOCTYPE
2614     .= chr $self->{nc};
2615     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2616     length $self->{ct}->{sysid});
2617    
2618     ## Stay in the state
2619     !!!next-input-character;
2620     redo A;
2621     }
2622     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2623     if ($is_space->{$self->{nc}}) {
2624     !!!cp (215);
2625     ## Stay in the state
2626     !!!next-input-character;
2627     redo A;
2628     } elsif ($self->{nc} == 0x003E) { # >
2629     !!!cp (216);
2630     $self->{state} = DATA_STATE;
2631 wakaba 1.5 $self->{s_kwd} = '';
2632 wakaba 1.1 !!!next-input-character;
2633    
2634     !!!emit ($self->{ct}); # DOCTYPE
2635    
2636     redo A;
2637     } elsif ($self->{nc} == -1) {
2638     !!!cp (217);
2639     !!!parse-error (type => 'unclosed DOCTYPE');
2640     $self->{state} = DATA_STATE;
2641 wakaba 1.5 $self->{s_kwd} = '';
2642 wakaba 1.1 ## reconsume
2643    
2644     $self->{ct}->{quirks} = 1;
2645     !!!emit ($self->{ct}); # DOCTYPE
2646    
2647     redo A;
2648 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2649     !!!cp (218.1);
2650     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2651     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2652 wakaba 1.13 $self->{in_subset} = 1;
2653 wakaba 1.12 !!!next-input-character;
2654 wakaba 1.13 !!!emit ($self->{ct}); # DOCTYPE
2655 wakaba 1.12 redo A;
2656 wakaba 1.1 } else {
2657     !!!cp (218);
2658     !!!parse-error (type => 'string after SYSTEM literal');
2659     #$self->{ct}->{quirks} = 1;
2660    
2661     $self->{state} = BOGUS_DOCTYPE_STATE;
2662     !!!next-input-character;
2663     redo A;
2664     }
2665     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2666     if ($self->{nc} == 0x003E) { # >
2667     !!!cp (219);
2668     $self->{state} = DATA_STATE;
2669 wakaba 1.5 $self->{s_kwd} = '';
2670 wakaba 1.1 !!!next-input-character;
2671    
2672     !!!emit ($self->{ct}); # DOCTYPE
2673    
2674     redo A;
2675 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2676 wakaba 1.13 !!!cp (220.1);
2677     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2679     $self->{in_subset} = 1;
2680     !!!next-input-character;
2681     !!!emit ($self->{ct}); # DOCTYPE
2682     redo A;
2683 wakaba 1.1 } elsif ($self->{nc} == -1) {
2684     !!!cp (220);
2685     $self->{state} = DATA_STATE;
2686 wakaba 1.5 $self->{s_kwd} = '';
2687 wakaba 1.1 ## reconsume
2688    
2689     !!!emit ($self->{ct}); # DOCTYPE
2690    
2691     redo A;
2692     } else {
2693     !!!cp (221);
2694     my $s = '';
2695 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2696 wakaba 1.1
2697     ## Stay in the state
2698     !!!next-input-character;
2699     redo A;
2700     }
2701     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2702     ## NOTE: "CDATA section state" in the spec is jointly implemented
2703     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2704     ## and |CDATA_SECTION_MSE2_STATE|.
2705 wakaba 1.10
2706     ## XML5: "CDATA state".
2707 wakaba 1.1
2708     if ($self->{nc} == 0x005D) { # ]
2709     !!!cp (221.1);
2710     $self->{state} = CDATA_SECTION_MSE1_STATE;
2711     !!!next-input-character;
2712     redo A;
2713     } elsif ($self->{nc} == -1) {
2714 wakaba 1.6 if ($self->{is_xml}) {
2715 wakaba 1.8 !!!cp (221.11);
2716 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2717 wakaba 1.8 } else {
2718     !!!cp (221.12);
2719 wakaba 1.6 }
2720    
2721 wakaba 1.1 $self->{state} = DATA_STATE;
2722 wakaba 1.5 $self->{s_kwd} = '';
2723 wakaba 1.10 ## Reconsume.
2724 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2725     !!!cp (221.2);
2726     !!!emit ($self->{ct}); # character
2727     } else {
2728     !!!cp (221.3);
2729     ## No token to emit. $self->{ct} is discarded.
2730     }
2731     redo A;
2732     } else {
2733     !!!cp (221.4);
2734     $self->{ct}->{data} .= chr $self->{nc};
2735     $self->{read_until}->($self->{ct}->{data},
2736     q<]>,
2737     length $self->{ct}->{data});
2738    
2739     ## Stay in the state.
2740     !!!next-input-character;
2741     redo A;
2742     }
2743    
2744     ## ISSUE: "text tokens" in spec.
2745     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2746 wakaba 1.10 ## XML5: "CDATA bracket state".
2747    
2748 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2749     !!!cp (221.5);
2750     $self->{state} = CDATA_SECTION_MSE2_STATE;
2751     !!!next-input-character;
2752     redo A;
2753     } else {
2754     !!!cp (221.6);
2755 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
2756 wakaba 1.1 $self->{ct}->{data} .= ']';
2757 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2758 wakaba 1.1 ## Reconsume.
2759     redo A;
2760     }
2761     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2762 wakaba 1.10 ## XML5: "CDATA end state".
2763    
2764 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2765     $self->{state} = DATA_STATE;
2766 wakaba 1.5 $self->{s_kwd} = '';
2767 wakaba 1.1 !!!next-input-character;
2768     if (length $self->{ct}->{data}) { # character
2769     !!!cp (221.7);
2770     !!!emit ($self->{ct}); # character
2771     } else {
2772     !!!cp (221.8);
2773     ## No token to emit. $self->{ct} is discarded.
2774     }
2775     redo A;
2776     } elsif ($self->{nc} == 0x005D) { # ]
2777     !!!cp (221.9); # character
2778     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2779     ## Stay in the state.
2780     !!!next-input-character;
2781     redo A;
2782     } else {
2783     !!!cp (221.11);
2784     $self->{ct}->{data} .= ']]'; # character
2785     $self->{state} = CDATA_SECTION_STATE;
2786 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2787 wakaba 1.1 redo A;
2788     }
2789     } elsif ($self->{state} == ENTITY_STATE) {
2790     if ($is_space->{$self->{nc}} or
2791     {
2792     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2793     $self->{entity_add} => 1,
2794     }->{$self->{nc}}) {
2795     !!!cp (1001);
2796     ## Don't consume
2797     ## No error
2798     ## Return nothing.
2799     #
2800     } elsif ($self->{nc} == 0x0023) { # #
2801     !!!cp (999);
2802     $self->{state} = ENTITY_HASH_STATE;
2803 wakaba 1.12 $self->{kwd} = '#';
2804 wakaba 1.1 !!!next-input-character;
2805     redo A;
2806     } elsif ((0x0041 <= $self->{nc} and
2807     $self->{nc} <= 0x005A) or # A..Z
2808     (0x0061 <= $self->{nc} and
2809     $self->{nc} <= 0x007A)) { # a..z
2810     !!!cp (998);
2811     require Whatpm::_NamedEntityList;
2812     $self->{state} = ENTITY_NAME_STATE;
2813 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2814     $self->{entity__value} = $self->{kwd};
2815 wakaba 1.1 $self->{entity__match} = 0;
2816     !!!next-input-character;
2817     redo A;
2818     } else {
2819     !!!cp (1027);
2820     !!!parse-error (type => 'bare ero');
2821     ## Return nothing.
2822     #
2823     }
2824    
2825     ## NOTE: No character is consumed by the "consume a character
2826     ## reference" algorithm. In other words, there is an "&" character
2827     ## that does not introduce a character reference, which would be
2828     ## appended to the parent element or the attribute value in later
2829     ## process of the tokenizer.
2830    
2831     if ($self->{prev_state} == DATA_STATE) {
2832     !!!cp (997);
2833     $self->{state} = $self->{prev_state};
2834 wakaba 1.5 $self->{s_kwd} = '';
2835 wakaba 1.1 ## Reconsume.
2836     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2837     line => $self->{line_prev},
2838     column => $self->{column_prev},
2839     });
2840     redo A;
2841     } else {
2842     !!!cp (996);
2843     $self->{ca}->{value} .= '&';
2844     $self->{state} = $self->{prev_state};
2845 wakaba 1.5 $self->{s_kwd} = '';
2846 wakaba 1.1 ## Reconsume.
2847     redo A;
2848     }
2849     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2850     if ($self->{nc} == 0x0078 or # x
2851     $self->{nc} == 0x0058) { # X
2852     !!!cp (995);
2853     $self->{state} = HEXREF_X_STATE;
2854 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2855 wakaba 1.1 !!!next-input-character;
2856     redo A;
2857     } elsif (0x0030 <= $self->{nc} and
2858     $self->{nc} <= 0x0039) { # 0..9
2859     !!!cp (994);
2860     $self->{state} = NCR_NUM_STATE;
2861 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
2862 wakaba 1.1 !!!next-input-character;
2863     redo A;
2864     } else {
2865     !!!parse-error (type => 'bare nero',
2866     line => $self->{line_prev},
2867     column => $self->{column_prev} - 1);
2868    
2869     ## NOTE: According to the spec algorithm, nothing is returned,
2870     ## and then "&#" is appended to the parent element or the attribute
2871     ## value in the later processing.
2872    
2873     if ($self->{prev_state} == DATA_STATE) {
2874     !!!cp (1019);
2875     $self->{state} = $self->{prev_state};
2876 wakaba 1.5 $self->{s_kwd} = '';
2877 wakaba 1.1 ## Reconsume.
2878     !!!emit ({type => CHARACTER_TOKEN,
2879     data => '&#',
2880     line => $self->{line_prev},
2881     column => $self->{column_prev} - 1,
2882     });
2883     redo A;
2884     } else {
2885     !!!cp (993);
2886     $self->{ca}->{value} .= '&#';
2887     $self->{state} = $self->{prev_state};
2888 wakaba 1.5 $self->{s_kwd} = '';
2889 wakaba 1.1 ## Reconsume.
2890     redo A;
2891     }
2892     }
2893     } elsif ($self->{state} == NCR_NUM_STATE) {
2894     if (0x0030 <= $self->{nc} and
2895     $self->{nc} <= 0x0039) { # 0..9
2896     !!!cp (1012);
2897 wakaba 1.12 $self->{kwd} *= 10;
2898     $self->{kwd} += $self->{nc} - 0x0030;
2899 wakaba 1.1
2900     ## Stay in the state.
2901     !!!next-input-character;
2902     redo A;
2903     } elsif ($self->{nc} == 0x003B) { # ;
2904     !!!cp (1013);
2905     !!!next-input-character;
2906     #
2907     } else {
2908     !!!cp (1014);
2909     !!!parse-error (type => 'no refc');
2910     ## Reconsume.
2911     #
2912     }
2913    
2914 wakaba 1.12 my $code = $self->{kwd};
2915 wakaba 1.1 my $l = $self->{line_prev};
2916     my $c = $self->{column_prev};
2917     if ($charref_map->{$code}) {
2918     !!!cp (1015);
2919     !!!parse-error (type => 'invalid character reference',
2920     text => (sprintf 'U+%04X', $code),
2921     line => $l, column => $c);
2922     $code = $charref_map->{$code};
2923     } elsif ($code > 0x10FFFF) {
2924     !!!cp (1016);
2925     !!!parse-error (type => 'invalid character reference',
2926     text => (sprintf 'U-%08X', $code),
2927     line => $l, column => $c);
2928     $code = 0xFFFD;
2929     }
2930    
2931     if ($self->{prev_state} == DATA_STATE) {
2932     !!!cp (992);
2933     $self->{state} = $self->{prev_state};
2934 wakaba 1.5 $self->{s_kwd} = '';
2935 wakaba 1.1 ## Reconsume.
2936     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2937 wakaba 1.7 has_reference => 1,
2938 wakaba 1.1 line => $l, column => $c,
2939     });
2940     redo A;
2941     } else {
2942     !!!cp (991);
2943     $self->{ca}->{value} .= chr $code;
2944     $self->{ca}->{has_reference} = 1;
2945     $self->{state} = $self->{prev_state};
2946 wakaba 1.5 $self->{s_kwd} = '';
2947 wakaba 1.1 ## Reconsume.
2948     redo A;
2949     }
2950     } elsif ($self->{state} == HEXREF_X_STATE) {
2951     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2952     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2953     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2954     # 0..9, A..F, a..f
2955     !!!cp (990);
2956     $self->{state} = HEXREF_HEX_STATE;
2957 wakaba 1.12 $self->{kwd} = 0;
2958 wakaba 1.1 ## Reconsume.
2959     redo A;
2960     } else {
2961     !!!parse-error (type => 'bare hcro',
2962     line => $self->{line_prev},
2963     column => $self->{column_prev} - 2);
2964    
2965     ## NOTE: According to the spec algorithm, nothing is returned,
2966     ## and then "&#" followed by "X" or "x" is appended to the parent
2967     ## element or the attribute value in the later processing.
2968    
2969     if ($self->{prev_state} == DATA_STATE) {
2970     !!!cp (1005);
2971     $self->{state} = $self->{prev_state};
2972 wakaba 1.5 $self->{s_kwd} = '';
2973 wakaba 1.1 ## Reconsume.
2974     !!!emit ({type => CHARACTER_TOKEN,
2975 wakaba 1.12 data => '&' . $self->{kwd},
2976 wakaba 1.1 line => $self->{line_prev},
2977 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
2978 wakaba 1.1 });
2979     redo A;
2980     } else {
2981     !!!cp (989);
2982 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
2983 wakaba 1.1 $self->{state} = $self->{prev_state};
2984 wakaba 1.5 $self->{s_kwd} = '';
2985 wakaba 1.1 ## Reconsume.
2986     redo A;
2987     }
2988     }
2989     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2990     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2991     # 0..9
2992     !!!cp (1002);
2993 wakaba 1.12 $self->{kwd} *= 0x10;
2994     $self->{kwd} += $self->{nc} - 0x0030;
2995 wakaba 1.1 ## Stay in the state.
2996     !!!next-input-character;
2997     redo A;
2998     } elsif (0x0061 <= $self->{nc} and
2999     $self->{nc} <= 0x0066) { # a..f
3000     !!!cp (1003);
3001 wakaba 1.12 $self->{kwd} *= 0x10;
3002     $self->{kwd} += $self->{nc} - 0x0060 + 9;
3003 wakaba 1.1 ## Stay in the state.
3004     !!!next-input-character;
3005     redo A;
3006     } elsif (0x0041 <= $self->{nc} and
3007     $self->{nc} <= 0x0046) { # A..F
3008     !!!cp (1004);
3009 wakaba 1.12 $self->{kwd} *= 0x10;
3010     $self->{kwd} += $self->{nc} - 0x0040 + 9;
3011 wakaba 1.1 ## Stay in the state.
3012     !!!next-input-character;
3013     redo A;
3014     } elsif ($self->{nc} == 0x003B) { # ;
3015     !!!cp (1006);
3016     !!!next-input-character;
3017     #
3018     } else {
3019     !!!cp (1007);
3020     !!!parse-error (type => 'no refc',
3021     line => $self->{line},
3022     column => $self->{column});
3023     ## Reconsume.
3024     #
3025     }
3026    
3027 wakaba 1.12 my $code = $self->{kwd};
3028 wakaba 1.1 my $l = $self->{line_prev};
3029     my $c = $self->{column_prev};
3030     if ($charref_map->{$code}) {
3031     !!!cp (1008);
3032     !!!parse-error (type => 'invalid character reference',
3033     text => (sprintf 'U+%04X', $code),
3034     line => $l, column => $c);
3035     $code = $charref_map->{$code};
3036     } elsif ($code > 0x10FFFF) {
3037     !!!cp (1009);
3038     !!!parse-error (type => 'invalid character reference',
3039     text => (sprintf 'U-%08X', $code),
3040     line => $l, column => $c);
3041     $code = 0xFFFD;
3042     }
3043    
3044     if ($self->{prev_state} == DATA_STATE) {
3045     !!!cp (988);
3046     $self->{state} = $self->{prev_state};
3047 wakaba 1.5 $self->{s_kwd} = '';
3048 wakaba 1.1 ## Reconsume.
3049     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3050 wakaba 1.7 has_reference => 1,
3051 wakaba 1.1 line => $l, column => $c,
3052     });
3053     redo A;
3054     } else {
3055     !!!cp (987);
3056     $self->{ca}->{value} .= chr $code;
3057     $self->{ca}->{has_reference} = 1;
3058     $self->{state} = $self->{prev_state};
3059 wakaba 1.5 $self->{s_kwd} = '';
3060 wakaba 1.1 ## Reconsume.
3061     redo A;
3062     }
3063     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3064 wakaba 1.12 if (length $self->{kwd} < 30 and
3065 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
3066     ((0x0041 <= $self->{nc} and # a
3067     $self->{nc} <= 0x005A) or # x
3068     (0x0061 <= $self->{nc} and # a
3069     $self->{nc} <= 0x007A) or # z
3070     (0x0030 <= $self->{nc} and # 0
3071     $self->{nc} <= 0x0039) or # 9
3072     $self->{nc} == 0x003B)) { # ;
3073     our $EntityChar;
3074 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3075     if (defined $EntityChar->{$self->{kwd}}) {
3076 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
3077     !!!cp (1020);
3078 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3079 wakaba 1.1 $self->{entity__match} = 1;
3080     !!!next-input-character;
3081     #
3082     } else {
3083     !!!cp (1021);
3084 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3085 wakaba 1.1 $self->{entity__match} = -1;
3086     ## Stay in the state.
3087     !!!next-input-character;
3088     redo A;
3089     }
3090     } else {
3091     !!!cp (1022);
3092     $self->{entity__value} .= chr $self->{nc};
3093     $self->{entity__match} *= 2;
3094     ## Stay in the state.
3095     !!!next-input-character;
3096     redo A;
3097     }
3098     }
3099    
3100     my $data;
3101     my $has_ref;
3102     if ($self->{entity__match} > 0) {
3103     !!!cp (1023);
3104     $data = $self->{entity__value};
3105     $has_ref = 1;
3106     #
3107     } elsif ($self->{entity__match} < 0) {
3108     !!!parse-error (type => 'no refc');
3109     if ($self->{prev_state} != DATA_STATE and # in attribute
3110     $self->{entity__match} < -1) {
3111     !!!cp (1024);
3112 wakaba 1.12 $data = '&' . $self->{kwd};
3113 wakaba 1.1 #
3114     } else {
3115     !!!cp (1025);
3116     $data = $self->{entity__value};
3117     $has_ref = 1;
3118     #
3119     }
3120     } else {
3121     !!!cp (1026);
3122     !!!parse-error (type => 'bare ero',
3123     line => $self->{line_prev},
3124 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3125     $data = '&' . $self->{kwd};
3126 wakaba 1.1 #
3127     }
3128    
3129     ## NOTE: In these cases, when a character reference is found,
3130     ## it is consumed and a character token is returned, or, otherwise,
3131     ## nothing is consumed and returned, according to the spec algorithm.
3132     ## In this implementation, anything that has been examined by the
3133     ## tokenizer is appended to the parent element or the attribute value
3134     ## as string, either literal string when no character reference or
3135     ## entity-replaced string otherwise, in this stage, since any characters
3136     ## that would not be consumed are appended in the data state or in an
3137     ## appropriate attribute value state anyway.
3138    
3139     if ($self->{prev_state} == DATA_STATE) {
3140     !!!cp (986);
3141     $self->{state} = $self->{prev_state};
3142 wakaba 1.5 $self->{s_kwd} = '';
3143 wakaba 1.1 ## Reconsume.
3144     !!!emit ({type => CHARACTER_TOKEN,
3145     data => $data,
3146 wakaba 1.7 has_reference => $has_ref,
3147 wakaba 1.1 line => $self->{line_prev},
3148 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3149 wakaba 1.1 });
3150     redo A;
3151     } else {
3152     !!!cp (985);
3153     $self->{ca}->{value} .= $data;
3154     $self->{ca}->{has_reference} = 1 if $has_ref;
3155     $self->{state} = $self->{prev_state};
3156 wakaba 1.5 $self->{s_kwd} = '';
3157 wakaba 1.1 ## Reconsume.
3158     redo A;
3159     }
3160 wakaba 1.8
3161     ## XML-only states
3162    
3163     } elsif ($self->{state} == PI_STATE) {
3164 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
3165    
3166 wakaba 1.8 if ($is_space->{$self->{nc}} or
3167 wakaba 1.14 $self->{nc} == 0x003F or # ?
3168 wakaba 1.8 $self->{nc} == -1) {
3169 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3170     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3171     ## "DOCTYPE pi state": Parse error, switch to the "data
3172     ## state".
3173 wakaba 1.8 !!!parse-error (type => 'bare pio', ## TODO: type
3174     line => $self->{line_prev},
3175     column => $self->{column_prev}
3176     - 1 * ($self->{nc} != -1));
3177     $self->{state} = BOGUS_COMMENT_STATE;
3178     ## Reconsume.
3179     $self->{ct} = {type => COMMENT_TOKEN,
3180     data => '?',
3181     line => $self->{line_prev},
3182     column => $self->{column_prev}
3183     - 1 * ($self->{nc} != -1),
3184     };
3185     redo A;
3186     } else {
3187 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
3188 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
3189     target => chr $self->{nc},
3190     data => '',
3191     line => $self->{line_prev},
3192     column => $self->{column_prev} - 1,
3193     };
3194     $self->{state} = PI_TARGET_STATE;
3195     !!!next-input-character;
3196     redo A;
3197     }
3198     } elsif ($self->{state} == PI_TARGET_STATE) {
3199     if ($is_space->{$self->{nc}}) {
3200     $self->{state} = PI_TARGET_AFTER_STATE;
3201     !!!next-input-character;
3202     redo A;
3203     } elsif ($self->{nc} == -1) {
3204     !!!parse-error (type => 'no pic'); ## TODO: type
3205 wakaba 1.13 if ($self->{in_subset}) {
3206     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3207     } else {
3208     $self->{state} = DATA_STATE;
3209     $self->{s_kwd} = '';
3210     }
3211 wakaba 1.8 ## Reconsume.
3212     !!!emit ($self->{ct}); # pi
3213     redo A;
3214     } elsif ($self->{nc} == 0x003F) { # ?
3215     $self->{state} = PI_AFTER_STATE;
3216     !!!next-input-character;
3217     redo A;
3218     } else {
3219     ## XML5: typo ("tag name" -> "target")
3220     $self->{ct}->{target} .= chr $self->{nc}; # pi
3221     !!!next-input-character;
3222     redo A;
3223     }
3224     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3225     if ($is_space->{$self->{nc}}) {
3226     ## Stay in the state.
3227     !!!next-input-character;
3228     redo A;
3229     } else {
3230     $self->{state} = PI_DATA_STATE;
3231     ## Reprocess.
3232     redo A;
3233     }
3234     } elsif ($self->{state} == PI_DATA_STATE) {
3235     if ($self->{nc} == 0x003F) { # ?
3236     $self->{state} = PI_DATA_AFTER_STATE;
3237     !!!next-input-character;
3238     redo A;
3239     } elsif ($self->{nc} == -1) {
3240     !!!parse-error (type => 'no pic'); ## TODO: type
3241 wakaba 1.13 if ($self->{in_subset}) {
3242 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3243 wakaba 1.13 } else {
3244     $self->{state} = DATA_STATE;
3245     $self->{s_kwd} = '';
3246     }
3247 wakaba 1.8 ## Reprocess.
3248     !!!emit ($self->{ct}); # pi
3249     redo A;
3250     } else {
3251     $self->{ct}->{data} .= chr $self->{nc}; # pi
3252     $self->{read_until}->($self->{ct}->{data}, q[?],
3253     length $self->{ct}->{data});
3254     ## Stay in the state.
3255     !!!next-input-character;
3256     ## Reprocess.
3257     redo A;
3258     }
3259     } elsif ($self->{state} == PI_AFTER_STATE) {
3260 wakaba 1.14 ## XML5: Part of "Pi after state".
3261    
3262 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3263 wakaba 1.13 if ($self->{in_subset}) {
3264     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265     } else {
3266     $self->{state} = DATA_STATE;
3267     $self->{s_kwd} = '';
3268     }
3269 wakaba 1.8 !!!next-input-character;
3270     !!!emit ($self->{ct}); # pi
3271     redo A;
3272     } elsif ($self->{nc} == 0x003F) { # ?
3273     !!!parse-error (type => 'no s after target', ## TODO: type
3274     line => $self->{line_prev},
3275     column => $self->{column_prev}); ## XML5: no error
3276     $self->{ct}->{data} .= '?';
3277     $self->{state} = PI_DATA_AFTER_STATE;
3278     !!!next-input-character;
3279     redo A;
3280     } else {
3281     !!!parse-error (type => 'no s after target', ## TODO: type
3282     line => $self->{line_prev},
3283     column => $self->{column_prev}
3284     + 1 * ($self->{nc} == -1)); ## XML5: no error
3285     $self->{ct}->{data} .= '?'; ## XML5: not appended
3286     $self->{state} = PI_DATA_STATE;
3287     ## Reprocess.
3288     redo A;
3289     }
3290     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3291 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3292    
3293 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
3294 wakaba 1.13 if ($self->{in_subset}) {
3295     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3296     } else {
3297     $self->{state} = DATA_STATE;
3298     $self->{s_kwd} = '';
3299     }
3300 wakaba 1.8 !!!next-input-character;
3301     !!!emit ($self->{ct}); # pi
3302     redo A;
3303     } elsif ($self->{nc} == 0x003F) { # ?
3304     $self->{ct}->{data} .= '?';
3305     ## Stay in the state.
3306     !!!next-input-character;
3307     redo A;
3308     } else {
3309     $self->{ct}->{data} .= '?'; ## XML5: not appended
3310     $self->{state} = PI_DATA_STATE;
3311     ## Reprocess.
3312     redo A;
3313     }
3314 wakaba 1.12
3315     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3316     if ($self->{nc} == 0x003C) { # <
3317 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
3318 wakaba 1.12 !!!next-input-character;
3319     redo A;
3320     } elsif ($self->{nc} == 0x0025) { # %
3321     ## XML5: Not defined yet.
3322    
3323     ## TODO:
3324     !!!next-input-character;
3325     redo A;
3326     } elsif ($self->{nc} == 0x005D) { # ]
3327 wakaba 1.13 delete $self->{in_subset};
3328 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3329     !!!next-input-character;
3330     redo A;
3331     } elsif ($is_space->{$self->{nc}}) {
3332     ## Stay in the state.
3333     !!!next-input-character;
3334     redo A;
3335     } elsif ($self->{nc} == -1) {
3336     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3337 wakaba 1.13 delete $self->{in_subset};
3338 wakaba 1.12 $self->{state} = DATA_STATE;
3339     $self->{s_kwd} = '';
3340     ## Reconsume.
3341 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3342 wakaba 1.12 redo A;
3343     } else {
3344     unless ($self->{internal_subset_tainted}) {
3345     ## XML5: No parse error.
3346     !!!parse-error (type => 'string in internal subset');
3347     $self->{internal_subset_tainted} = 1;
3348     }
3349     ## Stay in the state.
3350     !!!next-input-character;
3351     redo A;
3352     }
3353     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3354     if ($self->{nc} == 0x003E) { # >
3355     $self->{state} = DATA_STATE;
3356     $self->{s_kwd} = '';
3357     !!!next-input-character;
3358 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3359 wakaba 1.12 redo A;
3360     } elsif ($self->{nc} == -1) {
3361     !!!parse-error (type => 'unclosed DOCTYPE');
3362     $self->{state} = DATA_STATE;
3363     $self->{s_kwd} = '';
3364     ## Reconsume.
3365 wakaba 1.13 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3366 wakaba 1.12 redo A;
3367     } else {
3368     ## XML5: No parse error and stay in the state.
3369     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3370    
3371 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3372     !!!next-input-character;
3373     redo A;
3374     }
3375     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3376     if ($self->{nc} == 0x003E) { # >
3377     $self->{state} = DATA_STATE;
3378     $self->{s_kwd} = '';
3379     !!!next-input-character;
3380     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3381     redo A;
3382     } elsif ($self->{nc} == -1) {
3383     $self->{state} = DATA_STATE;
3384     $self->{s_kwd} = '';
3385     ## Reconsume.
3386     !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3387     redo A;
3388     } else {
3389     ## Stay in the state.
3390     !!!next-input-character;
3391     redo A;
3392     }
3393     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3394     if ($self->{nc} == 0x0021) { # !
3395 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3396 wakaba 1.13 !!!next-input-character;
3397     redo A;
3398     } elsif ($self->{nc} == 0x003F) { # ?
3399     $self->{state} = PI_STATE;
3400     !!!next-input-character;
3401     redo A;
3402     } elsif ($self->{nc} == -1) {
3403     !!!parse-error (type => 'bare stago');
3404     $self->{state} = DATA_STATE;
3405     $self->{s_kwd} = '';
3406     ## Reconsume.
3407     redo A;
3408     } else {
3409     !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3410     line => $self->{line_prev},
3411     column => $self->{column_prev});
3412     $self->{state} = BOGUS_COMMENT_STATE;
3413     $self->{ct} = {type => COMMENT_TOKEN,
3414     data => '',
3415     }; ## NOTE: Will be discarded.
3416 wakaba 1.12 !!!next-input-character;
3417     redo A;
3418     }
3419 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3420     ## XML5: "DOCTYPE markup declaration state".
3421    
3422     if ($self->{nc} == 0x002D) { # -
3423     $self->{state} = MD_HYPHEN_STATE;
3424     !!!next-input-character;
3425     redo A;
3426     } elsif ($self->{nc} == 0x0045) { # E
3427     $self->{state} = MD_E_STATE;
3428     $self->{kwd} = chr $self->{nc};
3429     !!!next-input-character;
3430     redo A;
3431     } elsif ($self->{nc} == 0x0041) { # A
3432     $self->{state} = MD_ATTLIST_STATE;
3433     $self->{kwd} = chr $self->{nc};
3434     !!!next-input-character;
3435     redo A;
3436     } elsif ($self->{nc} == 0x004E) { # N
3437     $self->{state} = MD_NOTATION_STATE;
3438     $self->{kwd} = chr $self->{nc};
3439     !!!next-input-character;
3440     redo A;
3441     } else {
3442     #
3443     }
3444    
3445     ## XML5: No parse error.
3446     !!!parse-error (type => 'bogus comment',
3447     line => $self->{line_prev},
3448     column => $self->{column_prev} - 1);
3449     ## Reconsume.
3450     $self->{state} = BOGUS_COMMENT_STATE;
3451     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3452     redo A;
3453     } elsif ($self->{state} == MD_E_STATE) {
3454     if ($self->{nc} == 0x004E) { # N
3455     $self->{state} = MD_ENTITY_STATE;
3456     $self->{kwd} .= chr $self->{nc};
3457     !!!next-input-character;
3458     redo A;
3459     } elsif ($self->{nc} == 0x004C) { # L
3460     ## XML5: <!ELEMENT> not supported.
3461     $self->{state} = MD_ELEMENT_STATE;
3462     $self->{kwd} .= chr $self->{nc};
3463     !!!next-input-character;
3464     redo A;
3465     } else {
3466     ## XML5: No parse error.
3467     !!!parse-error (type => 'bogus comment',
3468     line => $self->{line_prev},
3469     column => $self->{column_prev} - 2
3470     + 1 * ($self->{nc} == -1));
3471     ## Reconsume.
3472     $self->{state} = BOGUS_COMMENT_STATE;
3473     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3474     redo A;
3475     }
3476     } elsif ($self->{state} == MD_ENTITY_STATE) {
3477     if ($self->{nc} == {
3478     'EN' => 0x0054, # T
3479     'ENT' => 0x0049, # I
3480     'ENTI' => 0x0054, # T
3481     }->{$self->{kwd}}) {
3482     ## Stay in the state.
3483     $self->{kwd} .= chr $self->{nc};
3484     !!!next-input-character;
3485     redo A;
3486     } elsif ($self->{kwd} eq 'ENTIT' and
3487     $self->{nc} == 0x0059) { # Y
3488     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3489     line => $self->{line_prev},
3490     column => $self->{column_prev} - 6};
3491     $self->{state} = DOCTYPE_MD_STATE;
3492     !!!next-input-character;
3493     redo A;
3494     } else {
3495     !!!parse-error (type => 'bogus comment',
3496     line => $self->{line_prev},
3497     column => $self->{column_prev} - 1
3498     - (length $self->{kwd})
3499     + 1 * ($self->{nc} == -1));
3500     $self->{state} = BOGUS_COMMENT_STATE;
3501     ## Reconsume.
3502     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3503     redo A;
3504     }
3505     } elsif ($self->{state} == MD_ELEMENT_STATE) {
3506     if ($self->{nc} == {
3507     'EL' => 0x0045, # E
3508     'ELE' => 0x004D, # M
3509     'ELEM' => 0x0045, # E
3510     'ELEME' => 0x004E, # N
3511     }->{$self->{kwd}}) {
3512     ## Stay in the state.
3513     $self->{kwd} .= chr $self->{nc};
3514     !!!next-input-character;
3515     redo A;
3516     } elsif ($self->{kwd} eq 'ELEMEN' and
3517     $self->{nc} == 0x0054) { # T
3518     $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3519     line => $self->{line_prev},
3520     column => $self->{column_prev} - 6};
3521     $self->{state} = DOCTYPE_MD_STATE;
3522     !!!next-input-character;
3523     redo A;
3524     } else {
3525     !!!parse-error (type => 'bogus comment',
3526     line => $self->{line_prev},
3527     column => $self->{column_prev} - 1
3528     - (length $self->{kwd})
3529     + 1 * ($self->{nc} == -1));
3530     $self->{state} = BOGUS_COMMENT_STATE;
3531     ## Reconsume.
3532     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3533     redo A;
3534     }
3535     } elsif ($self->{state} == MD_ATTLIST_STATE) {
3536     if ($self->{nc} == {
3537     'A' => 0x0054, # T
3538     'AT' => 0x0054, # T
3539     'ATT' => 0x004C, # L
3540     'ATTL' => 0x0049, # I
3541     'ATTLI' => 0x0053, # S
3542     }->{$self->{kwd}}) {
3543     ## Stay in the state.
3544     $self->{kwd} .= chr $self->{nc};
3545     !!!next-input-character;
3546     redo A;
3547     } elsif ($self->{kwd} eq 'ATTLIS' and
3548     $self->{nc} == 0x0054) { # T
3549     $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3550     line => $self->{line_prev},
3551     column => $self->{column_prev} - 6};
3552     $self->{state} = DOCTYPE_MD_STATE;
3553     !!!next-input-character;
3554     redo A;
3555     } else {
3556     !!!parse-error (type => 'bogus comment',
3557     line => $self->{line_prev},
3558     column => $self->{column_prev} - 1
3559     - (length $self->{kwd})
3560     + 1 * ($self->{nc} == -1));
3561     $self->{state} = BOGUS_COMMENT_STATE;
3562     ## Reconsume.
3563     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3564     redo A;
3565     }
3566     } elsif ($self->{state} == MD_NOTATION_STATE) {
3567     if ($self->{nc} == {
3568     'N' => 0x004F, # O
3569     'NO' => 0x0054, # T
3570     'NOT' => 0x0041, # A
3571     'NOTA' => 0x0054, # T
3572     'NOTAT' => 0x0049, # I
3573     'NOTATI' => 0x004F, # O
3574     }->{$self->{kwd}}) {
3575     ## Stay in the state.
3576     $self->{kwd} .= chr $self->{nc};
3577     !!!next-input-character;
3578     redo A;
3579     } elsif ($self->{kwd} eq 'NOTATIO' and
3580     $self->{nc} == 0x004E) { # N
3581     $self->{ct} = {type => NOTATION_TOKEN, name => '',
3582     line => $self->{line_prev},
3583     column => $self->{column_prev} - 6};
3584     $self->{state} = DOCTYPE_MD_STATE;
3585     !!!next-input-character;
3586     redo A;
3587     } else {
3588     !!!parse-error (type => 'bogus comment',
3589     line => $self->{line_prev},
3590     column => $self->{column_prev} - 1
3591     - (length $self->{kwd})
3592     + 1 * ($self->{nc} == -1));
3593     $self->{state} = BOGUS_COMMENT_STATE;
3594     ## Reconsume.
3595     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3596     redo A;
3597     }
3598     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3599     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3600     ## "DOCTYPE NOTATION state".
3601    
3602     if ($is_space->{$self->{nc}}) {
3603     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3604     $self->{state} = BEFORE_MD_NAME_STATE;
3605     !!!next-input-character;
3606     redo A;
3607     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3608     $self->{nc} == 0x0025) { # %
3609     ## XML5: Switch to the "DOCTYPE bogus comment state".
3610     !!!parse-error (type => 'no space before md name'); ## TODO: type
3611     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3612     !!!next-input-character;
3613     redo A;
3614     } elsif ($self->{nc} == -1) {
3615     !!!parse-error (type => 'unclosed md'); ## TODO: type
3616     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3617     ## Reconsume.
3618     redo A;
3619     } elsif ($self->{nc} == 0x003E) { # >
3620     ## XML5: Switch to the "DOCTYPE bogus comment state".
3621     !!!parse-error (type => 'no md name'); ## TODO: type
3622     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3623     !!!next-input-character;
3624     redo A;
3625     } else {
3626     ## XML5: Switch to the "DOCTYPE bogus comment state".
3627     !!!parse-error (type => 'no space before md name'); ## TODO: type
3628     $self->{state} = BEFORE_MD_NAME_STATE;
3629     redo A;
3630     }
3631     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3632     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3633     ## before state", "DOCTYPE ATTLIST name before state".
3634    
3635     if ($is_space->{$self->{nc}}) {
3636     ## Stay in the state.
3637     !!!next-input-character;
3638     redo A;
3639     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3640     $self->{nc} == 0x0025) { # %
3641     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3642     !!!next-input-character;
3643     redo A;
3644     } elsif ($self->{nc} == 0x003E) { # >
3645     ## XML5: Same as "Anything else".
3646     !!!parse-error (type => 'no md name'); ## TODO: type
3647     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3648     !!!next-input-character;
3649     redo A;
3650     } elsif ($self->{nc} == -1) {
3651     !!!parse-error (type => 'unclosed md'); ## TODO: type
3652     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3653     ## Reconsume.
3654     redo A;
3655     } else {
3656     ## XML5: [ATTLIST] Not defined yet.
3657     $self->{ct}->{name} .= chr $self->{nc};
3658     $self->{state} = MD_NAME_STATE;
3659     !!!next-input-character;
3660     redo A;
3661     }
3662     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3663     if ($is_space->{$self->{nc}}) {
3664     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3665     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3666     $self->{state} = BEFORE_MD_NAME_STATE;
3667     !!!next-input-character;
3668     redo A;
3669     } elsif ($self->{nc} == 0x003E) { # >
3670     ## XML5: Same as "Anything else".
3671     !!!parse-error (type => 'no md name'); ## TODO: type
3672     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3673     !!!next-input-character;
3674     redo A;
3675     } elsif ($self->{nc} == -1) {
3676     !!!parse-error (type => 'unclosed md');
3677     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3678     ## Reconsume.
3679     redo A;
3680     } else {
3681     ## XML5: No parse error.
3682     !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3683     $self->{state} = BOGUS_COMMENT_STATE;
3684     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3685     ## Reconsume.
3686     redo A;
3687     }
3688     } elsif ($self->{state} == MD_NAME_STATE) {
3689     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3690    
3691     if ($is_space->{$self->{nc}}) {
3692     ## TODO:
3693     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3694     !!!next-input-character;
3695     redo A;
3696     } elsif ($self->{nc} == 0x003E) { # >
3697     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3698     #
3699     } else {
3700     !!!parse-error (type => 'no md body'); ## TODO: type
3701     }
3702     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3703     !!!next-input-character;
3704     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3705     redo A;
3706     } elsif ($self->{nc} == -1) {
3707     ## XML5: [ATTLIST] No parse error.
3708     !!!parse-error (type => 'unclosed md');
3709     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3710     ## Reconsume.
3711     !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3712     redo A;
3713     } else {
3714     ## XML5: [ATTLIST] Not defined yet.
3715     $self->{ct}->{name} .= chr $self->{nc};
3716     ## Stay in the state.
3717     !!!next-input-character;
3718     redo A;
3719     }
3720     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3721     if ($is_space->{$self->{nc}}) {
3722     ## Stay in the state.
3723     !!!next-input-character;
3724     redo A;
3725     } elsif ($self->{nc} == 0x003E) { # >
3726     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3727     !!!next-input-character;
3728     !!!emit ($self->{ct}); # ATTLIST
3729     redo A;
3730     } elsif ($self->{nc} == -1) {
3731     ## XML5: No parse error.
3732     !!!parse-error (type => 'unclosed md'); ## TODO: type
3733     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3734     redo A;
3735     } else {
3736     ## XML5: Not defined yet.
3737    
3738     ## TODO: ...
3739    
3740     $self->{state} = BOGUS_COMMENT_STATE;
3741     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3742     ## Reconsume.
3743     redo A;
3744     }
3745    
3746 wakaba 1.1 } else {
3747     die "$0: $self->{state}: Unknown state";
3748     }
3749     } # A
3750    
3751     die "$0: _get_next_token: unexpected case";
3752     } # _get_next_token
3753    
3754     1;
3755 wakaba 1.14 ## $Date: 2008/10/16 03:39:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24