/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download)
Fri Oct 17 07:14:29 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.13: +659 -12 lines
++ whatpm/t/ChangeLog	17 Oct 2008 07:14:01 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/attlists-1.dat" added.

++ whatpm/t/xml/ChangeLog	17 Oct 2008 07:14:24 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat: New test data file.

	* doctypes-2.dat: New tests added.

++ whatpm/Whatpm/ChangeLog	17 Oct 2008 07:11:25 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (node_name): New attribute.
	(ELEMENT_TYPE_DEFINITION_NODE, ATTRIBUTE_DEFINITION_NODE): New
	constants.
	(create_element_type_definition_node, create_attribute_definition,
	create_notation, create_general_entity,
	get_element_type_definition_node,
	set_element_type_definition_node, get_general_entity_node,
	set_general_entity_node, get_notation_node, set_notation_node,
	get_attribute_definition_node, set_attribute_definition_node): New
	methods.
	(element_types, entities, notations, attribute_definitions): New
	attributes.
	(DocumentType): Support for child nodes, entities, notations, and
	element types.
	(Entity, Notation, ElementTypeDefinition, AttributeDefinition):
	New classes.

	* Dumper.pm: Support for general entities, notations, element type
	definitions, and attribute definitions.

++ whatpm/Whatpm/HTML/ChangeLog	17 Oct 2008 07:12:26 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token types ATTLIST_TOKEN, ELEMENT_TOKEN,
	GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and NOTATION_TOKEN
	are added.  New insertion modes for markup declarations are added.

++ whatpm/Whatpm/XML/ChangeLog	17 Oct 2008 07:13:47 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): Support for ELEMENT_TOKEN,
	ATTLIST_TOKEN, GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and
	NOTATION_TOKEN.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.14 our $VERSION=do{my @r=(q$Revision: 1.13 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN { # Exporter setup runs at compile time; package Whatpm::HTML below imports ':token' from its own BEGIN block.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
      ## Names that may be imported individually on request (nothing is
      ## exported by default: only @EXPORT_OK is populated here).
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
      ## The ':token' tag lists the same fourteen token-type constants as
      ## @EXPORT_OK; keep the two lists in sync when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
      ## Integer codes stored in a token's |->{type}| field.  Each is an
      ## empty-prototype sub returning a literal, and each is exportable
      ## through the ':token' tag declared in the BEGIN block above.
49    
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
      ## Three single-bit flags; $self->{content_model} holds one of the
      ## four *_CONTENT_MODEL combinations below and is tested with |&|
      ## in the data and tag open states.
78    
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither "&" nor "<" is markup
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" plus limited "<" markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" plus full "<" markup
87    
88     ## Tokenizer states
      ## Values of $self->{state}; |_get_next_token| dispatches on them
      ## with |==|.  Codes 1 and 12 are retired (their definitions are
      ## commented out below) and must not be reused without checking
      ## for stale references.
89    
90     sub DATA_STATE () { 0 } # initial state, set by |_initialize_tokenizer|
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 } # entered from the tag open state on "<?" when $self->{is_xml} is true
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.8
168 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
169     ## list and descriptions)
      ## NOTE(review): both literals below have the same numeric value
      ## (a one followed by eleven zero bits; the "_" is only a digit
      ## separator).  Presumably the insertion-mode flag and the element
      ## flag are meant to coincide -- confirm against Whatpm::HTML.
170    
171     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
172     sub FOREIGN_EL () { 0b1_00000000000 }
173    
174     ## Character reference mappings
      ## Code point => replacement code point.  The literal part maps
      ## 0x0D to LINE FEED and each of 0x80..0x9F either to a specific
      ## character or to U+FFFD; the trailing |for| statement then maps
      ## further ranges (C0 controls other than HT/LF/FF/CR, 0x7F, the
      ## surrogate range, 0xFDD0..0xFDDF, and the last two code points
      ## of every plane) to U+FFFD.
      ## NOTE(review): presumably consulted when a numeric character
      ## reference is resolved; the use site is not in this part of the
      ## file -- confirm.
175    
176     my $charref_map = {
177     0x0D => 0x000A,
178     0x80 => 0x20AC,
179     0x81 => 0xFFFD,
180     0x82 => 0x201A,
181     0x83 => 0x0192,
182     0x84 => 0x201E,
183     0x85 => 0x2026,
184     0x86 => 0x2020,
185     0x87 => 0x2021,
186     0x88 => 0x02C6,
187     0x89 => 0x2030,
188     0x8A => 0x0160,
189     0x8B => 0x2039,
190     0x8C => 0x0152,
191     0x8D => 0xFFFD,
192     0x8E => 0x017D,
193     0x8F => 0xFFFD,
194     0x90 => 0xFFFD,
195     0x91 => 0x2018,
196     0x92 => 0x2019,
197     0x93 => 0x201C,
198     0x94 => 0x201D,
199     0x95 => 0x2022,
200     0x96 => 0x2013,
201     0x97 => 0x2014,
202     0x98 => 0x02DC,
203     0x99 => 0x2122,
204     0x9A => 0x0161,
205     0x9B => 0x203A,
206     0x9C => 0x0153,
207     0x9D => 0xFFFD,
208     0x9E => 0x017E,
209     0x9F => 0x0178,
210     }; # $charref_map
      ## The ISSUE marker on the 0xFDD0..0xFDDF range flags that the range
      ## stops at 0xFDDF and so leaves 0xFDE0..0xFDEF unmapped.
211     $charref_map->{$_} = 0xFFFD
212     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
213     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
214     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
215     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
216     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
217     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
218     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
219    
220     ## Implementations MUST act as if they used the state machine in the spec.
221    
222     sub _initialize_tokenizer ($) {
      ## Resets the tokenizer fields of the parser object passed as the
      ## only argument: state, data-state keyword, content model, current
      ## token and attribute, last start tag name, self-closing flag,
      ## character buffer, and the queue of pushed-back tokens.  It then
      ## fetches the first input character into $self->{nc}.  There is
      ## no explicit return value.
223     my $self = shift;
224    
225     ## NOTE: Fields set by |new| constructor:
226     #$self->{level}
227     #$self->{set_nc}
228     #$self->{parse_error}
229 wakaba 1.3 #$self->{is_xml} (if XML)
230 wakaba 1.1
231     $self->{state} = DATA_STATE; # MUST
232 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
233     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
234 wakaba 1.1 #$self->{entity__value}; # initialized when used
235     #$self->{entity__match}; # initialized when used
236     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
237     undef $self->{ct}; # current token
238     undef $self->{ca}; # current attribute
239     undef $self->{last_stag_name}; # last emitted start tag name
240     #$self->{prev_state}; # initialized when used
241     delete $self->{self_closing};
242     $self->{char_buffer} = '';
243     $self->{char_buffer_pos} = 0;
244     $self->{nc} = -1; # next input character
245     #$self->{next_nc}
246    
      ## Fetch the first input character.  This is the same "next input
      ## character" sequence that |_get_next_token| repeats after each
      ## consumed character.
      ## NOTE(review): char_buffer was just reset to '' and
      ## char_buffer_pos to 0, so the condition below is always false
      ## here and only the |set_nc| callback in the else arm ever runs.
247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
248     $self->{line_prev} = $self->{line};
249     $self->{column_prev} = $self->{column};
250     $self->{column}++;
251     $self->{nc}
252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
253     } else {
254     $self->{set_nc}->($self);
255     }
256    
257     $self->{token} = []; # pushed-back tokens; |_get_next_token| shifts from this queue first
258     # $self->{escape}
259     } # _initialize_tokenizer
260    
261     ## A token has:
262     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
263 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
264 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
265     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
266 wakaba 1.11 ## ->{target} (PI_TOKEN)
267 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
268     ## ->{sysid} (DOCTYPE_TOKEN)
269     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
270     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
271     ## ->{name}
272     ## ->{value}
273     ## ->{has_reference} == 1 or 0
274 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
275     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
276 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
277 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
278 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
279    
280 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
281     ## A token's |->{self_closing}| is used to save the value of
282     ## |$self->{self_closing}| while the token is pushed back to the stack.
283    
284     ## Emitted token MUST immediately be handled by the tree construction state.
285    
286     ## Before each step, UA MAY check to see if either one of the scripts in
287     ## "list of scripts that will execute as soon as possible" or the first
288     ## script in the "list of scripts that will execute asynchronously",
289     ## has completed loading. If one has, then it MUST be executed
290     ## and removed from the list.
291    
292     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
293     ## (This requirement was dropped from HTML5 spec, unfortunately.)
294    
295     my $is_space = { # code point => 1 for each character the tokenizer treats as a space; looked up as $is_space->{$self->{nc}}
296     0x0009 => 1, # CHARACTER TABULATION (HT)
297     0x000A => 1, # LINE FEED (LF)
298     #0x000B => 0, # LINE TABULATION (VT)
299 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
300 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
301     0x0020 => 1, # SPACE (SP)
302     }; # VT and CR are commented out above, so both look up as false
303    
304     sub _get_next_token ($) {
305     my $self = shift;
306    
307     if ($self->{self_closing}) {
308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
309     ## NOTE: The |self_closing| flag is only set by start tag token.
310     ## In addition, when a start tag token is emitted, it is always set to
311     ## |ct|.
312     delete $self->{self_closing};
313     }
314    
315     if (@{$self->{token}}) {
316     $self->{self_closing} = $self->{token}->[0]->{self_closing};
317     return shift @{$self->{token}};
318     }
319    
320     A: {
321     if ($self->{state} == PCDATA_STATE) {
322     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
323    
324     if ($self->{nc} == 0x0026) { # &
325    
326     ## NOTE: In the spec, the tokenizer is switched to the
327     ## "entity data state". In this implementation, the tokenizer
328     ## is switched to the |ENTITY_STATE|, which is an implementation
329     ## of the "consume a character reference" algorithm.
330     $self->{entity_add} = -1;
331     $self->{prev_state} = DATA_STATE;
332     $self->{state} = ENTITY_STATE;
333    
334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
335     $self->{line_prev} = $self->{line};
336     $self->{column_prev} = $self->{column};
337     $self->{column}++;
338     $self->{nc}
339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
340     } else {
341     $self->{set_nc}->($self);
342     }
343    
344     redo A;
345     } elsif ($self->{nc} == 0x003C) { # <
346    
347     $self->{state} = TAG_OPEN_STATE;
348    
349     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
350     $self->{line_prev} = $self->{line};
351     $self->{column_prev} = $self->{column};
352     $self->{column}++;
353     $self->{nc}
354     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
355     } else {
356     $self->{set_nc}->($self);
357     }
358    
359     redo A;
360     } elsif ($self->{nc} == -1) {
361    
362     return ({type => END_OF_FILE_TOKEN,
363     line => $self->{line}, column => $self->{column}});
364     last A; ## TODO: ok?
365     } else {
366    
367     #
368     }
369    
370     # Anything else
371     my $token = {type => CHARACTER_TOKEN,
372     data => chr $self->{nc},
373     line => $self->{line}, column => $self->{column},
374     };
375     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376    
377     ## Stay in the state.
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     return ($token);
390     redo A;
391     } elsif ($self->{state} == DATA_STATE) {
392     $self->{s_kwd} = '' unless defined $self->{s_kwd};
393     if ($self->{nc} == 0x0026) { # &
394     $self->{s_kwd} = '';
395     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
396     not $self->{escape}) {
397    
398     ## NOTE: In the spec, the tokenizer is switched to the
399     ## "entity data state". In this implementation, the tokenizer
400     ## is switched to the |ENTITY_STATE|, which is an implementation
401     ## of the "consume a character reference" algorithm.
402     $self->{entity_add} = -1;
403     $self->{prev_state} = DATA_STATE;
404     $self->{state} = ENTITY_STATE;
405    
406     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
407     $self->{line_prev} = $self->{line};
408     $self->{column_prev} = $self->{column};
409     $self->{column}++;
410     $self->{nc}
411     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
412     } else {
413     $self->{set_nc}->($self);
414     }
415    
416     redo A;
417     } else {
418    
419     #
420     }
421     } elsif ($self->{nc} == 0x002D) { # -
422     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
423 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
424 wakaba 1.1
425     $self->{escape} = 1; # unless $self->{escape};
426     $self->{s_kwd} = '--';
427     #
428 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
429 wakaba 1.1
430     $self->{s_kwd} = '--';
431     #
432 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
433    
434     $self->{s_kwd} .= '-';
435     #
436 wakaba 1.1 } else {
437    
438 wakaba 1.5 $self->{s_kwd} = '-';
439 wakaba 1.1 #
440     }
441     }
442    
443     #
444     } elsif ($self->{nc} == 0x0021) { # !
445     if (length $self->{s_kwd}) {
446    
447     $self->{s_kwd} .= '!';
448     #
449     } else {
450    
451     #$self->{s_kwd} = '';
452     #
453     }
454     #
455     } elsif ($self->{nc} == 0x003C) { # <
456     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
457     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
458     not $self->{escape})) {
459    
460     $self->{state} = TAG_OPEN_STATE;
461    
462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
463     $self->{line_prev} = $self->{line};
464     $self->{column_prev} = $self->{column};
465     $self->{column}++;
466     $self->{nc}
467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
468     } else {
469     $self->{set_nc}->($self);
470     }
471    
472     redo A;
473     } else {
474    
475     $self->{s_kwd} = '';
476     #
477     }
478     } elsif ($self->{nc} == 0x003E) { # >
479     if ($self->{escape} and
480     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
481     if ($self->{s_kwd} eq '--') {
482    
483     delete $self->{escape};
484 wakaba 1.5 #
485 wakaba 1.1 } else {
486    
487 wakaba 1.5 #
488 wakaba 1.1 }
489 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
490    
491     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
492     line => $self->{line_prev},
493     column => $self->{column_prev} - 1);
494     #
495 wakaba 1.1 } else {
496    
497 wakaba 1.5 #
498 wakaba 1.1 }
499    
500     $self->{s_kwd} = '';
501     #
502 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
503     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
504    
505     $self->{s_kwd} .= ']';
506     } elsif ($self->{s_kwd} eq ']]') {
507    
508     #
509     } else {
510    
511     $self->{s_kwd} = '';
512     }
513     #
514 wakaba 1.1 } elsif ($self->{nc} == -1) {
515    
516     $self->{s_kwd} = '';
517     return ({type => END_OF_FILE_TOKEN,
518     line => $self->{line}, column => $self->{column}});
519     last A; ## TODO: ok?
520     } else {
521    
522     $self->{s_kwd} = '';
523     #
524     }
525    
526     # Anything else
527     my $token = {type => CHARACTER_TOKEN,
528     data => chr $self->{nc},
529     line => $self->{line}, column => $self->{column},
530     };
531 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
532 wakaba 1.1 length $token->{data})) {
533     $self->{s_kwd} = '';
534     }
535    
536     ## Stay in the data state.
537 wakaba 1.5 if (not $self->{is_xml} and
538     $self->{content_model} == PCDATA_CONTENT_MODEL) {
539 wakaba 1.1
540     $self->{state} = PCDATA_STATE;
541     } else {
542    
543     ## Stay in the state.
544     }
545    
546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
547     $self->{line_prev} = $self->{line};
548     $self->{column_prev} = $self->{column};
549     $self->{column}++;
550     $self->{nc}
551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
552     } else {
553     $self->{set_nc}->($self);
554     }
555    
556     return ($token);
557     redo A;
558     } elsif ($self->{state} == TAG_OPEN_STATE) {
559 wakaba 1.10 ## XML5: "tag state".
560    
561 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
562     if ($self->{nc} == 0x002F) { # /
563    
564    
565     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
566     $self->{line_prev} = $self->{line};
567     $self->{column_prev} = $self->{column};
568     $self->{column}++;
569     $self->{nc}
570     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
571     } else {
572     $self->{set_nc}->($self);
573     }
574    
575     $self->{state} = CLOSE_TAG_OPEN_STATE;
576     redo A;
577     } elsif ($self->{nc} == 0x0021) { # !
578    
579 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
580 wakaba 1.1 #
581     } else {
582    
583 wakaba 1.12 $self->{s_kwd} = '';
584 wakaba 1.1 #
585     }
586    
587     ## reconsume
588     $self->{state} = DATA_STATE;
589     return ({type => CHARACTER_TOKEN, data => '<',
590     line => $self->{line_prev},
591     column => $self->{column_prev},
592     });
593     redo A;
594     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
595     if ($self->{nc} == 0x0021) { # !
596    
597     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
598    
599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
600     $self->{line_prev} = $self->{line};
601     $self->{column_prev} = $self->{column};
602     $self->{column}++;
603     $self->{nc}
604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
605     } else {
606     $self->{set_nc}->($self);
607     }
608    
609     redo A;
610     } elsif ($self->{nc} == 0x002F) { # /
611    
612     $self->{state} = CLOSE_TAG_OPEN_STATE;
613    
614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
615     $self->{line_prev} = $self->{line};
616     $self->{column_prev} = $self->{column};
617     $self->{column}++;
618     $self->{nc}
619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
620     } else {
621     $self->{set_nc}->($self);
622     }
623    
624     redo A;
625     } elsif (0x0041 <= $self->{nc} and
626     $self->{nc} <= 0x005A) { # A..Z
627    
628     $self->{ct}
629     = {type => START_TAG_TOKEN,
630 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
631 wakaba 1.1 line => $self->{line_prev},
632     column => $self->{column_prev}};
633     $self->{state} = TAG_NAME_STATE;
634    
635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
636     $self->{line_prev} = $self->{line};
637     $self->{column_prev} = $self->{column};
638     $self->{column}++;
639     $self->{nc}
640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
641     } else {
642     $self->{set_nc}->($self);
643     }
644    
645     redo A;
646     } elsif (0x0061 <= $self->{nc} and
647     $self->{nc} <= 0x007A) { # a..z
648    
649     $self->{ct} = {type => START_TAG_TOKEN,
650     tag_name => chr ($self->{nc}),
651     line => $self->{line_prev},
652     column => $self->{column_prev}};
653     $self->{state} = TAG_NAME_STATE;
654    
655     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
656     $self->{line_prev} = $self->{line};
657     $self->{column_prev} = $self->{column};
658     $self->{column}++;
659     $self->{nc}
660     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
661     } else {
662     $self->{set_nc}->($self);
663     }
664    
665     redo A;
666     } elsif ($self->{nc} == 0x003E) { # >
667    
668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
669     line => $self->{line_prev},
670     column => $self->{column_prev});
671     $self->{state} = DATA_STATE;
672 wakaba 1.5 $self->{s_kwd} = '';
673 wakaba 1.1
674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
675     $self->{line_prev} = $self->{line};
676     $self->{column_prev} = $self->{column};
677     $self->{column}++;
678     $self->{nc}
679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
680     } else {
681     $self->{set_nc}->($self);
682     }
683    
684    
685     return ({type => CHARACTER_TOKEN, data => '<>',
686     line => $self->{line_prev},
687     column => $self->{column_prev},
688     });
689    
690     redo A;
691     } elsif ($self->{nc} == 0x003F) { # ?
692 wakaba 1.8 if ($self->{is_xml}) {
693    
694     $self->{state} = PI_STATE;
695    
696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
697     $self->{line_prev} = $self->{line};
698     $self->{column_prev} = $self->{column};
699     $self->{column}++;
700     $self->{nc}
701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
702     } else {
703     $self->{set_nc}->($self);
704     }
705    
706     redo A;
707     } else {
708    
709     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
710     line => $self->{line_prev},
711     column => $self->{column_prev});
712     $self->{state} = BOGUS_COMMENT_STATE;
713     $self->{ct} = {type => COMMENT_TOKEN, data => '',
714     line => $self->{line_prev},
715     column => $self->{column_prev},
716     };
717     ## $self->{nc} is intentionally left as is
718     redo A;
719     }
720 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
721 wakaba 1.1
722     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
723     line => $self->{line_prev},
724     column => $self->{column_prev});
725     $self->{state} = DATA_STATE;
726 wakaba 1.5 $self->{s_kwd} = '';
727 wakaba 1.1 ## reconsume
728    
729     return ({type => CHARACTER_TOKEN, data => '<',
730     line => $self->{line_prev},
731     column => $self->{column_prev},
732     });
733    
734     redo A;
735 wakaba 1.9 } else {
736     ## XML5: "<:" is a parse error.
737    
738     $self->{ct} = {type => START_TAG_TOKEN,
739     tag_name => chr ($self->{nc}),
740     line => $self->{line_prev},
741     column => $self->{column_prev}};
742     $self->{state} = TAG_NAME_STATE;
743    
744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
745     $self->{line_prev} = $self->{line};
746     $self->{column_prev} = $self->{column};
747     $self->{column}++;
748     $self->{nc}
749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
750     } else {
751     $self->{set_nc}->($self);
752     }
753    
754     redo A;
755 wakaba 1.1 }
756     } else {
757     die "$0: $self->{content_model} in tag open";
758     }
759     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
760     ## NOTE: The "close tag open state" in the spec is implemented as
761     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
762    
763 wakaba 1.10 ## XML5: "end tag state".
764    
765 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
766     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
767     if (defined $self->{last_stag_name}) {
768     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
769 wakaba 1.12 $self->{kwd} = '';
770 wakaba 1.1 ## Reconsume.
771     redo A;
772     } else {
773     ## No start tag token has ever been emitted
774     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
775    
776     $self->{state} = DATA_STATE;
777 wakaba 1.5 $self->{s_kwd} = '';
778 wakaba 1.1 ## Reconsume.
779     return ({type => CHARACTER_TOKEN, data => '</',
780     line => $l, column => $c,
781     });
782     redo A;
783     }
784     }
785    
786     if (0x0041 <= $self->{nc} and
787     $self->{nc} <= 0x005A) { # A..Z
788    
789     $self->{ct}
790     = {type => END_TAG_TOKEN,
791 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
792 wakaba 1.1 line => $l, column => $c};
793     $self->{state} = TAG_NAME_STATE;
794    
795     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
796     $self->{line_prev} = $self->{line};
797     $self->{column_prev} = $self->{column};
798     $self->{column}++;
799     $self->{nc}
800     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
801     } else {
802     $self->{set_nc}->($self);
803     }
804    
805     redo A;
806     } elsif (0x0061 <= $self->{nc} and
807     $self->{nc} <= 0x007A) { # a..z
808    
809     $self->{ct} = {type => END_TAG_TOKEN,
810     tag_name => chr ($self->{nc}),
811     line => $l, column => $c};
812     $self->{state} = TAG_NAME_STATE;
813    
814     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
815     $self->{line_prev} = $self->{line};
816     $self->{column_prev} = $self->{column};
817     $self->{column}++;
818     $self->{nc}
819     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
820     } else {
821     $self->{set_nc}->($self);
822     }
823    
824     redo A;
825     } elsif ($self->{nc} == 0x003E) { # >
826     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
827     line => $self->{line_prev}, ## "<" in "</>"
828     column => $self->{column_prev} - 1);
829     $self->{state} = DATA_STATE;
830 wakaba 1.5 $self->{s_kwd} = '';
831 wakaba 1.10 if ($self->{is_xml}) {
832    
833     ## XML5: No parse error.
834    
835     ## NOTE: This parser raises a parse error, since it supports
836     ## XML1, not XML5.
837    
838     ## NOTE: A short end tag token.
839     my $ct = {type => END_TAG_TOKEN,
840     tag_name => '',
841     line => $self->{line_prev},
842     column => $self->{column_prev} - 1,
843     };
844    
845     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
846     $self->{line_prev} = $self->{line};
847     $self->{column_prev} = $self->{column};
848     $self->{column}++;
849     $self->{nc}
850     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
851     } else {
852     $self->{set_nc}->($self);
853     }
854    
855     return ($ct);
856     } else {
857    
858    
859 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
860     $self->{line_prev} = $self->{line};
861     $self->{column_prev} = $self->{column};
862     $self->{column}++;
863     $self->{nc}
864     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
865     } else {
866     $self->{set_nc}->($self);
867     }
868    
869 wakaba 1.10 }
870 wakaba 1.1 redo A;
871     } elsif ($self->{nc} == -1) {
872    
873     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
874 wakaba 1.5 $self->{s_kwd} = '';
875 wakaba 1.1 $self->{state} = DATA_STATE;
876     # reconsume
877    
878     return ({type => CHARACTER_TOKEN, data => '</',
879     line => $l, column => $c,
880     });
881    
882     redo A;
883 wakaba 1.10 } elsif (not $self->{is_xml} or
884     $is_space->{$self->{nc}}) {
885 wakaba 1.1
886 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
887     line => $self->{line_prev}, # "<" of "</"
888     column => $self->{column_prev} - 1);
889 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
890     $self->{ct} = {type => COMMENT_TOKEN, data => '',
891     line => $self->{line_prev}, # "<" of "</"
892     column => $self->{column_prev} - 1,
893     };
894     ## NOTE: $self->{nc} is intentionally left as is.
895     ## Although the "anything else" case of the spec does not explicitly
896     ## state that the next input character is to be reconsumed,
897     ## it will be included in the |data| of the comment token
898     ## generated from the bogus end tag, as defined in the
899     ## "bogus comment state" entry.
900     redo A;
901 wakaba 1.10 } else {
902     ## XML5: "</:" is a parse error.
903    
904     $self->{ct} = {type => END_TAG_TOKEN,
905     tag_name => chr ($self->{nc}),
906     line => $l, column => $c};
907     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
908    
909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
910     $self->{line_prev} = $self->{line};
911     $self->{column_prev} = $self->{column};
912     $self->{column}++;
913     $self->{nc}
914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
915     } else {
916     $self->{set_nc}->($self);
917     }
918    
919     redo A;
920 wakaba 1.1 }
921     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
922 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
923 wakaba 1.1 if (length $ch) {
924     my $CH = $ch;
925     $ch =~ tr/a-z/A-Z/;
926     my $nch = chr $self->{nc};
927     if ($nch eq $ch or $nch eq $CH) {
928    
929     ## Stay in the state.
930 wakaba 1.12 $self->{kwd} .= $nch;
931 wakaba 1.1
932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933     $self->{line_prev} = $self->{line};
934     $self->{column_prev} = $self->{column};
935     $self->{column}++;
936     $self->{nc}
937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938     } else {
939     $self->{set_nc}->($self);
940     }
941    
942     redo A;
943     } else {
944    
945     $self->{state} = DATA_STATE;
946 wakaba 1.5 $self->{s_kwd} = '';
947 wakaba 1.1 ## Reconsume.
948     return ({type => CHARACTER_TOKEN,
949 wakaba 1.12 data => '</' . $self->{kwd},
950 wakaba 1.1 line => $self->{line_prev},
951 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
952 wakaba 1.1 });
953     redo A;
954     }
955     } else { # after "<{tag-name}"
956     unless ($is_space->{$self->{nc}} or
957     {
958     0x003E => 1, # >
959     0x002F => 1, # /
960     -1 => 1, # EOF
961     }->{$self->{nc}}) {
962    
963     ## Reconsume.
964     $self->{state} = DATA_STATE;
965 wakaba 1.5 $self->{s_kwd} = '';
966 wakaba 1.1 return ({type => CHARACTER_TOKEN,
967 wakaba 1.12 data => '</' . $self->{kwd},
968 wakaba 1.1 line => $self->{line_prev},
969 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
970 wakaba 1.1 });
971     redo A;
972     } else {
973    
974     $self->{ct}
975     = {type => END_TAG_TOKEN,
976     tag_name => $self->{last_stag_name},
977     line => $self->{line_prev},
978 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
979 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
980     ## Reconsume.
981     redo A;
982     }
983     }
984     } elsif ($self->{state} == TAG_NAME_STATE) {
985     if ($is_space->{$self->{nc}}) {
986    
987     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
988    
989     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
990     $self->{line_prev} = $self->{line};
991     $self->{column_prev} = $self->{column};
992     $self->{column}++;
993     $self->{nc}
994     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
995     } else {
996     $self->{set_nc}->($self);
997     }
998    
999     redo A;
1000     } elsif ($self->{nc} == 0x003E) { # >
1001     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1002    
1003     $self->{last_stag_name} = $self->{ct}->{tag_name};
1004     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1005     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1006     #if ($self->{ct}->{attributes}) {
1007     # ## NOTE: This should never be reached.
1008     # !!! cp (36);
1009     # !!! parse-error (type => 'end tag attribute');
1010     #} else {
1011    
1012     #}
1013     } else {
1014     die "$0: $self->{ct}->{type}: Unknown token type";
1015     }
1016     $self->{state} = DATA_STATE;
1017 wakaba 1.5 $self->{s_kwd} = '';
1018 wakaba 1.1
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029    
1030     return ($self->{ct}); # start tag or end tag
1031    
1032     redo A;
1033     } elsif (0x0041 <= $self->{nc} and
1034     $self->{nc} <= 0x005A) { # A..Z
1035    
1036 wakaba 1.4 $self->{ct}->{tag_name}
1037     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1038 wakaba 1.1 # start tag or end tag
1039     ## Stay in this state
1040    
1041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1042     $self->{line_prev} = $self->{line};
1043     $self->{column_prev} = $self->{column};
1044     $self->{column}++;
1045     $self->{nc}
1046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1047     } else {
1048     $self->{set_nc}->($self);
1049     }
1050    
1051     redo A;
1052     } elsif ($self->{nc} == -1) {
1053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1054     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1055    
1056     $self->{last_stag_name} = $self->{ct}->{tag_name};
1057     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1058     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1059     #if ($self->{ct}->{attributes}) {
1060     # ## NOTE: This state should never be reached.
1061     # !!! cp (40);
1062     # !!! parse-error (type => 'end tag attribute');
1063     #} else {
1064    
1065     #}
1066     } else {
1067     die "$0: $self->{ct}->{type}: Unknown token type";
1068     }
1069     $self->{state} = DATA_STATE;
1070 wakaba 1.5 $self->{s_kwd} = '';
1071 wakaba 1.1 # reconsume
1072    
1073     return ($self->{ct}); # start tag or end tag
1074    
1075     redo A;
1076     } elsif ($self->{nc} == 0x002F) { # /
1077    
1078     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1079    
1080     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1081     $self->{line_prev} = $self->{line};
1082     $self->{column_prev} = $self->{column};
1083     $self->{column}++;
1084     $self->{nc}
1085     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1086     } else {
1087     $self->{set_nc}->($self);
1088     }
1089    
1090     redo A;
1091     } else {
1092    
1093     $self->{ct}->{tag_name} .= chr $self->{nc};
1094     # start tag or end tag
1095     ## Stay in the state
1096    
1097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1098     $self->{line_prev} = $self->{line};
1099     $self->{column_prev} = $self->{column};
1100     $self->{column}++;
1101     $self->{nc}
1102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1103     } else {
1104     $self->{set_nc}->($self);
1105     }
1106    
1107     redo A;
1108     }
1109     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1110 wakaba 1.11 ## XML5: "Tag attribute name before state".
1111    
1112 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1113    
1114     ## Stay in the state
1115    
1116     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1117     $self->{line_prev} = $self->{line};
1118     $self->{column_prev} = $self->{column};
1119     $self->{column}++;
1120     $self->{nc}
1121     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1122     } else {
1123     $self->{set_nc}->($self);
1124     }
1125    
1126     redo A;
1127     } elsif ($self->{nc} == 0x003E) { # >
1128     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1129    
1130     $self->{last_stag_name} = $self->{ct}->{tag_name};
1131     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1132     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1133     if ($self->{ct}->{attributes}) {
1134    
1135     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1136     } else {
1137    
1138     }
1139     } else {
1140     die "$0: $self->{ct}->{type}: Unknown token type";
1141     }
1142     $self->{state} = DATA_STATE;
1143 wakaba 1.5 $self->{s_kwd} = '';
1144 wakaba 1.1
1145     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1146     $self->{line_prev} = $self->{line};
1147     $self->{column_prev} = $self->{column};
1148     $self->{column}++;
1149     $self->{nc}
1150     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1151     } else {
1152     $self->{set_nc}->($self);
1153     }
1154    
1155    
1156     return ($self->{ct}); # start tag or end tag
1157    
1158     redo A;
1159     } elsif (0x0041 <= $self->{nc} and
1160     $self->{nc} <= 0x005A) { # A..Z
1161    
1162     $self->{ca}
1163 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1164 wakaba 1.1 value => '',
1165     line => $self->{line}, column => $self->{column}};
1166     $self->{state} = ATTRIBUTE_NAME_STATE;
1167    
1168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1169     $self->{line_prev} = $self->{line};
1170     $self->{column_prev} = $self->{column};
1171     $self->{column}++;
1172     $self->{nc}
1173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1174     } else {
1175     $self->{set_nc}->($self);
1176     }
1177    
1178     redo A;
1179     } elsif ($self->{nc} == 0x002F) { # /
1180    
1181     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1182    
1183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1184     $self->{line_prev} = $self->{line};
1185     $self->{column_prev} = $self->{column};
1186     $self->{column}++;
1187     $self->{nc}
1188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1189     } else {
1190     $self->{set_nc}->($self);
1191     }
1192    
1193     redo A;
1194     } elsif ($self->{nc} == -1) {
1195     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1196     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1197    
1198     $self->{last_stag_name} = $self->{ct}->{tag_name};
1199     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1200     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1201     if ($self->{ct}->{attributes}) {
1202    
1203     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1204     } else {
1205    
1206     }
1207     } else {
1208     die "$0: $self->{ct}->{type}: Unknown token type";
1209     }
1210     $self->{state} = DATA_STATE;
1211 wakaba 1.5 $self->{s_kwd} = '';
1212 wakaba 1.1 # reconsume
1213    
1214     return ($self->{ct}); # start tag or end tag
1215    
1216     redo A;
1217     } else {
1218     if ({
1219     0x0022 => 1, # "
1220     0x0027 => 1, # '
1221     0x003D => 1, # =
1222     }->{$self->{nc}}) {
1223    
1224 wakaba 1.11 ## XML5: Not a parse error.
1225 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1226     } else {
1227    
1228 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1229 wakaba 1.1 }
1230     $self->{ca}
1231     = {name => chr ($self->{nc}),
1232     value => '',
1233     line => $self->{line}, column => $self->{column}};
1234     $self->{state} = ATTRIBUTE_NAME_STATE;
1235    
1236     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1237     $self->{line_prev} = $self->{line};
1238     $self->{column_prev} = $self->{column};
1239     $self->{column}++;
1240     $self->{nc}
1241     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1242     } else {
1243     $self->{set_nc}->($self);
1244     }
1245    
1246     redo A;
1247     }
1248     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1249 wakaba 1.11 ## XML5: "Tag attribute name state".
1250    
1251 wakaba 1.1 my $before_leave = sub { ## Commits the attribute being built ($self->{ca}) to the current tag token ($self->{ct}); invoked by the branches below that switch to another state.
1252     if (exists $self->{ct}->{attributes} # start tag or end tag
1253     ->{$self->{ca}->{name}}) { # MUST
1254     ## An attribute with this name is already stored on the token: report the duplicate at the line/column saved in $self->{ca}. (The |exists| test above autovivifies ->{attributes} as a side effect.)
1255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1256     ## Discard $self->{ca} (MUST): nothing is stored here, so the earlier attribute of the same name is kept.
1257     } else {
1258     ## First occurrence of this name: store the attribute under its name and number it with the token's running |last_index| counter.
1259     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1260     = $self->{ca};
1261 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1262 wakaba 1.1 }
1263     }; # $before_leave
1264    
1265     if ($is_space->{$self->{nc}}) {
1266    
1267     $before_leave->();
1268     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1269    
1270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1271     $self->{line_prev} = $self->{line};
1272     $self->{column_prev} = $self->{column};
1273     $self->{column}++;
1274     $self->{nc}
1275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1276     } else {
1277     $self->{set_nc}->($self);
1278     }
1279    
1280     redo A;
1281     } elsif ($self->{nc} == 0x003D) { # =
1282    
1283     $before_leave->();
1284     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1285    
1286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1287     $self->{line_prev} = $self->{line};
1288     $self->{column_prev} = $self->{column};
1289     $self->{column}++;
1290     $self->{nc}
1291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1292     } else {
1293     $self->{set_nc}->($self);
1294     }
1295    
1296     redo A;
1297     } elsif ($self->{nc} == 0x003E) { # >
1298 wakaba 1.11 if ($self->{is_xml}) {
1299    
1300     ## XML5: Not a parse error.
1301     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1302     } else {
1303    
1304     }
1305    
1306 wakaba 1.1 $before_leave->();
1307     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1308    
1309     $self->{last_stag_name} = $self->{ct}->{tag_name};
1310     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1311    
1312     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1313     if ($self->{ct}->{attributes}) {
1314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1315     }
1316     } else {
1317     die "$0: $self->{ct}->{type}: Unknown token type";
1318     }
1319     $self->{state} = DATA_STATE;
1320 wakaba 1.5 $self->{s_kwd} = '';
1321 wakaba 1.1
1322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1323     $self->{line_prev} = $self->{line};
1324     $self->{column_prev} = $self->{column};
1325     $self->{column}++;
1326     $self->{nc}
1327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1328     } else {
1329     $self->{set_nc}->($self);
1330     }
1331    
1332    
1333     return ($self->{ct}); # start tag or end tag
1334    
1335     redo A;
1336     } elsif (0x0041 <= $self->{nc} and
1337     $self->{nc} <= 0x005A) { # A..Z
1338    
1339 wakaba 1.4 $self->{ca}->{name}
1340     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1341 wakaba 1.1 ## Stay in the state
1342    
1343     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1344     $self->{line_prev} = $self->{line};
1345     $self->{column_prev} = $self->{column};
1346     $self->{column}++;
1347     $self->{nc}
1348     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1349     } else {
1350     $self->{set_nc}->($self);
1351     }
1352    
1353     redo A;
1354     } elsif ($self->{nc} == 0x002F) { # /
1355 wakaba 1.11 if ($self->{is_xml}) {
1356    
1357     ## XML5: Not a parse error.
1358     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1359     } else {
1360    
1361     }
1362 wakaba 1.1
1363     $before_leave->();
1364     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1365    
1366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1367     $self->{line_prev} = $self->{line};
1368     $self->{column_prev} = $self->{column};
1369     $self->{column}++;
1370     $self->{nc}
1371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1372     } else {
1373     $self->{set_nc}->($self);
1374     }
1375    
1376     redo A;
1377     } elsif ($self->{nc} == -1) {
1378     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1379     $before_leave->();
1380     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1381    
1382     $self->{last_stag_name} = $self->{ct}->{tag_name};
1383     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1384     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1385     if ($self->{ct}->{attributes}) {
1386    
1387     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1388     } else {
1389     ## NOTE: This state should never be reached.
1390    
1391     }
1392     } else {
1393     die "$0: $self->{ct}->{type}: Unknown token type";
1394     }
1395     $self->{state} = DATA_STATE;
1396 wakaba 1.5 $self->{s_kwd} = '';
1397 wakaba 1.1 # reconsume
1398    
1399     return ($self->{ct}); # start tag or end tag
1400    
1401     redo A;
1402     } else {
1403     if ($self->{nc} == 0x0022 or # "
1404     $self->{nc} == 0x0027) { # '
1405    
1406 wakaba 1.11 ## XML5: Not a parse error.
1407 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1408     } else {
1409    
1410     }
1411     $self->{ca}->{name} .= chr ($self->{nc});
1412     ## Stay in the state
1413    
1414     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1415     $self->{line_prev} = $self->{line};
1416     $self->{column_prev} = $self->{column};
1417     $self->{column}++;
1418     $self->{nc}
1419     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1420     } else {
1421     $self->{set_nc}->($self);
1422     }
1423    
1424     redo A;
1425     }
1426     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1427 wakaba 1.11 ## XML5: "Tag attribute name after state".
1428    
1429 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1430    
1431     ## Stay in the state
1432    
1433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1434     $self->{line_prev} = $self->{line};
1435     $self->{column_prev} = $self->{column};
1436     $self->{column}++;
1437     $self->{nc}
1438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1439     } else {
1440     $self->{set_nc}->($self);
1441     }
1442    
1443     redo A;
1444     } elsif ($self->{nc} == 0x003D) { # =
1445    
1446     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1447    
1448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1449     $self->{line_prev} = $self->{line};
1450     $self->{column_prev} = $self->{column};
1451     $self->{column}++;
1452     $self->{nc}
1453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1454     } else {
1455     $self->{set_nc}->($self);
1456     }
1457    
1458     redo A;
1459     } elsif ($self->{nc} == 0x003E) { # >
1460 wakaba 1.11 if ($self->{is_xml}) {
1461    
1462     ## XML5: Not a parse error.
1463     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1464     } else {
1465    
1466     }
1467    
1468 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1469    
1470     $self->{last_stag_name} = $self->{ct}->{tag_name};
1471     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1472     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1473     if ($self->{ct}->{attributes}) {
1474    
1475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1476     } else {
1477     ## NOTE: This state should never be reached.
1478    
1479     }
1480     } else {
1481     die "$0: $self->{ct}->{type}: Unknown token type";
1482     }
1483     $self->{state} = DATA_STATE;
1484 wakaba 1.5 $self->{s_kwd} = '';
1485 wakaba 1.1
1486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1487     $self->{line_prev} = $self->{line};
1488     $self->{column_prev} = $self->{column};
1489     $self->{column}++;
1490     $self->{nc}
1491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1492     } else {
1493     $self->{set_nc}->($self);
1494     }
1495    
1496    
1497     return ($self->{ct}); # start tag or end tag
1498    
1499     redo A;
1500     } elsif (0x0041 <= $self->{nc} and
1501     $self->{nc} <= 0x005A) { # A..Z
1502    
1503     $self->{ca}
1504 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1505 wakaba 1.1 value => '',
1506     line => $self->{line}, column => $self->{column}};
1507     $self->{state} = ATTRIBUTE_NAME_STATE;
1508    
1509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1510     $self->{line_prev} = $self->{line};
1511     $self->{column_prev} = $self->{column};
1512     $self->{column}++;
1513     $self->{nc}
1514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1515     } else {
1516     $self->{set_nc}->($self);
1517     }
1518    
1519     redo A;
1520     } elsif ($self->{nc} == 0x002F) { # /
1521 wakaba 1.11 if ($self->{is_xml}) {
1522    
1523     ## XML5: Not a parse error.
1524     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1525     } else {
1526    
1527     }
1528 wakaba 1.1
1529     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1530    
1531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1532     $self->{line_prev} = $self->{line};
1533     $self->{column_prev} = $self->{column};
1534     $self->{column}++;
1535     $self->{nc}
1536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1537     } else {
1538     $self->{set_nc}->($self);
1539     }
1540    
1541     redo A;
1542     } elsif ($self->{nc} == -1) {
1543     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1544     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1545    
1546     $self->{last_stag_name} = $self->{ct}->{tag_name};
1547     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1548     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1549     if ($self->{ct}->{attributes}) {
1550    
1551     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1552     } else {
1553     ## NOTE: This state should never be reached.
1554    
1555     }
1556     } else {
1557     die "$0: $self->{ct}->{type}: Unknown token type";
1558     }
1559 wakaba 1.5 $self->{s_kwd} = '';
1560 wakaba 1.1 $self->{state} = DATA_STATE;
1561     # reconsume
1562    
1563     return ($self->{ct}); # start tag or end tag
1564    
1565     redo A;
1566     } else {
1567 wakaba 1.11 if ($self->{is_xml}) {
1568    
1569     ## XML5: Not a parse error.
1570     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1571     } else {
1572    
1573     }
1574    
1575 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1576     $self->{nc} == 0x0027) { # '
1577    
1578 wakaba 1.11 ## XML5: Not a parse error.
1579 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1580     } else {
1581    
1582     }
1583     $self->{ca}
1584     = {name => chr ($self->{nc}),
1585     value => '',
1586     line => $self->{line}, column => $self->{column}};
1587     $self->{state} = ATTRIBUTE_NAME_STATE;
1588    
1589     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1590     $self->{line_prev} = $self->{line};
1591     $self->{column_prev} = $self->{column};
1592     $self->{column}++;
1593     $self->{nc}
1594     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1595     } else {
1596     $self->{set_nc}->($self);
1597     }
1598    
1599     redo A;
1600     }
1601     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1602 wakaba 1.11 ## XML5: "Tag attribute value before state".
1603    
1604 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1605    
1606     ## Stay in the state
1607    
1608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1609     $self->{line_prev} = $self->{line};
1610     $self->{column_prev} = $self->{column};
1611     $self->{column}++;
1612     $self->{nc}
1613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1614     } else {
1615     $self->{set_nc}->($self);
1616     }
1617    
1618     redo A;
1619     } elsif ($self->{nc} == 0x0022) { # "
1620    
1621     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1622    
1623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1624     $self->{line_prev} = $self->{line};
1625     $self->{column_prev} = $self->{column};
1626     $self->{column}++;
1627     $self->{nc}
1628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1629     } else {
1630     $self->{set_nc}->($self);
1631     }
1632    
1633     redo A;
1634     } elsif ($self->{nc} == 0x0026) { # &
1635    
1636     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1637     ## reconsume
1638     redo A;
1639     } elsif ($self->{nc} == 0x0027) { # '
1640    
1641     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1642    
1643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644     $self->{line_prev} = $self->{line};
1645     $self->{column_prev} = $self->{column};
1646     $self->{column}++;
1647     $self->{nc}
1648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649     } else {
1650     $self->{set_nc}->($self);
1651     }
1652    
1653     redo A;
1654     } elsif ($self->{nc} == 0x003E) { # >
1655     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1656     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1657    
1658     $self->{last_stag_name} = $self->{ct}->{tag_name};
1659     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1661     if ($self->{ct}->{attributes}) {
1662    
1663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1664     } else {
1665     ## NOTE: This state should never be reached.
1666    
1667     }
1668     } else {
1669     die "$0: $self->{ct}->{type}: Unknown token type";
1670     }
1671     $self->{state} = DATA_STATE;
1672 wakaba 1.5 $self->{s_kwd} = '';
1673 wakaba 1.1
1674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1675     $self->{line_prev} = $self->{line};
1676     $self->{column_prev} = $self->{column};
1677     $self->{column}++;
1678     $self->{nc}
1679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1680     } else {
1681     $self->{set_nc}->($self);
1682     }
1683    
1684    
1685     return ($self->{ct}); # start tag or end tag
1686    
1687     redo A;
1688     } elsif ($self->{nc} == -1) {
1689     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1690     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1691    
1692     $self->{last_stag_name} = $self->{ct}->{tag_name};
1693     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1694     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1695     if ($self->{ct}->{attributes}) {
1696    
1697     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1698     } else {
1699     ## NOTE: This state should never be reached.
1700    
1701     }
1702     } else {
1703     die "$0: $self->{ct}->{type}: Unknown token type";
1704     }
1705     $self->{state} = DATA_STATE;
1706 wakaba 1.5 $self->{s_kwd} = '';
1707 wakaba 1.1 ## reconsume
1708    
1709     return ($self->{ct}); # start tag or end tag
1710    
1711     redo A;
1712     } else {
1713     if ($self->{nc} == 0x003D) { # =
1714    
1715 wakaba 1.11 ## XML5: Not a parse error.
1716 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1717 wakaba 1.11 } elsif ($self->{is_xml}) {
1718    
1719     ## XML5: No parse error.
1720     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1721 wakaba 1.1 } else {
1722    
1723     }
1724     $self->{ca}->{value} .= chr ($self->{nc});
1725     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1726    
1727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1728     $self->{line_prev} = $self->{line};
1729     $self->{column_prev} = $self->{column};
1730     $self->{column}++;
1731     $self->{nc}
1732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1733     } else {
1734     $self->{set_nc}->($self);
1735     }
1736    
1737     redo A;
1738     }
1739     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1740 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1741    
1742 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1743    
1744 wakaba 1.11 ## XML5: "Tag attribute name before state".
1745 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1746    
1747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1748     $self->{line_prev} = $self->{line};
1749     $self->{column_prev} = $self->{column};
1750     $self->{column}++;
1751     $self->{nc}
1752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1753     } else {
1754     $self->{set_nc}->($self);
1755     }
1756    
1757     redo A;
1758     } elsif ($self->{nc} == 0x0026) { # &
1759    
1760 wakaba 1.11 ## XML5: Not defined yet.
1761    
1762 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1763     ## "entity in attribute value state". In this implementation, the
1764     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1765     ## implementation of the "consume a character reference" algorithm.
1766     $self->{prev_state} = $self->{state};
1767     $self->{entity_add} = 0x0022; # "
1768     $self->{state} = ENTITY_STATE;
1769    
1770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1771     $self->{line_prev} = $self->{line};
1772     $self->{column_prev} = $self->{column};
1773     $self->{column}++;
1774     $self->{nc}
1775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1776     } else {
1777     $self->{set_nc}->($self);
1778     }
1779    
1780     redo A;
1781     } elsif ($self->{nc} == -1) {
1782     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1783     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1784    
1785     $self->{last_stag_name} = $self->{ct}->{tag_name};
1786     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1787     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1788     if ($self->{ct}->{attributes}) {
1789    
1790     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1791     } else {
1792     ## NOTE: This state should never be reached.
1793    
1794     }
1795     } else {
1796     die "$0: $self->{ct}->{type}: Unknown token type";
1797     }
1798     $self->{state} = DATA_STATE;
1799 wakaba 1.5 $self->{s_kwd} = '';
1800 wakaba 1.1 ## reconsume
1801    
1802     return ($self->{ct}); # start tag or end tag
1803    
1804     redo A;
1805     } else {
1806 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1807    
1808     ## XML5: Not a parse error.
1809     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1810     } else {
1811    
1812     }
1813 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1814     $self->{read_until}->($self->{ca}->{value},
1815 wakaba 1.11 q["&<],
1816 wakaba 1.1 length $self->{ca}->{value});
1817    
1818     ## Stay in the state
1819    
1820     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1821     $self->{line_prev} = $self->{line};
1822     $self->{column_prev} = $self->{column};
1823     $self->{column}++;
1824     $self->{nc}
1825     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1826     } else {
1827     $self->{set_nc}->($self);
1828     }
1829    
1830     redo A;
1831     }
1832     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1833 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1834    
1835 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1836    
1837 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1838 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1839    
1840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1841     $self->{line_prev} = $self->{line};
1842     $self->{column_prev} = $self->{column};
1843     $self->{column}++;
1844     $self->{nc}
1845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1846     } else {
1847     $self->{set_nc}->($self);
1848     }
1849    
1850     redo A;
1851     } elsif ($self->{nc} == 0x0026) { # &
1852    
1853 wakaba 1.11 ## XML5: Not defined yet.
1854    
1855 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1856     ## "entity in attribute value state". In this implementation, the
1857     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1858     ## implementation of the "consume a character reference" algorithm.
1859     $self->{entity_add} = 0x0027; # '
1860     $self->{prev_state} = $self->{state};
1861     $self->{state} = ENTITY_STATE;
1862    
1863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1864     $self->{line_prev} = $self->{line};
1865     $self->{column_prev} = $self->{column};
1866     $self->{column}++;
1867     $self->{nc}
1868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1869     } else {
1870     $self->{set_nc}->($self);
1871     }
1872    
1873     redo A;
1874     } elsif ($self->{nc} == -1) {
1875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1876     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1877    
1878     $self->{last_stag_name} = $self->{ct}->{tag_name};
1879     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1880     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1881     if ($self->{ct}->{attributes}) {
1882    
1883     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1884     } else {
1885     ## NOTE: This state should never be reached.
1886    
1887     }
1888     } else {
1889     die "$0: $self->{ct}->{type}: Unknown token type";
1890     }
1891     $self->{state} = DATA_STATE;
1892 wakaba 1.5 $self->{s_kwd} = '';
1893 wakaba 1.1 ## reconsume
1894    
1895     return ($self->{ct}); # start tag or end tag
1896    
1897     redo A;
1898     } else {
1899 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1900    
1901     ## XML5: Not a parse error.
1902     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1903     } else {
1904    
1905     }
1906 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1907     $self->{read_until}->($self->{ca}->{value},
1908 wakaba 1.11 q['&<],
1909 wakaba 1.1 length $self->{ca}->{value});
1910    
1911     ## Stay in the state
1912    
1913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1914     $self->{line_prev} = $self->{line};
1915     $self->{column_prev} = $self->{column};
1916     $self->{column}++;
1917     $self->{nc}
1918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1919     } else {
1920     $self->{set_nc}->($self);
1921     }
1922    
1923     redo A;
1924     }
1925     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1926 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1927    
1928 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1929    
1930 wakaba 1.11 ## XML5: "Tag attribute name before state".
1931 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1932    
1933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1934     $self->{line_prev} = $self->{line};
1935     $self->{column_prev} = $self->{column};
1936     $self->{column}++;
1937     $self->{nc}
1938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1939     } else {
1940     $self->{set_nc}->($self);
1941     }
1942    
1943     redo A;
1944     } elsif ($self->{nc} == 0x0026) { # &
1945    
1946 wakaba 1.11
1947     ## XML5: Not defined yet.
1948    
1949 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1950     ## "entity in attribute value state". In this implementation, the
1951     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1952     ## implementation of the "consume a character reference" algorithm.
1953     $self->{entity_add} = -1;
1954     $self->{prev_state} = $self->{state};
1955     $self->{state} = ENTITY_STATE;
1956    
1957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1958     $self->{line_prev} = $self->{line};
1959     $self->{column_prev} = $self->{column};
1960     $self->{column}++;
1961     $self->{nc}
1962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1963     } else {
1964     $self->{set_nc}->($self);
1965     }
1966    
1967     redo A;
1968     } elsif ($self->{nc} == 0x003E) { # >
1969     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970    
1971     $self->{last_stag_name} = $self->{ct}->{tag_name};
1972     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1973     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1974     if ($self->{ct}->{attributes}) {
1975    
1976     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1977     } else {
1978     ## NOTE: This state should never be reached.
1979    
1980     }
1981     } else {
1982     die "$0: $self->{ct}->{type}: Unknown token type";
1983     }
1984     $self->{state} = DATA_STATE;
1985 wakaba 1.5 $self->{s_kwd} = '';
1986 wakaba 1.1
1987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1988     $self->{line_prev} = $self->{line};
1989     $self->{column_prev} = $self->{column};
1990     $self->{column}++;
1991     $self->{nc}
1992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1993     } else {
1994     $self->{set_nc}->($self);
1995     }
1996    
1997    
1998     return ($self->{ct}); # start tag or end tag
1999    
2000     redo A;
2001     } elsif ($self->{nc} == -1) {
2002     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2003     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2004    
2005     $self->{last_stag_name} = $self->{ct}->{tag_name};
2006     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2007     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2008     if ($self->{ct}->{attributes}) {
2009    
2010     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2011     } else {
2012     ## NOTE: This state should never be reached.
2013    
2014     }
2015     } else {
2016     die "$0: $self->{ct}->{type}: Unknown token type";
2017     }
2018     $self->{state} = DATA_STATE;
2019 wakaba 1.5 $self->{s_kwd} = '';
2020 wakaba 1.1 ## reconsume
2021    
2022     return ($self->{ct}); # start tag or end tag
2023    
2024     redo A;
2025     } else {
2026     if ({
2027     0x0022 => 1, # "
2028     0x0027 => 1, # '
2029     0x003D => 1, # =
2030     }->{$self->{nc}}) {
2031    
2032 wakaba 1.11 ## XML5: Not a parse error.
2033 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2034     } else {
2035    
2036     }
2037     $self->{ca}->{value} .= chr ($self->{nc});
2038     $self->{read_until}->($self->{ca}->{value},
2039     q["'=& >],
2040     length $self->{ca}->{value});
2041    
2042     ## Stay in the state
2043    
2044     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2045     $self->{line_prev} = $self->{line};
2046     $self->{column_prev} = $self->{column};
2047     $self->{column}++;
2048     $self->{nc}
2049     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2050     } else {
2051     $self->{set_nc}->($self);
2052     }
2053    
2054     redo A;
2055     }
2056     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2057     if ($is_space->{$self->{nc}}) {
2058    
2059     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2060    
2061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2062     $self->{line_prev} = $self->{line};
2063     $self->{column_prev} = $self->{column};
2064     $self->{column}++;
2065     $self->{nc}
2066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2067     } else {
2068     $self->{set_nc}->($self);
2069     }
2070    
2071     redo A;
2072     } elsif ($self->{nc} == 0x003E) { # >
2073     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2074    
2075     $self->{last_stag_name} = $self->{ct}->{tag_name};
2076     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2077     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2078     if ($self->{ct}->{attributes}) {
2079    
2080     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2081     } else {
2082     ## NOTE: This state should never be reached.
2083    
2084     }
2085     } else {
2086     die "$0: $self->{ct}->{type}: Unknown token type";
2087     }
2088     $self->{state} = DATA_STATE;
2089 wakaba 1.5 $self->{s_kwd} = '';
2090 wakaba 1.1
2091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2092     $self->{line_prev} = $self->{line};
2093     $self->{column_prev} = $self->{column};
2094     $self->{column}++;
2095     $self->{nc}
2096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2097     } else {
2098     $self->{set_nc}->($self);
2099     }
2100    
2101    
2102     return ($self->{ct}); # start tag or end tag
2103    
2104     redo A;
2105     } elsif ($self->{nc} == 0x002F) { # /
2106    
2107     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2108    
2109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2110     $self->{line_prev} = $self->{line};
2111     $self->{column_prev} = $self->{column};
2112     $self->{column}++;
2113     $self->{nc}
2114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2115     } else {
2116     $self->{set_nc}->($self);
2117     }
2118    
2119     redo A;
2120     } elsif ($self->{nc} == -1) {
2121     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2122     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2123    
2124     $self->{last_stag_name} = $self->{ct}->{tag_name};
2125     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2126     if ($self->{ct}->{attributes}) {
2127    
2128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2129     } else {
2130     ## NOTE: This state should never be reached.
2131    
2132     }
2133     } else {
2134     die "$0: $self->{ct}->{type}: Unknown token type";
2135     }
2136     $self->{state} = DATA_STATE;
2137 wakaba 1.5 $self->{s_kwd} = '';
2138 wakaba 1.1 ## Reconsume.
2139     return ($self->{ct}); # start tag or end tag
2140     redo A;
2141     } else {
2142    
2143     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2144     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2145     ## reconsume
2146     redo A;
2147     }
2148     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2149 wakaba 1.11 ## XML5: "Empty tag state".
2150    
2151 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2152     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2153    
2154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2155     ## TODO: Different type than slash in start tag
2156     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2157     if ($self->{ct}->{attributes}) {
2158    
2159     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2160     } else {
2161    
2162     }
2163     ## TODO: Test |<title></title/>|
2164     } else {
2165    
2166     $self->{self_closing} = 1;
2167     }
2168    
2169     $self->{state} = DATA_STATE;
2170 wakaba 1.5 $self->{s_kwd} = '';
2171 wakaba 1.1
2172     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2173     $self->{line_prev} = $self->{line};
2174     $self->{column_prev} = $self->{column};
2175     $self->{column}++;
2176     $self->{nc}
2177     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2178     } else {
2179     $self->{set_nc}->($self);
2180     }
2181    
2182    
2183     return ($self->{ct}); # start tag or end tag
2184    
2185     redo A;
2186     } elsif ($self->{nc} == -1) {
2187     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2188     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2189    
2190     $self->{last_stag_name} = $self->{ct}->{tag_name};
2191     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2192     if ($self->{ct}->{attributes}) {
2193    
2194     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2195     } else {
2196     ## NOTE: This state should never be reached.
2197    
2198     }
2199     } else {
2200     die "$0: $self->{ct}->{type}: Unknown token type";
2201     }
2202 wakaba 1.11 ## XML5: "Tag attribute name before state".
2203 wakaba 1.1 $self->{state} = DATA_STATE;
2204 wakaba 1.5 $self->{s_kwd} = '';
2205 wakaba 1.1 ## Reconsume.
2206     return ($self->{ct}); # start tag or end tag
2207     redo A;
2208     } else {
2209    
2210     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2211     ## TODO: This error type is wrong.
2212     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2213     ## Reconsume.
2214     redo A;
2215     }
2216     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2217 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2218    
2219 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2220     ## consumes characters on a one-by-one basis.
2221    
2222     if ($self->{nc} == 0x003E) { # >
2223 wakaba 1.13 if ($self->{in_subset}) {
2224    
2225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2226     } else {
2227    
2228     $self->{state} = DATA_STATE;
2229     $self->{s_kwd} = '';
2230     }
2231 wakaba 1.1
2232     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2233     $self->{line_prev} = $self->{line};
2234     $self->{column_prev} = $self->{column};
2235     $self->{column}++;
2236     $self->{nc}
2237     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2238     } else {
2239     $self->{set_nc}->($self);
2240     }
2241    
2242    
2243     return ($self->{ct}); # comment
2244     redo A;
2245     } elsif ($self->{nc} == -1) {
2246 wakaba 1.13 if ($self->{in_subset}) {
2247    
2248     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2249     } else {
2250    
2251     $self->{state} = DATA_STATE;
2252     $self->{s_kwd} = '';
2253     }
2254 wakaba 1.1 ## reconsume
2255    
2256     return ($self->{ct}); # comment
2257     redo A;
2258     } else {
2259    
2260     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2261     $self->{read_until}->($self->{ct}->{data},
2262     q[>],
2263     length $self->{ct}->{data});
2264    
2265     ## Stay in the state.
2266    
2267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2268     $self->{line_prev} = $self->{line};
2269     $self->{column_prev} = $self->{column};
2270     $self->{column}++;
2271     $self->{nc}
2272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2273     } else {
2274     $self->{set_nc}->($self);
2275     }
2276    
2277     redo A;
2278     }
2279     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2280 wakaba 1.14 ## XML5: "Markup declaration state".
2281 wakaba 1.1
2282     if ($self->{nc} == 0x002D) { # -
2283    
2284     $self->{state} = MD_HYPHEN_STATE;
2285    
2286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2287     $self->{line_prev} = $self->{line};
2288     $self->{column_prev} = $self->{column};
2289     $self->{column}++;
2290     $self->{nc}
2291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2292     } else {
2293     $self->{set_nc}->($self);
2294     }
2295    
2296     redo A;
2297     } elsif ($self->{nc} == 0x0044 or # D
2298     $self->{nc} == 0x0064) { # d
2299     ## ASCII case-insensitive.
2300    
2301     $self->{state} = MD_DOCTYPE_STATE;
2302 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2303 wakaba 1.1
2304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2305     $self->{line_prev} = $self->{line};
2306     $self->{column_prev} = $self->{column};
2307     $self->{column}++;
2308     $self->{nc}
2309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2310     } else {
2311     $self->{set_nc}->($self);
2312     }
2313    
2314     redo A;
2315 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2316     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2317     $self->{is_xml}) and
2318 wakaba 1.1 $self->{nc} == 0x005B) { # [
2319    
2320     $self->{state} = MD_CDATA_STATE;
2321 wakaba 1.12 $self->{kwd} = '[';
2322 wakaba 1.1
2323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2324     $self->{line_prev} = $self->{line};
2325     $self->{column_prev} = $self->{column};
2326     $self->{column}++;
2327     $self->{nc}
2328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2329     } else {
2330     $self->{set_nc}->($self);
2331     }
2332    
2333     redo A;
2334     } else {
2335    
2336     }
2337    
2338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2339     line => $self->{line_prev},
2340     column => $self->{column_prev} - 1);
2341     ## Reconsume.
2342     $self->{state} = BOGUS_COMMENT_STATE;
2343     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2344     line => $self->{line_prev},
2345     column => $self->{column_prev} - 1,
2346     };
2347     redo A;
2348     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2349     if ($self->{nc} == 0x002D) { # -
2350    
2351     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2352     line => $self->{line_prev},
2353     column => $self->{column_prev} - 2,
2354     };
2355 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2356 wakaba 1.1
2357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2358     $self->{line_prev} = $self->{line};
2359     $self->{column_prev} = $self->{column};
2360     $self->{column}++;
2361     $self->{nc}
2362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2363     } else {
2364     $self->{set_nc}->($self);
2365     }
2366    
2367     redo A;
2368     } else {
2369    
2370     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2371     line => $self->{line_prev},
2372     column => $self->{column_prev} - 2);
2373     $self->{state} = BOGUS_COMMENT_STATE;
2374     ## Reconsume.
2375     $self->{ct} = {type => COMMENT_TOKEN,
2376     data => '-',
2377     line => $self->{line_prev},
2378     column => $self->{column_prev} - 2,
2379     };
2380     redo A;
2381     }
2382     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2383     ## ASCII case-insensitive.
2384     if ($self->{nc} == [
2385     undef,
2386     0x004F, # O
2387     0x0043, # C
2388     0x0054, # T
2389     0x0059, # Y
2390     0x0050, # P
2391 wakaba 1.12 ]->[length $self->{kwd}] or
2392 wakaba 1.1 $self->{nc} == [
2393     undef,
2394     0x006F, # o
2395     0x0063, # c
2396     0x0074, # t
2397     0x0079, # y
2398     0x0070, # p
2399 wakaba 1.12 ]->[length $self->{kwd}]) {
2400 wakaba 1.1
2401     ## Stay in the state.
2402 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2403 wakaba 1.1
2404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2405     $self->{line_prev} = $self->{line};
2406     $self->{column_prev} = $self->{column};
2407     $self->{column}++;
2408     $self->{nc}
2409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2410     } else {
2411     $self->{set_nc}->($self);
2412     }
2413    
2414     redo A;
2415 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2416 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2417     $self->{nc} == 0x0065)) { # e
2418 wakaba 1.12 if ($self->{is_xml} and
2419     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2420 wakaba 1.10
2421     ## XML5: case-sensitive.
2422     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2423     text => 'DOCTYPE',
2424     line => $self->{line_prev},
2425     column => $self->{column_prev} - 5);
2426     } else {
2427    
2428     }
2429 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2430     $self->{ct} = {type => DOCTYPE_TOKEN,
2431     quirks => 1,
2432     line => $self->{line_prev},
2433     column => $self->{column_prev} - 7,
2434     };
2435    
2436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2437     $self->{line_prev} = $self->{line};
2438     $self->{column_prev} = $self->{column};
2439     $self->{column}++;
2440     $self->{nc}
2441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2442     } else {
2443     $self->{set_nc}->($self);
2444     }
2445    
2446     redo A;
2447     } else {
2448    
2449     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2450     line => $self->{line_prev},
2451 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2452 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2453     ## Reconsume.
2454     $self->{ct} = {type => COMMENT_TOKEN,
2455 wakaba 1.12 data => $self->{kwd},
2456 wakaba 1.1 line => $self->{line_prev},
2457 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2458 wakaba 1.1 };
2459     redo A;
2460     }
2461     } elsif ($self->{state} == MD_CDATA_STATE) {
2462     if ($self->{nc} == {
2463     '[' => 0x0043, # C
2464     '[C' => 0x0044, # D
2465     '[CD' => 0x0041, # A
2466     '[CDA' => 0x0054, # T
2467     '[CDAT' => 0x0041, # A
2468 wakaba 1.12 }->{$self->{kwd}}) {
2469 wakaba 1.1
2470     ## Stay in the state.
2471 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2472 wakaba 1.1
2473     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2474     $self->{line_prev} = $self->{line};
2475     $self->{column_prev} = $self->{column};
2476     $self->{column}++;
2477     $self->{nc}
2478     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2479     } else {
2480     $self->{set_nc}->($self);
2481     }
2482    
2483     redo A;
2484 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2485 wakaba 1.1 $self->{nc} == 0x005B) { # [
2486 wakaba 1.6 if ($self->{is_xml} and
2487     not $self->{tainted} and
2488     @{$self->{open_elements} or []} == 0) {
2489 wakaba 1.8
2490 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2491     line => $self->{line_prev},
2492     column => $self->{column_prev} - 7);
2493     $self->{tainted} = 1;
2494 wakaba 1.8 } else {
2495    
2496 wakaba 1.6 }
2497    
2498 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2499     data => '',
2500     line => $self->{line_prev},
2501     column => $self->{column_prev} - 7};
2502     $self->{state} = CDATA_SECTION_STATE;
2503    
2504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2505     $self->{line_prev} = $self->{line};
2506     $self->{column_prev} = $self->{column};
2507     $self->{column}++;
2508     $self->{nc}
2509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2510     } else {
2511     $self->{set_nc}->($self);
2512     }
2513    
2514     redo A;
2515     } else {
2516    
2517     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2518     line => $self->{line_prev},
2519 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2520 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2521     ## Reconsume.
2522     $self->{ct} = {type => COMMENT_TOKEN,
2523 wakaba 1.12 data => $self->{kwd},
2524 wakaba 1.1 line => $self->{line_prev},
2525 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2526 wakaba 1.1 };
2527     redo A;
2528     }
2529     } elsif ($self->{state} == COMMENT_START_STATE) {
2530     if ($self->{nc} == 0x002D) { # -
2531    
2532     $self->{state} = COMMENT_START_DASH_STATE;
2533    
2534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2535     $self->{line_prev} = $self->{line};
2536     $self->{column_prev} = $self->{column};
2537     $self->{column}++;
2538     $self->{nc}
2539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2540     } else {
2541     $self->{set_nc}->($self);
2542     }
2543    
2544     redo A;
2545     } elsif ($self->{nc} == 0x003E) { # >
2546     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2547 wakaba 1.13 if ($self->{in_subset}) {
2548    
2549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2550     } else {
2551    
2552     $self->{state} = DATA_STATE;
2553     $self->{s_kwd} = '';
2554     }
2555 wakaba 1.1
2556     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2557     $self->{line_prev} = $self->{line};
2558     $self->{column_prev} = $self->{column};
2559     $self->{column}++;
2560     $self->{nc}
2561     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2562     } else {
2563     $self->{set_nc}->($self);
2564     }
2565    
2566    
2567     return ($self->{ct}); # comment
2568    
2569     redo A;
2570     } elsif ($self->{nc} == -1) {
2571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2572 wakaba 1.13 if ($self->{in_subset}) {
2573    
2574     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2575     } else {
2576    
2577     $self->{state} = DATA_STATE;
2578     $self->{s_kwd} = '';
2579     }
2580 wakaba 1.1 ## reconsume
2581    
2582     return ($self->{ct}); # comment
2583    
2584     redo A;
2585     } else {
2586    
2587     $self->{ct}->{data} # comment
2588     .= chr ($self->{nc});
2589     $self->{state} = COMMENT_STATE;
2590    
2591     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2592     $self->{line_prev} = $self->{line};
2593     $self->{column_prev} = $self->{column};
2594     $self->{column}++;
2595     $self->{nc}
2596     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2597     } else {
2598     $self->{set_nc}->($self);
2599     }
2600    
2601     redo A;
2602     }
2603     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2604     if ($self->{nc} == 0x002D) { # -
2605    
2606     $self->{state} = COMMENT_END_STATE;
2607    
2608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2609     $self->{line_prev} = $self->{line};
2610     $self->{column_prev} = $self->{column};
2611     $self->{column}++;
2612     $self->{nc}
2613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2614     } else {
2615     $self->{set_nc}->($self);
2616     }
2617    
2618     redo A;
2619     } elsif ($self->{nc} == 0x003E) { # >
2620     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2621 wakaba 1.13 if ($self->{in_subset}) {
2622    
2623     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2624     } else {
2625    
2626     $self->{state} = DATA_STATE;
2627     $self->{s_kwd} = '';
2628     }
2629 wakaba 1.1
2630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2631     $self->{line_prev} = $self->{line};
2632     $self->{column_prev} = $self->{column};
2633     $self->{column}++;
2634     $self->{nc}
2635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2636     } else {
2637     $self->{set_nc}->($self);
2638     }
2639    
2640    
2641     return ($self->{ct}); # comment
2642    
2643     redo A;
2644     } elsif ($self->{nc} == -1) {
2645     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2646 wakaba 1.13 if ($self->{in_subset}) {
2647    
2648     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2649     } else {
2650    
2651     $self->{state} = DATA_STATE;
2652     $self->{s_kwd} = '';
2653     }
2654 wakaba 1.1 ## reconsume
2655    
2656     return ($self->{ct}); # comment
2657    
2658     redo A;
2659     } else {
2660    
2661     $self->{ct}->{data} # comment
2662     .= '-' . chr ($self->{nc});
2663     $self->{state} = COMMENT_STATE;
2664    
2665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2666     $self->{line_prev} = $self->{line};
2667     $self->{column_prev} = $self->{column};
2668     $self->{column}++;
2669     $self->{nc}
2670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2671     } else {
2672     $self->{set_nc}->($self);
2673     }
2674    
2675     redo A;
2676     }
2677     } elsif ($self->{state} == COMMENT_STATE) {
2678 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2679    
2680 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2681    
2682     $self->{state} = COMMENT_END_DASH_STATE;
2683    
2684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2685     $self->{line_prev} = $self->{line};
2686     $self->{column_prev} = $self->{column};
2687     $self->{column}++;
2688     $self->{nc}
2689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2690     } else {
2691     $self->{set_nc}->($self);
2692     }
2693    
2694     redo A;
2695     } elsif ($self->{nc} == -1) {
2696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2697 wakaba 1.13 if ($self->{in_subset}) {
2698    
2699     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2700     } else {
2701    
2702     $self->{state} = DATA_STATE;
2703     $self->{s_kwd} = '';
2704     }
2705 wakaba 1.1 ## reconsume
2706    
2707     return ($self->{ct}); # comment
2708    
2709     redo A;
2710     } else {
2711    
2712     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2713     $self->{read_until}->($self->{ct}->{data},
2714     q[-],
2715     length $self->{ct}->{data});
2716    
2717     ## Stay in the state
2718    
2719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2720     $self->{line_prev} = $self->{line};
2721     $self->{column_prev} = $self->{column};
2722     $self->{column}++;
2723     $self->{nc}
2724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2725     } else {
2726     $self->{set_nc}->($self);
2727     }
2728    
2729     redo A;
2730     }
2731     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2732 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2733 wakaba 1.10
2734 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2735    
2736     $self->{state} = COMMENT_END_STATE;
2737    
2738     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2739     $self->{line_prev} = $self->{line};
2740     $self->{column_prev} = $self->{column};
2741     $self->{column}++;
2742     $self->{nc}
2743     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2744     } else {
2745     $self->{set_nc}->($self);
2746     }
2747    
2748     redo A;
2749     } elsif ($self->{nc} == -1) {
2750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2751 wakaba 1.13 if ($self->{in_subset}) {
2752    
2753     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2754     } else {
2755    
2756     $self->{state} = DATA_STATE;
2757     $self->{s_kwd} = '';
2758     }
2759 wakaba 1.1 ## reconsume
2760    
2761     return ($self->{ct}); # comment
2762    
2763     redo A;
2764     } else {
2765    
2766     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2767     $self->{state} = COMMENT_STATE;
2768    
2769     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2770     $self->{line_prev} = $self->{line};
2771     $self->{column_prev} = $self->{column};
2772     $self->{column}++;
2773     $self->{nc}
2774     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2775     } else {
2776     $self->{set_nc}->($self);
2777     }
2778    
2779     redo A;
2780     }
2781     } elsif ($self->{state} == COMMENT_END_STATE) {
2782 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2783    
2784 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2785 wakaba 1.13 if ($self->{in_subset}) {
2786    
2787     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2788     } else {
2789    
2790     $self->{state} = DATA_STATE;
2791     $self->{s_kwd} = '';
2792     }
2793 wakaba 1.1
2794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2795     $self->{line_prev} = $self->{line};
2796     $self->{column_prev} = $self->{column};
2797     $self->{column}++;
2798     $self->{nc}
2799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2800     } else {
2801     $self->{set_nc}->($self);
2802     }
2803    
2804    
2805     return ($self->{ct}); # comment
2806    
2807     redo A;
2808     } elsif ($self->{nc} == 0x002D) { # -
2809    
2810 wakaba 1.10 ## XML5: Not a parse error.
2811 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2812     line => $self->{line_prev},
2813     column => $self->{column_prev});
2814     $self->{ct}->{data} .= '-'; # comment
2815     ## Stay in the state
2816    
2817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2818     $self->{line_prev} = $self->{line};
2819     $self->{column_prev} = $self->{column};
2820     $self->{column}++;
2821     $self->{nc}
2822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2823     } else {
2824     $self->{set_nc}->($self);
2825     }
2826    
2827     redo A;
2828     } elsif ($self->{nc} == -1) {
2829     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2830 wakaba 1.13 if ($self->{in_subset}) {
2831    
2832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2833     } else {
2834    
2835     $self->{state} = DATA_STATE;
2836     $self->{s_kwd} = '';
2837     }
2838 wakaba 1.1 ## reconsume
2839    
2840     return ($self->{ct}); # comment
2841    
2842     redo A;
2843     } else {
2844    
2845 wakaba 1.10 ## XML5: Not a parse error.
2846 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2847     line => $self->{line_prev},
2848     column => $self->{column_prev});
2849     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2850     $self->{state} = COMMENT_STATE;
2851    
2852     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2853     $self->{line_prev} = $self->{line};
2854     $self->{column_prev} = $self->{column};
2855     $self->{column}++;
2856     $self->{nc}
2857     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2858     } else {
2859     $self->{set_nc}->($self);
2860     }
2861    
2862     redo A;
2863     }
2864     } elsif ($self->{state} == DOCTYPE_STATE) {
2865     if ($is_space->{$self->{nc}}) {
2866    
2867     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2868    
2869     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2870     $self->{line_prev} = $self->{line};
2871     $self->{column_prev} = $self->{column};
2872     $self->{column}++;
2873     $self->{nc}
2874     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2875     } else {
2876     $self->{set_nc}->($self);
2877     }
2878    
2879     redo A;
2880     } else {
2881    
2882 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2883 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2884     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2885     ## reconsume
2886     redo A;
2887     }
2888     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2889 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2890    
2891 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2892    
2893     ## Stay in the state
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } elsif ($self->{nc} == 0x003E) { # >
2907    
2908 wakaba 1.12 ## XML5: No parse error.
2909 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2910     $self->{state} = DATA_STATE;
2911 wakaba 1.5 $self->{s_kwd} = '';
2912 wakaba 1.1
2913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2914     $self->{line_prev} = $self->{line};
2915     $self->{column_prev} = $self->{column};
2916     $self->{column}++;
2917     $self->{nc}
2918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2919     } else {
2920     $self->{set_nc}->($self);
2921     }
2922    
2923    
2924     return ($self->{ct}); # DOCTYPE (quirks)
2925    
2926     redo A;
2927     } elsif ($self->{nc} == -1) {
2928    
2929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2930     $self->{state} = DATA_STATE;
2931 wakaba 1.5 $self->{s_kwd} = '';
2932 wakaba 1.1 ## reconsume
2933    
2934     return ($self->{ct}); # DOCTYPE (quirks)
2935    
2936     redo A;
2937 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2938    
2939     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2940     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2942     $self->{in_subset} = 1;
2943 wakaba 1.12
2944     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2945     $self->{line_prev} = $self->{line};
2946     $self->{column_prev} = $self->{column};
2947     $self->{column}++;
2948     $self->{nc}
2949     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2950     } else {
2951     $self->{set_nc}->($self);
2952     }
2953    
2954 wakaba 1.13 return ($self->{ct}); # DOCTYPE
2955 wakaba 1.12 redo A;
2956 wakaba 1.1 } else {
2957    
2958     $self->{ct}->{name} = chr $self->{nc};
2959     delete $self->{ct}->{quirks};
2960     $self->{state} = DOCTYPE_NAME_STATE;
2961    
2962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2963     $self->{line_prev} = $self->{line};
2964     $self->{column_prev} = $self->{column};
2965     $self->{column}++;
2966     $self->{nc}
2967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2968     } else {
2969     $self->{set_nc}->($self);
2970     }
2971    
2972     redo A;
2973     }
2974     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2975 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2976    
2977     ## ISSUE: Redundant "First," in the spec.
2978    
2979 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2980    
2981     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2982    
2983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2984     $self->{line_prev} = $self->{line};
2985     $self->{column_prev} = $self->{column};
2986     $self->{column}++;
2987     $self->{nc}
2988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2989     } else {
2990     $self->{set_nc}->($self);
2991     }
2992    
2993     redo A;
2994     } elsif ($self->{nc} == 0x003E) { # >
2995    
2996     $self->{state} = DATA_STATE;
2997 wakaba 1.5 $self->{s_kwd} = '';
2998 wakaba 1.1
2999     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3000     $self->{line_prev} = $self->{line};
3001     $self->{column_prev} = $self->{column};
3002     $self->{column}++;
3003     $self->{nc}
3004     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3005     } else {
3006     $self->{set_nc}->($self);
3007     }
3008    
3009    
3010     return ($self->{ct}); # DOCTYPE
3011    
3012     redo A;
3013     } elsif ($self->{nc} == -1) {
3014    
3015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3016     $self->{state} = DATA_STATE;
3017 wakaba 1.5 $self->{s_kwd} = '';
3018 wakaba 1.1 ## reconsume
3019    
3020     $self->{ct}->{quirks} = 1;
3021     return ($self->{ct}); # DOCTYPE
3022    
3023     redo A;
3024 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3025    
3026     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3027 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3028     $self->{in_subset} = 1;
3029 wakaba 1.12
3030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3031     $self->{line_prev} = $self->{line};
3032     $self->{column_prev} = $self->{column};
3033     $self->{column}++;
3034     $self->{nc}
3035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3036     } else {
3037     $self->{set_nc}->($self);
3038     }
3039    
3040 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3041 wakaba 1.12 redo A;
3042 wakaba 1.1 } else {
3043    
3044     $self->{ct}->{name}
3045     .= chr ($self->{nc}); # DOCTYPE
3046     ## Stay in the state
3047    
3048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3049     $self->{line_prev} = $self->{line};
3050     $self->{column_prev} = $self->{column};
3051     $self->{column}++;
3052     $self->{nc}
3053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3054     } else {
3055     $self->{set_nc}->($self);
3056     }
3057    
3058     redo A;
3059     }
3060     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3061 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3062     ## state", but implemented differently.
3063    
3064 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3065    
3066     ## Stay in the state
3067    
3068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3069     $self->{line_prev} = $self->{line};
3070     $self->{column_prev} = $self->{column};
3071     $self->{column}++;
3072     $self->{nc}
3073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3074     } else {
3075     $self->{set_nc}->($self);
3076     }
3077    
3078     redo A;
3079     } elsif ($self->{nc} == 0x003E) { # >
3080    
3081     $self->{state} = DATA_STATE;
3082 wakaba 1.5 $self->{s_kwd} = '';
3083 wakaba 1.1
3084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085     $self->{line_prev} = $self->{line};
3086     $self->{column_prev} = $self->{column};
3087     $self->{column}++;
3088     $self->{nc}
3089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090     } else {
3091     $self->{set_nc}->($self);
3092     }
3093    
3094    
3095     return ($self->{ct}); # DOCTYPE
3096    
3097     redo A;
3098     } elsif ($self->{nc} == -1) {
3099    
3100     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3101     $self->{state} = DATA_STATE;
3102 wakaba 1.5 $self->{s_kwd} = '';
3103 wakaba 1.1 ## reconsume
3104    
3105     $self->{ct}->{quirks} = 1;
3106     return ($self->{ct}); # DOCTYPE
3107    
3108     redo A;
3109     } elsif ($self->{nc} == 0x0050 or # P
3110     $self->{nc} == 0x0070) { # p
3111 wakaba 1.12
3112 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3113 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3114 wakaba 1.1
3115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3116     $self->{line_prev} = $self->{line};
3117     $self->{column_prev} = $self->{column};
3118     $self->{column}++;
3119     $self->{nc}
3120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3121     } else {
3122     $self->{set_nc}->($self);
3123     }
3124    
3125     redo A;
3126     } elsif ($self->{nc} == 0x0053 or # S
3127     $self->{nc} == 0x0073) { # s
3128 wakaba 1.12
3129 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3130 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3131    
3132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3133     $self->{line_prev} = $self->{line};
3134     $self->{column_prev} = $self->{column};
3135     $self->{column}++;
3136     $self->{nc}
3137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3138     } else {
3139     $self->{set_nc}->($self);
3140     }
3141    
3142     redo A;
3143     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3144    
3145     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3146     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3147 wakaba 1.13 $self->{in_subset} = 1;
3148 wakaba 1.1
3149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3150     $self->{line_prev} = $self->{line};
3151     $self->{column_prev} = $self->{column};
3152     $self->{column}++;
3153     $self->{nc}
3154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3155     } else {
3156     $self->{set_nc}->($self);
3157     }
3158    
3159 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3160 wakaba 1.1 redo A;
3161     } else {
3162    
3163     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
3164     $self->{ct}->{quirks} = 1;
3165    
3166     $self->{state} = BOGUS_DOCTYPE_STATE;
3167    
3168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3169     $self->{line_prev} = $self->{line};
3170     $self->{column_prev} = $self->{column};
3171     $self->{column}++;
3172     $self->{nc}
3173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3174     } else {
3175     $self->{set_nc}->($self);
3176     }
3177    
3178     redo A;
3179     }
3180     } elsif ($self->{state} == PUBLIC_STATE) {
3181     ## ASCII case-insensitive
3182     if ($self->{nc} == [
3183     undef,
3184     0x0055, # U
3185     0x0042, # B
3186     0x004C, # L
3187     0x0049, # I
3188 wakaba 1.12 ]->[length $self->{kwd}] or
3189 wakaba 1.1 $self->{nc} == [
3190     undef,
3191     0x0075, # u
3192     0x0062, # b
3193     0x006C, # l
3194     0x0069, # i
3195 wakaba 1.12 ]->[length $self->{kwd}]) {
3196 wakaba 1.1
3197     ## Stay in the state.
3198 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3199 wakaba 1.1
3200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3201     $self->{line_prev} = $self->{line};
3202     $self->{column_prev} = $self->{column};
3203     $self->{column}++;
3204     $self->{nc}
3205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3206     } else {
3207     $self->{set_nc}->($self);
3208     }
3209    
3210     redo A;
3211 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3212 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3213     $self->{nc} == 0x0063)) { # c
3214 wakaba 1.12 if ($self->{is_xml} and
3215     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3216    
3217     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3218     text => 'PUBLIC',
3219     line => $self->{line_prev},
3220     column => $self->{column_prev} - 4);
3221     } else {
3222    
3223     }
3224 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3225    
3226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3227     $self->{line_prev} = $self->{line};
3228     $self->{column_prev} = $self->{column};
3229     $self->{column}++;
3230     $self->{nc}
3231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3232     } else {
3233     $self->{set_nc}->($self);
3234     }
3235    
3236     redo A;
3237     } else {
3238    
3239     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3240     line => $self->{line_prev},
3241 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3242 wakaba 1.1 $self->{ct}->{quirks} = 1;
3243    
3244     $self->{state} = BOGUS_DOCTYPE_STATE;
3245     ## Reconsume.
3246     redo A;
3247     }
3248     } elsif ($self->{state} == SYSTEM_STATE) {
3249     ## ASCII case-insensitive
3250     if ($self->{nc} == [
3251     undef,
3252     0x0059, # Y
3253     0x0053, # S
3254     0x0054, # T
3255     0x0045, # E
3256 wakaba 1.12 ]->[length $self->{kwd}] or
3257 wakaba 1.1 $self->{nc} == [
3258     undef,
3259     0x0079, # y
3260     0x0073, # s
3261     0x0074, # t
3262     0x0065, # e
3263 wakaba 1.12 ]->[length $self->{kwd}]) {
3264 wakaba 1.1
3265     ## Stay in the state.
3266 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3267 wakaba 1.1
3268     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3269     $self->{line_prev} = $self->{line};
3270     $self->{column_prev} = $self->{column};
3271     $self->{column}++;
3272     $self->{nc}
3273     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3274     } else {
3275     $self->{set_nc}->($self);
3276     }
3277    
3278     redo A;
3279 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3280 wakaba 1.1 ($self->{nc} == 0x004D or # M
3281     $self->{nc} == 0x006D)) { # m
3282 wakaba 1.12 if ($self->{is_xml} and
3283     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3284    
3285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3286     text => 'SYSTEM',
3287     line => $self->{line_prev},
3288     column => $self->{column_prev} - 4);
3289     } else {
3290    
3291     }
3292 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3293    
3294     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3295     $self->{line_prev} = $self->{line};
3296     $self->{column_prev} = $self->{column};
3297     $self->{column}++;
3298     $self->{nc}
3299     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3300     } else {
3301     $self->{set_nc}->($self);
3302     }
3303    
3304     redo A;
3305     } else {
3306    
3307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3308     line => $self->{line_prev},
3309 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3310 wakaba 1.1 $self->{ct}->{quirks} = 1;
3311    
3312     $self->{state} = BOGUS_DOCTYPE_STATE;
3313     ## Reconsume.
3314     redo A;
3315     }
3316     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3317     if ($is_space->{$self->{nc}}) {
3318    
3319     ## Stay in the state
3320    
3321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3322     $self->{line_prev} = $self->{line};
3323     $self->{column_prev} = $self->{column};
3324     $self->{column}++;
3325     $self->{nc}
3326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3327     } else {
3328     $self->{set_nc}->($self);
3329     }
3330    
3331     redo A;
3332     } elsif ($self->{nc} eq 0x0022) { # "
3333    
3334     $self->{ct}->{pubid} = ''; # DOCTYPE
3335     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3336    
3337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3338     $self->{line_prev} = $self->{line};
3339     $self->{column_prev} = $self->{column};
3340     $self->{column}++;
3341     $self->{nc}
3342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3343     } else {
3344     $self->{set_nc}->($self);
3345     }
3346    
3347     redo A;
3348     } elsif ($self->{nc} eq 0x0027) { # '
3349    
3350     $self->{ct}->{pubid} = ''; # DOCTYPE
3351     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3352    
3353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3354     $self->{line_prev} = $self->{line};
3355     $self->{column_prev} = $self->{column};
3356     $self->{column}++;
3357     $self->{nc}
3358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3359     } else {
3360     $self->{set_nc}->($self);
3361     }
3362    
3363     redo A;
3364     } elsif ($self->{nc} eq 0x003E) { # >
3365    
3366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3367    
3368     $self->{state} = DATA_STATE;
3369 wakaba 1.5 $self->{s_kwd} = '';
3370 wakaba 1.1
3371     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3372     $self->{line_prev} = $self->{line};
3373     $self->{column_prev} = $self->{column};
3374     $self->{column}++;
3375     $self->{nc}
3376     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3377     } else {
3378     $self->{set_nc}->($self);
3379     }
3380    
3381    
3382     $self->{ct}->{quirks} = 1;
3383     return ($self->{ct}); # DOCTYPE
3384    
3385     redo A;
3386     } elsif ($self->{nc} == -1) {
3387    
3388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3389    
3390     $self->{state} = DATA_STATE;
3391 wakaba 1.5 $self->{s_kwd} = '';
3392 wakaba 1.1 ## reconsume
3393    
3394     $self->{ct}->{quirks} = 1;
3395     return ($self->{ct}); # DOCTYPE
3396    
3397     redo A;
3398 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3399    
3400     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3401     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3402     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3403 wakaba 1.13 $self->{in_subset} = 1;
3404 wakaba 1.12
3405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3406     $self->{line_prev} = $self->{line};
3407     $self->{column_prev} = $self->{column};
3408     $self->{column}++;
3409     $self->{nc}
3410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3411     } else {
3412     $self->{set_nc}->($self);
3413     }
3414    
3415 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3416 wakaba 1.12 redo A;
3417 wakaba 1.1 } else {
3418    
3419     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3420     $self->{ct}->{quirks} = 1;
3421    
3422     $self->{state} = BOGUS_DOCTYPE_STATE;
3423    
3424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3425     $self->{line_prev} = $self->{line};
3426     $self->{column_prev} = $self->{column};
3427     $self->{column}++;
3428     $self->{nc}
3429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3430     } else {
3431     $self->{set_nc}->($self);
3432     }
3433    
3434     redo A;
3435     }
3436     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3437     if ($self->{nc} == 0x0022) { # "
3438    
3439     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3440    
3441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3442     $self->{line_prev} = $self->{line};
3443     $self->{column_prev} = $self->{column};
3444     $self->{column}++;
3445     $self->{nc}
3446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3447     } else {
3448     $self->{set_nc}->($self);
3449     }
3450    
3451     redo A;
3452     } elsif ($self->{nc} == 0x003E) { # >
3453    
3454     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3455    
3456     $self->{state} = DATA_STATE;
3457 wakaba 1.5 $self->{s_kwd} = '';
3458 wakaba 1.1
3459     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3460     $self->{line_prev} = $self->{line};
3461     $self->{column_prev} = $self->{column};
3462     $self->{column}++;
3463     $self->{nc}
3464     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3465     } else {
3466     $self->{set_nc}->($self);
3467     }
3468    
3469    
3470     $self->{ct}->{quirks} = 1;
3471     return ($self->{ct}); # DOCTYPE
3472    
3473     redo A;
3474     } elsif ($self->{nc} == -1) {
3475    
3476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3477    
3478     $self->{state} = DATA_STATE;
3479 wakaba 1.5 $self->{s_kwd} = '';
3480 wakaba 1.1 ## reconsume
3481    
3482     $self->{ct}->{quirks} = 1;
3483     return ($self->{ct}); # DOCTYPE
3484    
3485     redo A;
3486     } else {
3487    
3488     $self->{ct}->{pubid} # DOCTYPE
3489     .= chr $self->{nc};
3490     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3491     length $self->{ct}->{pubid});
3492    
3493     ## Stay in the state
3494    
3495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3496     $self->{line_prev} = $self->{line};
3497     $self->{column_prev} = $self->{column};
3498     $self->{column}++;
3499     $self->{nc}
3500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3501     } else {
3502     $self->{set_nc}->($self);
3503     }
3504    
3505     redo A;
3506     }
3507     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3508     if ($self->{nc} == 0x0027) { # '
3509    
3510     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3511    
3512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3513     $self->{line_prev} = $self->{line};
3514     $self->{column_prev} = $self->{column};
3515     $self->{column}++;
3516     $self->{nc}
3517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3518     } else {
3519     $self->{set_nc}->($self);
3520     }
3521    
3522     redo A;
3523     } elsif ($self->{nc} == 0x003E) { # >
3524    
3525     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3526    
3527     $self->{state} = DATA_STATE;
3528 wakaba 1.5 $self->{s_kwd} = '';
3529 wakaba 1.1
3530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3531     $self->{line_prev} = $self->{line};
3532     $self->{column_prev} = $self->{column};
3533     $self->{column}++;
3534     $self->{nc}
3535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3536     } else {
3537     $self->{set_nc}->($self);
3538     }
3539    
3540    
3541     $self->{ct}->{quirks} = 1;
3542     return ($self->{ct}); # DOCTYPE
3543    
3544     redo A;
3545     } elsif ($self->{nc} == -1) {
3546    
3547     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3548    
3549     $self->{state} = DATA_STATE;
3550 wakaba 1.5 $self->{s_kwd} = '';
3551 wakaba 1.1 ## reconsume
3552    
3553     $self->{ct}->{quirks} = 1;
3554     return ($self->{ct}); # DOCTYPE
3555    
3556     redo A;
3557     } else {
3558    
3559     $self->{ct}->{pubid} # DOCTYPE
3560     .= chr $self->{nc};
3561     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3562     length $self->{ct}->{pubid});
3563    
3564     ## Stay in the state
3565    
3566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3567     $self->{line_prev} = $self->{line};
3568     $self->{column_prev} = $self->{column};
3569     $self->{column}++;
3570     $self->{nc}
3571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3572     } else {
3573     $self->{set_nc}->($self);
3574     }
3575    
3576     redo A;
3577     }
3578     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3579     if ($is_space->{$self->{nc}}) {
3580    
3581     ## Stay in the state
3582    
3583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3584     $self->{line_prev} = $self->{line};
3585     $self->{column_prev} = $self->{column};
3586     $self->{column}++;
3587     $self->{nc}
3588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3589     } else {
3590     $self->{set_nc}->($self);
3591     }
3592    
3593     redo A;
3594     } elsif ($self->{nc} == 0x0022) { # "
3595    
3596     $self->{ct}->{sysid} = ''; # DOCTYPE
3597     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3598    
3599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600     $self->{line_prev} = $self->{line};
3601     $self->{column_prev} = $self->{column};
3602     $self->{column}++;
3603     $self->{nc}
3604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605     } else {
3606     $self->{set_nc}->($self);
3607     }
3608    
3609     redo A;
3610     } elsif ($self->{nc} == 0x0027) { # '
3611    
3612     $self->{ct}->{sysid} = ''; # DOCTYPE
3613     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3614    
3615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3616     $self->{line_prev} = $self->{line};
3617     $self->{column_prev} = $self->{column};
3618     $self->{column}++;
3619     $self->{nc}
3620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3621     } else {
3622     $self->{set_nc}->($self);
3623     }
3624    
3625     redo A;
3626     } elsif ($self->{nc} == 0x003E) { # >
3627 wakaba 1.12 if ($self->{is_xml}) {
3628    
3629     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3630     } else {
3631    
3632     }
3633 wakaba 1.1 $self->{state} = DATA_STATE;
3634 wakaba 1.5 $self->{s_kwd} = '';
3635 wakaba 1.1
3636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3637     $self->{line_prev} = $self->{line};
3638     $self->{column_prev} = $self->{column};
3639     $self->{column}++;
3640     $self->{nc}
3641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3642     } else {
3643     $self->{set_nc}->($self);
3644     }
3645    
3646    
3647     return ($self->{ct}); # DOCTYPE
3648    
3649     redo A;
3650     } elsif ($self->{nc} == -1) {
3651    
3652     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3653    
3654     $self->{state} = DATA_STATE;
3655 wakaba 1.5 $self->{s_kwd} = '';
3656 wakaba 1.1 ## reconsume
3657    
3658     $self->{ct}->{quirks} = 1;
3659     return ($self->{ct}); # DOCTYPE
3660    
3661     redo A;
3662 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3663    
3664     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3665     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3666     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3667 wakaba 1.13 $self->{in_subset} = 1;
3668 wakaba 1.12
3669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3670     $self->{line_prev} = $self->{line};
3671     $self->{column_prev} = $self->{column};
3672     $self->{column}++;
3673     $self->{nc}
3674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3675     } else {
3676     $self->{set_nc}->($self);
3677     }
3678    
3679 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3680 wakaba 1.12 redo A;
3681 wakaba 1.1 } else {
3682    
3683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3684     $self->{ct}->{quirks} = 1;
3685    
3686     $self->{state} = BOGUS_DOCTYPE_STATE;
3687    
3688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3689     $self->{line_prev} = $self->{line};
3690     $self->{column_prev} = $self->{column};
3691     $self->{column}++;
3692     $self->{nc}
3693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3694     } else {
3695     $self->{set_nc}->($self);
3696     }
3697    
3698     redo A;
3699     }
3700     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3701     if ($is_space->{$self->{nc}}) {
3702    
3703     ## Stay in the state
3704    
3705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3706     $self->{line_prev} = $self->{line};
3707     $self->{column_prev} = $self->{column};
3708     $self->{column}++;
3709     $self->{nc}
3710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3711     } else {
3712     $self->{set_nc}->($self);
3713     }
3714    
3715     redo A;
3716     } elsif ($self->{nc} == 0x0022) { # "
3717    
3718     $self->{ct}->{sysid} = ''; # DOCTYPE
3719     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3720    
3721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3722     $self->{line_prev} = $self->{line};
3723     $self->{column_prev} = $self->{column};
3724     $self->{column}++;
3725     $self->{nc}
3726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3727     } else {
3728     $self->{set_nc}->($self);
3729     }
3730    
3731     redo A;
3732     } elsif ($self->{nc} == 0x0027) { # '
3733    
3734     $self->{ct}->{sysid} = ''; # DOCTYPE
3735     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3736    
3737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3738     $self->{line_prev} = $self->{line};
3739     $self->{column_prev} = $self->{column};
3740     $self->{column}++;
3741     $self->{nc}
3742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3743     } else {
3744     $self->{set_nc}->($self);
3745     }
3746    
3747     redo A;
3748     } elsif ($self->{nc} == 0x003E) { # >
3749    
3750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3751     $self->{state} = DATA_STATE;
3752 wakaba 1.5 $self->{s_kwd} = '';
3753 wakaba 1.1
3754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3755     $self->{line_prev} = $self->{line};
3756     $self->{column_prev} = $self->{column};
3757     $self->{column}++;
3758     $self->{nc}
3759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3760     } else {
3761     $self->{set_nc}->($self);
3762     }
3763    
3764    
3765     $self->{ct}->{quirks} = 1;
3766     return ($self->{ct}); # DOCTYPE
3767    
3768     redo A;
3769     } elsif ($self->{nc} == -1) {
3770    
3771     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3772    
3773     $self->{state} = DATA_STATE;
3774 wakaba 1.5 $self->{s_kwd} = '';
3775 wakaba 1.1 ## reconsume
3776    
3777     $self->{ct}->{quirks} = 1;
3778     return ($self->{ct}); # DOCTYPE
3779    
3780     redo A;
3781 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3782    
3783     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3784    
3785     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3786     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3787 wakaba 1.13 $self->{in_subset} = 1;
3788 wakaba 1.12
3789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3790     $self->{line_prev} = $self->{line};
3791     $self->{column_prev} = $self->{column};
3792     $self->{column}++;
3793     $self->{nc}
3794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3795     } else {
3796     $self->{set_nc}->($self);
3797     }
3798    
3799 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3800 wakaba 1.12 redo A;
3801 wakaba 1.1 } else {
3802    
3803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3804     $self->{ct}->{quirks} = 1;
3805    
3806     $self->{state} = BOGUS_DOCTYPE_STATE;
3807    
3808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3809     $self->{line_prev} = $self->{line};
3810     $self->{column_prev} = $self->{column};
3811     $self->{column}++;
3812     $self->{nc}
3813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3814     } else {
3815     $self->{set_nc}->($self);
3816     }
3817    
3818     redo A;
3819     }
3820     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3821     if ($self->{nc} == 0x0022) { # "
3822    
3823     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3824    
3825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3826     $self->{line_prev} = $self->{line};
3827     $self->{column_prev} = $self->{column};
3828     $self->{column}++;
3829     $self->{nc}
3830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3831     } else {
3832     $self->{set_nc}->($self);
3833     }
3834    
3835     redo A;
3836 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3837 wakaba 1.1
3838     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3839    
3840     $self->{state} = DATA_STATE;
3841 wakaba 1.5 $self->{s_kwd} = '';
3842 wakaba 1.1
3843     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3844     $self->{line_prev} = $self->{line};
3845     $self->{column_prev} = $self->{column};
3846     $self->{column}++;
3847     $self->{nc}
3848     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3849     } else {
3850     $self->{set_nc}->($self);
3851     }
3852    
3853    
3854     $self->{ct}->{quirks} = 1;
3855     return ($self->{ct}); # DOCTYPE
3856    
3857     redo A;
3858     } elsif ($self->{nc} == -1) {
3859    
3860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3861    
3862     $self->{state} = DATA_STATE;
3863 wakaba 1.5 $self->{s_kwd} = '';
3864 wakaba 1.1 ## reconsume
3865    
3866     $self->{ct}->{quirks} = 1;
3867     return ($self->{ct}); # DOCTYPE
3868    
3869     redo A;
3870     } else {
3871    
3872     $self->{ct}->{sysid} # DOCTYPE
3873     .= chr $self->{nc};
3874     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3875     length $self->{ct}->{sysid});
3876    
3877     ## Stay in the state
3878    
3879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3880     $self->{line_prev} = $self->{line};
3881     $self->{column_prev} = $self->{column};
3882     $self->{column}++;
3883     $self->{nc}
3884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3885     } else {
3886     $self->{set_nc}->($self);
3887     }
3888    
3889     redo A;
3890     }
3891     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3892     if ($self->{nc} == 0x0027) { # '
3893    
3894     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3895    
3896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3897     $self->{line_prev} = $self->{line};
3898     $self->{column_prev} = $self->{column};
3899     $self->{column}++;
3900     $self->{nc}
3901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3902     } else {
3903     $self->{set_nc}->($self);
3904     }
3905    
3906     redo A;
3907 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3908 wakaba 1.1
3909     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3910    
3911     $self->{state} = DATA_STATE;
3912 wakaba 1.5 $self->{s_kwd} = '';
3913 wakaba 1.1
3914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915     $self->{line_prev} = $self->{line};
3916     $self->{column_prev} = $self->{column};
3917     $self->{column}++;
3918     $self->{nc}
3919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920     } else {
3921     $self->{set_nc}->($self);
3922     }
3923    
3924    
3925     $self->{ct}->{quirks} = 1;
3926     return ($self->{ct}); # DOCTYPE
3927    
3928     redo A;
3929     } elsif ($self->{nc} == -1) {
3930    
3931     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3932    
3933     $self->{state} = DATA_STATE;
3934 wakaba 1.5 $self->{s_kwd} = '';
3935 wakaba 1.1 ## reconsume
3936    
3937     $self->{ct}->{quirks} = 1;
3938     return ($self->{ct}); # DOCTYPE
3939    
3940     redo A;
3941     } else {
3942    
3943     $self->{ct}->{sysid} # DOCTYPE
3944     .= chr $self->{nc};
3945     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3946     length $self->{ct}->{sysid});
3947    
3948     ## Stay in the state
3949    
3950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3951     $self->{line_prev} = $self->{line};
3952     $self->{column_prev} = $self->{column};
3953     $self->{column}++;
3954     $self->{nc}
3955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3956     } else {
3957     $self->{set_nc}->($self);
3958     }
3959    
3960     redo A;
3961     }
3962     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3963     if ($is_space->{$self->{nc}}) {
3964    
3965     ## Stay in the state
3966    
3967     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3968     $self->{line_prev} = $self->{line};
3969     $self->{column_prev} = $self->{column};
3970     $self->{column}++;
3971     $self->{nc}
3972     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3973     } else {
3974     $self->{set_nc}->($self);
3975     }
3976    
3977     redo A;
3978     } elsif ($self->{nc} == 0x003E) { # >
3979    
3980     $self->{state} = DATA_STATE;
3981 wakaba 1.5 $self->{s_kwd} = '';
3982 wakaba 1.1
3983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3984     $self->{line_prev} = $self->{line};
3985     $self->{column_prev} = $self->{column};
3986     $self->{column}++;
3987     $self->{nc}
3988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3989     } else {
3990     $self->{set_nc}->($self);
3991     }
3992    
3993    
3994     return ($self->{ct}); # DOCTYPE
3995    
3996     redo A;
3997     } elsif ($self->{nc} == -1) {
3998    
3999     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4000     $self->{state} = DATA_STATE;
4001 wakaba 1.5 $self->{s_kwd} = '';
4002 wakaba 1.1 ## reconsume
4003    
4004     $self->{ct}->{quirks} = 1;
4005     return ($self->{ct}); # DOCTYPE
4006    
4007     redo A;
4008 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4009    
4010     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4011     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4012 wakaba 1.13 $self->{in_subset} = 1;
4013 wakaba 1.12
4014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4015     $self->{line_prev} = $self->{line};
4016     $self->{column_prev} = $self->{column};
4017     $self->{column}++;
4018     $self->{nc}
4019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4020     } else {
4021     $self->{set_nc}->($self);
4022     }
4023    
4024 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4025 wakaba 1.12 redo A;
4026 wakaba 1.1 } else {
4027    
4028     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4029     #$self->{ct}->{quirks} = 1;
4030    
4031     $self->{state} = BOGUS_DOCTYPE_STATE;
4032    
4033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4034     $self->{line_prev} = $self->{line};
4035     $self->{column_prev} = $self->{column};
4036     $self->{column}++;
4037     $self->{nc}
4038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4039     } else {
4040     $self->{set_nc}->($self);
4041     }
4042    
4043     redo A;
4044     }
4045     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4046     if ($self->{nc} == 0x003E) { # >
4047    
4048     $self->{state} = DATA_STATE;
4049 wakaba 1.5 $self->{s_kwd} = '';
4050 wakaba 1.1
4051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4052     $self->{line_prev} = $self->{line};
4053     $self->{column_prev} = $self->{column};
4054     $self->{column}++;
4055     $self->{nc}
4056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4057     } else {
4058     $self->{set_nc}->($self);
4059     }
4060    
4061    
4062     return ($self->{ct}); # DOCTYPE
4063    
4064     redo A;
4065 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4066 wakaba 1.13
4067     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4068     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4069     $self->{in_subset} = 1;
4070    
4071 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4072     $self->{line_prev} = $self->{line};
4073     $self->{column_prev} = $self->{column};
4074     $self->{column}++;
4075     $self->{nc}
4076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4077     } else {
4078     $self->{set_nc}->($self);
4079     }
4080    
4081 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4082     redo A;
4083 wakaba 1.1 } elsif ($self->{nc} == -1) {
4084    
4085     $self->{state} = DATA_STATE;
4086 wakaba 1.5 $self->{s_kwd} = '';
4087 wakaba 1.1 ## reconsume
4088    
4089     return ($self->{ct}); # DOCTYPE
4090    
4091     redo A;
4092     } else {
4093    
4094     my $s = '';
4095 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4096 wakaba 1.1
4097     ## Stay in the state
4098    
4099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4100     $self->{line_prev} = $self->{line};
4101     $self->{column_prev} = $self->{column};
4102     $self->{column}++;
4103     $self->{nc}
4104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4105     } else {
4106     $self->{set_nc}->($self);
4107     }
4108    
4109     redo A;
4110     }
4111     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4112     ## NOTE: "CDATA section state" in the spec is jointly implemented
4113     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4114     ## and |CDATA_SECTION_MSE2_STATE|.
4115 wakaba 1.10
4116     ## XML5: "CDATA state".
4117 wakaba 1.1
4118     if ($self->{nc} == 0x005D) { # ]
4119    
4120     $self->{state} = CDATA_SECTION_MSE1_STATE;
4121    
4122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4123     $self->{line_prev} = $self->{line};
4124     $self->{column_prev} = $self->{column};
4125     $self->{column}++;
4126     $self->{nc}
4127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4128     } else {
4129     $self->{set_nc}->($self);
4130     }
4131    
4132     redo A;
4133     } elsif ($self->{nc} == -1) {
4134 wakaba 1.6 if ($self->{is_xml}) {
4135 wakaba 1.8
4136 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4137 wakaba 1.8 } else {
4138    
4139 wakaba 1.6 }
4140    
4141 wakaba 1.1 $self->{state} = DATA_STATE;
4142 wakaba 1.5 $self->{s_kwd} = '';
4143 wakaba 1.10 ## Reconsume.
4144 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4145    
4146     return ($self->{ct}); # character
4147     } else {
4148    
4149     ## No token to emit. $self->{ct} is discarded.
4150     }
4151     redo A;
4152     } else {
4153    
4154     $self->{ct}->{data} .= chr $self->{nc};
4155     $self->{read_until}->($self->{ct}->{data},
4156     q<]>,
4157     length $self->{ct}->{data});
4158    
4159     ## Stay in the state.
4160    
4161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4162     $self->{line_prev} = $self->{line};
4163     $self->{column_prev} = $self->{column};
4164     $self->{column}++;
4165     $self->{nc}
4166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4167     } else {
4168     $self->{set_nc}->($self);
4169     }
4170    
4171     redo A;
4172     }
4173    
4174     ## ISSUE: "text tokens" in spec.
4175     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4176 wakaba 1.10 ## XML5: "CDATA bracket state".
4177    
4178 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4179    
4180     $self->{state} = CDATA_SECTION_MSE2_STATE;
4181    
4182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4183     $self->{line_prev} = $self->{line};
4184     $self->{column_prev} = $self->{column};
4185     $self->{column}++;
4186     $self->{nc}
4187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4188     } else {
4189     $self->{set_nc}->($self);
4190     }
4191    
4192     redo A;
4193     } else {
4194    
4195 wakaba 1.10 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
4196 wakaba 1.1 $self->{ct}->{data} .= ']';
4197 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4198 wakaba 1.1 ## Reconsume.
4199     redo A;
4200     }
4201     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4202 wakaba 1.10 ## XML5: "CDATA end state".
4203    
4204 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4205     $self->{state} = DATA_STATE;
4206 wakaba 1.5 $self->{s_kwd} = '';
4207 wakaba 1.1
4208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4209     $self->{line_prev} = $self->{line};
4210     $self->{column_prev} = $self->{column};
4211     $self->{column}++;
4212     $self->{nc}
4213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4214     } else {
4215     $self->{set_nc}->($self);
4216     }
4217    
4218     if (length $self->{ct}->{data}) { # character
4219    
4220     return ($self->{ct}); # character
4221     } else {
4222    
4223     ## No token to emit. $self->{ct} is discarded.
4224     }
4225     redo A;
4226     } elsif ($self->{nc} == 0x005D) { # ]
4227     # character
4228     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4229     ## Stay in the state.
4230    
4231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232     $self->{line_prev} = $self->{line};
4233     $self->{column_prev} = $self->{column};
4234     $self->{column}++;
4235     $self->{nc}
4236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4237     } else {
4238     $self->{set_nc}->($self);
4239     }
4240    
4241     redo A;
4242     } else {
4243    
4244     $self->{ct}->{data} .= ']]'; # character
4245     $self->{state} = CDATA_SECTION_STATE;
4246 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4247 wakaba 1.1 redo A;
4248     }
4249     } elsif ($self->{state} == ENTITY_STATE) {
4250     if ($is_space->{$self->{nc}} or
4251     {
4252     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4253     $self->{entity_add} => 1,
4254     }->{$self->{nc}}) {
4255    
4256     ## Don't consume
4257     ## No error
4258     ## Return nothing.
4259     #
4260     } elsif ($self->{nc} == 0x0023) { # #
4261    
4262     $self->{state} = ENTITY_HASH_STATE;
4263 wakaba 1.12 $self->{kwd} = '#';
4264 wakaba 1.1
4265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4266     $self->{line_prev} = $self->{line};
4267     $self->{column_prev} = $self->{column};
4268     $self->{column}++;
4269     $self->{nc}
4270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4271     } else {
4272     $self->{set_nc}->($self);
4273     }
4274    
4275     redo A;
4276     } elsif ((0x0041 <= $self->{nc} and
4277     $self->{nc} <= 0x005A) or # A..Z
4278     (0x0061 <= $self->{nc} and
4279     $self->{nc} <= 0x007A)) { # a..z
4280    
4281     require Whatpm::_NamedEntityList;
4282     $self->{state} = ENTITY_NAME_STATE;
4283 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4284     $self->{entity__value} = $self->{kwd};
4285 wakaba 1.1 $self->{entity__match} = 0;
4286    
4287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4288     $self->{line_prev} = $self->{line};
4289     $self->{column_prev} = $self->{column};
4290     $self->{column}++;
4291     $self->{nc}
4292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4293     } else {
4294     $self->{set_nc}->($self);
4295     }
4296    
4297     redo A;
4298     } else {
4299    
4300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4301     ## Return nothing.
4302     #
4303     }
4304    
4305     ## NOTE: No character is consumed by the "consume a character
4306     ## reference" algorithm. In other words, there is an "&" character
4307     ## that does not introduce a character reference, which would be
4308     ## appended to the parent element or the attribute value in later
4309     ## processing by the tokenizer.
4310    
4311     if ($self->{prev_state} == DATA_STATE) {
4312    
4313     $self->{state} = $self->{prev_state};
4314 wakaba 1.5 $self->{s_kwd} = '';
4315 wakaba 1.1 ## Reconsume.
4316     return ({type => CHARACTER_TOKEN, data => '&',
4317     line => $self->{line_prev},
4318     column => $self->{column_prev},
4319     });
4320     redo A;
4321     } else {
4322    
4323     $self->{ca}->{value} .= '&';
4324     $self->{state} = $self->{prev_state};
4325 wakaba 1.5 $self->{s_kwd} = '';
4326 wakaba 1.1 ## Reconsume.
4327     redo A;
4328     }
4329     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4330     if ($self->{nc} == 0x0078 or # x
4331     $self->{nc} == 0x0058) { # X
4332    
4333     $self->{state} = HEXREF_X_STATE;
4334 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4335 wakaba 1.1
4336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337     $self->{line_prev} = $self->{line};
4338     $self->{column_prev} = $self->{column};
4339     $self->{column}++;
4340     $self->{nc}
4341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342     } else {
4343     $self->{set_nc}->($self);
4344     }
4345    
4346     redo A;
4347     } elsif (0x0030 <= $self->{nc} and
4348     $self->{nc} <= 0x0039) { # 0..9
4349    
4350     $self->{state} = NCR_NUM_STATE;
4351 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4352 wakaba 1.1
4353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354     $self->{line_prev} = $self->{line};
4355     $self->{column_prev} = $self->{column};
4356     $self->{column}++;
4357     $self->{nc}
4358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359     } else {
4360     $self->{set_nc}->($self);
4361     }
4362    
4363     redo A;
4364     } else {
4365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4366     line => $self->{line_prev},
4367     column => $self->{column_prev} - 1);
4368    
4369     ## NOTE: According to the spec algorithm, nothing is returned,
4370     ## and then "&#" is appended to the parent element or the attribute
4371     ## value in the later processing.
4372    
4373     if ($self->{prev_state} == DATA_STATE) {
4374    
4375     $self->{state} = $self->{prev_state};
4376 wakaba 1.5 $self->{s_kwd} = '';
4377 wakaba 1.1 ## Reconsume.
4378     return ({type => CHARACTER_TOKEN,
4379     data => '&#',
4380     line => $self->{line_prev},
4381     column => $self->{column_prev} - 1,
4382     });
4383     redo A;
4384     } else {
4385    
4386     $self->{ca}->{value} .= '&#';
4387     $self->{state} = $self->{prev_state};
4388 wakaba 1.5 $self->{s_kwd} = '';
4389 wakaba 1.1 ## Reconsume.
4390     redo A;
4391     }
4392     }
4393     } elsif ($self->{state} == NCR_NUM_STATE) {
4394     if (0x0030 <= $self->{nc} and
4395     $self->{nc} <= 0x0039) { # 0..9
4396    
4397 wakaba 1.12 $self->{kwd} *= 10;
4398     $self->{kwd} += $self->{nc} - 0x0030;
4399 wakaba 1.1
4400     ## Stay in the state.
4401    
4402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403     $self->{line_prev} = $self->{line};
4404     $self->{column_prev} = $self->{column};
4405     $self->{column}++;
4406     $self->{nc}
4407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408     } else {
4409     $self->{set_nc}->($self);
4410     }
4411    
4412     redo A;
4413     } elsif ($self->{nc} == 0x003B) { # ;
4414    
4415    
4416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4417     $self->{line_prev} = $self->{line};
4418     $self->{column_prev} = $self->{column};
4419     $self->{column}++;
4420     $self->{nc}
4421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4422     } else {
4423     $self->{set_nc}->($self);
4424     }
4425    
4426     #
4427     } else {
4428    
4429     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4430     ## Reconsume.
4431     #
4432     }
4433    
4434 wakaba 1.12 my $code = $self->{kwd};
4435 wakaba 1.1 my $l = $self->{line_prev};
4436     my $c = $self->{column_prev};
4437     if ($charref_map->{$code}) {
4438    
4439     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4440     text => (sprintf 'U+%04X', $code),
4441     line => $l, column => $c);
4442     $code = $charref_map->{$code};
4443     } elsif ($code > 0x10FFFF) {
4444    
4445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4446     text => (sprintf 'U-%08X', $code),
4447     line => $l, column => $c);
4448     $code = 0xFFFD;
4449     }
4450    
4451     if ($self->{prev_state} == DATA_STATE) {
4452    
4453     $self->{state} = $self->{prev_state};
4454 wakaba 1.5 $self->{s_kwd} = '';
4455 wakaba 1.1 ## Reconsume.
4456     return ({type => CHARACTER_TOKEN, data => chr $code,
4457 wakaba 1.7 has_reference => 1,
4458 wakaba 1.1 line => $l, column => $c,
4459     });
4460     redo A;
4461     } else {
4462    
4463     $self->{ca}->{value} .= chr $code;
4464     $self->{ca}->{has_reference} = 1;
4465     $self->{state} = $self->{prev_state};
4466 wakaba 1.5 $self->{s_kwd} = '';
4467 wakaba 1.1 ## Reconsume.
4468     redo A;
4469     }
4470     } elsif ($self->{state} == HEXREF_X_STATE) {
4471     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4472     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4473     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4474     # 0..9, A..F, a..f
4475    
4476     $self->{state} = HEXREF_HEX_STATE;
4477 wakaba 1.12 $self->{kwd} = 0;
4478 wakaba 1.1 ## Reconsume.
4479     redo A;
4480     } else {
4481     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4482     line => $self->{line_prev},
4483     column => $self->{column_prev} - 2);
4484    
4485     ## NOTE: According to the spec algorithm, nothing is returned,
4486     ## and then "&#" followed by "X" or "x" is appended to the parent
4487     ## element or the attribute value in the later processing.
4488    
4489     if ($self->{prev_state} == DATA_STATE) {
4490    
4491     $self->{state} = $self->{prev_state};
4492 wakaba 1.5 $self->{s_kwd} = '';
4493 wakaba 1.1 ## Reconsume.
4494     return ({type => CHARACTER_TOKEN,
4495 wakaba 1.12 data => '&' . $self->{kwd},
4496 wakaba 1.1 line => $self->{line_prev},
4497 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4498 wakaba 1.1 });
4499     redo A;
4500     } else {
4501    
4502 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4503 wakaba 1.1 $self->{state} = $self->{prev_state};
4504 wakaba 1.5 $self->{s_kwd} = '';
4505 wakaba 1.1 ## Reconsume.
4506     redo A;
4507     }
4508     }
4509     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4510     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4511     # 0..9
4512    
4513 wakaba 1.12 $self->{kwd} *= 0x10;
4514     $self->{kwd} += $self->{nc} - 0x0030;
4515 wakaba 1.1 ## Stay in the state.
4516    
4517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4518     $self->{line_prev} = $self->{line};
4519     $self->{column_prev} = $self->{column};
4520     $self->{column}++;
4521     $self->{nc}
4522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4523     } else {
4524     $self->{set_nc}->($self);
4525     }
4526    
4527     redo A;
4528     } elsif (0x0061 <= $self->{nc} and
4529     $self->{nc} <= 0x0066) { # a..f
4530    
4531 wakaba 1.12 $self->{kwd} *= 0x10;
4532     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4533 wakaba 1.1 ## Stay in the state.
4534    
4535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4536     $self->{line_prev} = $self->{line};
4537     $self->{column_prev} = $self->{column};
4538     $self->{column}++;
4539     $self->{nc}
4540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4541     } else {
4542     $self->{set_nc}->($self);
4543     }
4544    
4545     redo A;
4546     } elsif (0x0041 <= $self->{nc} and
4547     $self->{nc} <= 0x0046) { # A..F
4548    
4549 wakaba 1.12 $self->{kwd} *= 0x10;
4550     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4551 wakaba 1.1 ## Stay in the state.
4552    
4553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4554     $self->{line_prev} = $self->{line};
4555     $self->{column_prev} = $self->{column};
4556     $self->{column}++;
4557     $self->{nc}
4558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4559     } else {
4560     $self->{set_nc}->($self);
4561     }
4562    
4563     redo A;
4564     } elsif ($self->{nc} == 0x003B) { # ;
4565    
4566    
4567     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4568     $self->{line_prev} = $self->{line};
4569     $self->{column_prev} = $self->{column};
4570     $self->{column}++;
4571     $self->{nc}
4572     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4573     } else {
4574     $self->{set_nc}->($self);
4575     }
4576    
4577     #
4578     } else {
4579    
4580     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4581     line => $self->{line},
4582     column => $self->{column});
4583     ## Reconsume.
4584     #
4585     }
4586    
4587 wakaba 1.12 my $code = $self->{kwd};
4588 wakaba 1.1 my $l = $self->{line_prev};
4589     my $c = $self->{column_prev};
4590     if ($charref_map->{$code}) {
4591    
4592     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4593     text => (sprintf 'U+%04X', $code),
4594     line => $l, column => $c);
4595     $code = $charref_map->{$code};
4596     } elsif ($code > 0x10FFFF) {
4597    
4598     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4599     text => (sprintf 'U-%08X', $code),
4600     line => $l, column => $c);
4601     $code = 0xFFFD;
4602     }
4603    
4604     if ($self->{prev_state} == DATA_STATE) {
4605    
4606     $self->{state} = $self->{prev_state};
4607 wakaba 1.5 $self->{s_kwd} = '';
4608 wakaba 1.1 ## Reconsume.
4609     return ({type => CHARACTER_TOKEN, data => chr $code,
4610 wakaba 1.7 has_reference => 1,
4611 wakaba 1.1 line => $l, column => $c,
4612     });
4613     redo A;
4614     } else {
4615    
4616     $self->{ca}->{value} .= chr $code;
4617     $self->{ca}->{has_reference} = 1;
4618     $self->{state} = $self->{prev_state};
4619 wakaba 1.5 $self->{s_kwd} = '';
4620 wakaba 1.1 ## Reconsume.
4621     redo A;
4622     }
4623     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4624 wakaba 1.12 if (length $self->{kwd} < 30 and
4625 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4626     ((0x0041 <= $self->{nc} and # A
4627     $self->{nc} <= 0x005A) or # Z
4628     (0x0061 <= $self->{nc} and # a
4629     $self->{nc} <= 0x007A) or # z
4630     (0x0030 <= $self->{nc} and # 0
4631     $self->{nc} <= 0x0039) or # 9
4632     $self->{nc} == 0x003B)) { # ;
4633     our $EntityChar;
4634 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4635     if (defined $EntityChar->{$self->{kwd}}) {
4636 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
4637    
4638 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4639 wakaba 1.1 $self->{entity__match} = 1;
4640    
4641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642     $self->{line_prev} = $self->{line};
4643     $self->{column_prev} = $self->{column};
4644     $self->{column}++;
4645     $self->{nc}
4646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647     } else {
4648     $self->{set_nc}->($self);
4649     }
4650    
4651     #
4652     } else {
4653    
4654 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4655 wakaba 1.1 $self->{entity__match} = -1;
4656     ## Stay in the state.
4657    
4658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4659     $self->{line_prev} = $self->{line};
4660     $self->{column_prev} = $self->{column};
4661     $self->{column}++;
4662     $self->{nc}
4663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4664     } else {
4665     $self->{set_nc}->($self);
4666     }
4667    
4668     redo A;
4669     }
4670     } else {
4671    
4672     $self->{entity__value} .= chr $self->{nc};
4673     $self->{entity__match} *= 2;
4674     ## Stay in the state.
4675    
4676     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4677     $self->{line_prev} = $self->{line};
4678     $self->{column_prev} = $self->{column};
4679     $self->{column}++;
4680     $self->{nc}
4681     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4682     } else {
4683     $self->{set_nc}->($self);
4684     }
4685    
4686     redo A;
4687     }
4688     }
4689    
4690     my $data;
4691     my $has_ref;
4692     if ($self->{entity__match} > 0) {
4693    
4694     $data = $self->{entity__value};
4695     $has_ref = 1;
4696     #
4697     } elsif ($self->{entity__match} < 0) {
4698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4699     if ($self->{prev_state} != DATA_STATE and # in attribute
4700     $self->{entity__match} < -1) {
4701    
4702 wakaba 1.12 $data = '&' . $self->{kwd};
4703 wakaba 1.1 #
4704     } else {
4705    
4706     $data = $self->{entity__value};
4707     $has_ref = 1;
4708     #
4709     }
4710     } else {
4711    
4712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4713     line => $self->{line_prev},
4714 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
4715     $data = '&' . $self->{kwd};
4716 wakaba 1.1 #
4717     }
4718    
4719     ## NOTE: In these cases, when a character reference is found,
4720     ## it is consumed and a character token is returned, or, otherwise,
4721     ## nothing is consumed and returned, according to the spec algorithm.
4722     ## In this implementation, anything that has been examined by the
4723     ## tokenizer is appended to the parent element or the attribute value
4724     ## as string, either literal string when no character reference or
4725     ## entity-replaced string otherwise, in this stage, since any characters
4726     ## that would not be consumed are appended in the data state or in an
4727     ## appropriate attribute value state anyway.
4728    
4729     if ($self->{prev_state} == DATA_STATE) {
4730    
4731     $self->{state} = $self->{prev_state};
4732 wakaba 1.5 $self->{s_kwd} = '';
4733 wakaba 1.1 ## Reconsume.
4734     return ({type => CHARACTER_TOKEN,
4735     data => $data,
4736 wakaba 1.7 has_reference => $has_ref,
4737 wakaba 1.1 line => $self->{line_prev},
4738 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
4739 wakaba 1.1 });
4740     redo A;
4741     } else {
4742    
4743     $self->{ca}->{value} .= $data;
4744     $self->{ca}->{has_reference} = 1 if $has_ref;
4745     $self->{state} = $self->{prev_state};
4746 wakaba 1.5 $self->{s_kwd} = '';
4747 wakaba 1.1 ## Reconsume.
4748     redo A;
4749     }
4750 wakaba 1.8
4751     ## XML-only states
4752    
4753     } elsif ($self->{state} == PI_STATE) {
4754 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
4755    
4756 wakaba 1.8 if ($is_space->{$self->{nc}} or
4757 wakaba 1.14 $self->{nc} == 0x003F or # ?
4758 wakaba 1.8 $self->{nc} == -1) {
4759 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
4760     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
4761     ## "DOCTYPE pi state": Parse error, switch to the "data
4762     ## state".
4763 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4764     line => $self->{line_prev},
4765     column => $self->{column_prev}
4766     - 1 * ($self->{nc} != -1));
4767     $self->{state} = BOGUS_COMMENT_STATE;
4768     ## Reconsume.
4769     $self->{ct} = {type => COMMENT_TOKEN,
4770     data => '?',
4771     line => $self->{line_prev},
4772     column => $self->{column_prev}
4773     - 1 * ($self->{nc} != -1),
4774     };
4775     redo A;
4776     } else {
4777 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
4778 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
4779     target => chr $self->{nc},
4780     data => '',
4781     line => $self->{line_prev},
4782     column => $self->{column_prev} - 1,
4783     };
4784     $self->{state} = PI_TARGET_STATE;
4785    
4786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4787     $self->{line_prev} = $self->{line};
4788     $self->{column_prev} = $self->{column};
4789     $self->{column}++;
4790     $self->{nc}
4791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4792     } else {
4793     $self->{set_nc}->($self);
4794     }
4795    
4796     redo A;
4797     }
4798     } elsif ($self->{state} == PI_TARGET_STATE) {
4799     if ($is_space->{$self->{nc}}) {
4800     $self->{state} = PI_TARGET_AFTER_STATE;
4801    
4802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4803     $self->{line_prev} = $self->{line};
4804     $self->{column_prev} = $self->{column};
4805     $self->{column}++;
4806     $self->{nc}
4807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4808     } else {
4809     $self->{set_nc}->($self);
4810     }
4811    
4812     redo A;
4813     } elsif ($self->{nc} == -1) {
4814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4815 wakaba 1.13 if ($self->{in_subset}) {
4816     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4817     } else {
4818     $self->{state} = DATA_STATE;
4819     $self->{s_kwd} = '';
4820     }
4821 wakaba 1.8 ## Reconsume.
4822     return ($self->{ct}); # pi
4823     redo A;
4824     } elsif ($self->{nc} == 0x003F) { # ?
4825     $self->{state} = PI_AFTER_STATE;
4826    
4827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4828     $self->{line_prev} = $self->{line};
4829     $self->{column_prev} = $self->{column};
4830     $self->{column}++;
4831     $self->{nc}
4832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4833     } else {
4834     $self->{set_nc}->($self);
4835     }
4836    
4837     redo A;
4838     } else {
4839     ## XML5: typo ("tag name" -> "target")
4840     $self->{ct}->{target} .= chr $self->{nc}; # pi
4841    
4842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4843     $self->{line_prev} = $self->{line};
4844     $self->{column_prev} = $self->{column};
4845     $self->{column}++;
4846     $self->{nc}
4847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4848     } else {
4849     $self->{set_nc}->($self);
4850     }
4851    
4852     redo A;
4853     }
4854     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4855     if ($is_space->{$self->{nc}}) {
4856     ## Stay in the state.
4857    
4858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4859     $self->{line_prev} = $self->{line};
4860     $self->{column_prev} = $self->{column};
4861     $self->{column}++;
4862     $self->{nc}
4863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4864     } else {
4865     $self->{set_nc}->($self);
4866     }
4867    
4868     redo A;
4869     } else {
4870     $self->{state} = PI_DATA_STATE;
4871     ## Reprocess.
4872     redo A;
4873     }
4874     } elsif ($self->{state} == PI_DATA_STATE) {
4875     if ($self->{nc} == 0x003F) { # ?
4876     $self->{state} = PI_DATA_AFTER_STATE;
4877    
4878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4879     $self->{line_prev} = $self->{line};
4880     $self->{column_prev} = $self->{column};
4881     $self->{column}++;
4882     $self->{nc}
4883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4884     } else {
4885     $self->{set_nc}->($self);
4886     }
4887    
4888     redo A;
4889     } elsif ($self->{nc} == -1) {
4890     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4891 wakaba 1.13 if ($self->{in_subset}) {
4892 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
4893 wakaba 1.13 } else {
4894     $self->{state} = DATA_STATE;
4895     $self->{s_kwd} = '';
4896     }
4897 wakaba 1.8 ## Reprocess.
4898     return ($self->{ct}); # pi
4899     redo A;
4900     } else {
4901     $self->{ct}->{data} .= chr $self->{nc}; # pi
4902     $self->{read_until}->($self->{ct}->{data}, q[?],
4903     length $self->{ct}->{data});
4904     ## Stay in the state.
4905    
4906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4907     $self->{line_prev} = $self->{line};
4908     $self->{column_prev} = $self->{column};
4909     $self->{column}++;
4910     $self->{nc}
4911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4912     } else {
4913     $self->{set_nc}->($self);
4914     }
4915    
4916     ## Reprocess.
4917     redo A;
4918     }
4919     } elsif ($self->{state} == PI_AFTER_STATE) {
4920 wakaba 1.14 ## XML5: Part of "Pi after state".
4921    
4922 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
4923 wakaba 1.13 if ($self->{in_subset}) {
4924     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4925     } else {
4926     $self->{state} = DATA_STATE;
4927     $self->{s_kwd} = '';
4928     }
4929 wakaba 1.8
4930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4931     $self->{line_prev} = $self->{line};
4932     $self->{column_prev} = $self->{column};
4933     $self->{column}++;
4934     $self->{nc}
4935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4936     } else {
4937     $self->{set_nc}->($self);
4938     }
4939    
4940     return ($self->{ct}); # pi
4941     redo A;
4942     } elsif ($self->{nc} == 0x003F) { # ?
4943     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4944     line => $self->{line_prev},
4945     column => $self->{column_prev}); ## XML5: no error
4946     $self->{ct}->{data} .= '?';
4947     $self->{state} = PI_DATA_AFTER_STATE;
4948    
4949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4950     $self->{line_prev} = $self->{line};
4951     $self->{column_prev} = $self->{column};
4952     $self->{column}++;
4953     $self->{nc}
4954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4955     } else {
4956     $self->{set_nc}->($self);
4957     }
4958    
4959     redo A;
4960     } else {
4961     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4962     line => $self->{line_prev},
4963     column => $self->{column_prev}
4964     + 1 * ($self->{nc} == -1)); ## XML5: no error
4965     $self->{ct}->{data} .= '?'; ## XML5: not appended
4966     $self->{state} = PI_DATA_STATE;
4967     ## Reprocess.
4968     redo A;
4969     }
4970     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4971 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
4972    
4973 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
4974 wakaba 1.13 if ($self->{in_subset}) {
4975     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4976     } else {
4977     $self->{state} = DATA_STATE;
4978     $self->{s_kwd} = '';
4979     }
4980 wakaba 1.8
4981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4982     $self->{line_prev} = $self->{line};
4983     $self->{column_prev} = $self->{column};
4984     $self->{column}++;
4985     $self->{nc}
4986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4987     } else {
4988     $self->{set_nc}->($self);
4989     }
4990    
4991     return ($self->{ct}); # pi
4992     redo A;
4993     } elsif ($self->{nc} == 0x003F) { # ?
4994     $self->{ct}->{data} .= '?';
4995     ## Stay in the state.
4996    
4997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4998     $self->{line_prev} = $self->{line};
4999     $self->{column_prev} = $self->{column};
5000     $self->{column}++;
5001     $self->{nc}
5002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5003     } else {
5004     $self->{set_nc}->($self);
5005     }
5006    
5007     redo A;
5008     } else {
5009     $self->{ct}->{data} .= '?'; ## XML5: not appended
5010     $self->{state} = PI_DATA_STATE;
5011     ## Reprocess.
5012     redo A;
5013     }
5014 wakaba 1.12
5015     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5016     if ($self->{nc} == 0x003C) { # <
5017 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5018 wakaba 1.12
5019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5020     $self->{line_prev} = $self->{line};
5021     $self->{column_prev} = $self->{column};
5022     $self->{column}++;
5023     $self->{nc}
5024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5025     } else {
5026     $self->{set_nc}->($self);
5027     }
5028    
5029     redo A;
5030     } elsif ($self->{nc} == 0x0025) { # %
5031     ## XML5: Not defined yet.
5032    
5033     ## TODO:
5034    
5035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5036     $self->{line_prev} = $self->{line};
5037     $self->{column_prev} = $self->{column};
5038     $self->{column}++;
5039     $self->{nc}
5040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5041     } else {
5042     $self->{set_nc}->($self);
5043     }
5044    
5045     redo A;
5046     } elsif ($self->{nc} == 0x005D) { # ]
5047 wakaba 1.13 delete $self->{in_subset};
5048 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5049    
5050     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5051     $self->{line_prev} = $self->{line};
5052     $self->{column_prev} = $self->{column};
5053     $self->{column}++;
5054     $self->{nc}
5055     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5056     } else {
5057     $self->{set_nc}->($self);
5058     }
5059    
5060     redo A;
5061     } elsif ($is_space->{$self->{nc}}) {
5062     ## Stay in the state.
5063    
5064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5065     $self->{line_prev} = $self->{line};
5066     $self->{column_prev} = $self->{column};
5067     $self->{column}++;
5068     $self->{nc}
5069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5070     } else {
5071     $self->{set_nc}->($self);
5072     }
5073    
5074     redo A;
5075     } elsif ($self->{nc} == -1) {
5076     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5077 wakaba 1.13 delete $self->{in_subset};
5078 wakaba 1.12 $self->{state} = DATA_STATE;
5079     $self->{s_kwd} = '';
5080     ## Reconsume.
5081 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5082 wakaba 1.12 redo A;
5083     } else {
5084     unless ($self->{internal_subset_tainted}) {
5085     ## XML5: No parse error.
5086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5087     $self->{internal_subset_tainted} = 1;
5088     }
5089     ## Stay in the state.
5090    
5091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5092     $self->{line_prev} = $self->{line};
5093     $self->{column_prev} = $self->{column};
5094     $self->{column}++;
5095     $self->{nc}
5096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5097     } else {
5098     $self->{set_nc}->($self);
5099     }
5100    
5101     redo A;
5102     }
5103     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5104     if ($self->{nc} == 0x003E) { # >
5105     $self->{state} = DATA_STATE;
5106     $self->{s_kwd} = '';
5107    
5108     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5109     $self->{line_prev} = $self->{line};
5110     $self->{column_prev} = $self->{column};
5111     $self->{column}++;
5112     $self->{nc}
5113     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5114     } else {
5115     $self->{set_nc}->($self);
5116     }
5117    
5118 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5119 wakaba 1.12 redo A;
5120     } elsif ($self->{nc} == -1) {
5121     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5122     $self->{state} = DATA_STATE;
5123     $self->{s_kwd} = '';
5124     ## Reconsume.
5125 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5126 wakaba 1.12 redo A;
5127     } else {
5128     ## XML5: No parse error and stay in the state.
5129     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5130    
5131 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5132    
5133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5134     $self->{line_prev} = $self->{line};
5135     $self->{column_prev} = $self->{column};
5136     $self->{column}++;
5137     $self->{nc}
5138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5139     } else {
5140     $self->{set_nc}->($self);
5141     }
5142    
5143     redo A;
5144     }
5145     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5146     if ($self->{nc} == 0x003E) { # >
5147     $self->{state} = DATA_STATE;
5148     $self->{s_kwd} = '';
5149    
5150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5151     $self->{line_prev} = $self->{line};
5152     $self->{column_prev} = $self->{column};
5153     $self->{column}++;
5154     $self->{nc}
5155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5156     } else {
5157     $self->{set_nc}->($self);
5158     }
5159    
5160     return ({type => END_OF_DOCTYPE_TOKEN});
5161     redo A;
5162     } elsif ($self->{nc} == -1) {
5163     $self->{state} = DATA_STATE;
5164     $self->{s_kwd} = '';
5165     ## Reconsume.
5166     return ({type => END_OF_DOCTYPE_TOKEN});
5167     redo A;
5168     } else {
5169     ## Stay in the state.
5170    
5171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5172     $self->{line_prev} = $self->{line};
5173     $self->{column_prev} = $self->{column};
5174     $self->{column}++;
5175     $self->{nc}
5176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5177     } else {
5178     $self->{set_nc}->($self);
5179     }
5180    
5181     redo A;
5182     }
5183     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5184     if ($self->{nc} == 0x0021) { # !
5185 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5186 wakaba 1.13
5187     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5188     $self->{line_prev} = $self->{line};
5189     $self->{column_prev} = $self->{column};
5190     $self->{column}++;
5191     $self->{nc}
5192     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5193     } else {
5194     $self->{set_nc}->($self);
5195     }
5196    
5197     redo A;
5198     } elsif ($self->{nc} == 0x003F) { # ?
5199     $self->{state} = PI_STATE;
5200    
5201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5202     $self->{line_prev} = $self->{line};
5203     $self->{column_prev} = $self->{column};
5204     $self->{column}++;
5205     $self->{nc}
5206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5207     } else {
5208     $self->{set_nc}->($self);
5209     }
5210    
5211     redo A;
5212     } elsif ($self->{nc} == -1) {
5213     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5214     $self->{state} = DATA_STATE;
5215     $self->{s_kwd} = '';
5216     ## Reconsume.
5217     redo A;
5218     } else {
5219     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5220     line => $self->{line_prev},
5221     column => $self->{column_prev});
5222     $self->{state} = BOGUS_COMMENT_STATE;
5223     $self->{ct} = {type => COMMENT_TOKEN,
5224     data => '',
5225     }; ## NOTE: Will be discarded.
5226 wakaba 1.12
5227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5228     $self->{line_prev} = $self->{line};
5229     $self->{column_prev} = $self->{column};
5230     $self->{column}++;
5231     $self->{nc}
5232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5233     } else {
5234     $self->{set_nc}->($self);
5235     }
5236    
5237     redo A;
5238     }
5239 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5240     ## XML5: "DOCTYPE markup declaration state".
5241    
5242     if ($self->{nc} == 0x002D) { # -
5243     $self->{state} = MD_HYPHEN_STATE;
5244    
5245     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5246     $self->{line_prev} = $self->{line};
5247     $self->{column_prev} = $self->{column};
5248     $self->{column}++;
5249     $self->{nc}
5250     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5251     } else {
5252     $self->{set_nc}->($self);
5253     }
5254    
5255     redo A;
5256     } elsif ($self->{nc} == 0x0045) { # E
5257     $self->{state} = MD_E_STATE;
5258     $self->{kwd} = chr $self->{nc};
5259    
5260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5261     $self->{line_prev} = $self->{line};
5262     $self->{column_prev} = $self->{column};
5263     $self->{column}++;
5264     $self->{nc}
5265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5266     } else {
5267     $self->{set_nc}->($self);
5268     }
5269    
5270     redo A;
5271     } elsif ($self->{nc} == 0x0041) { # A
5272     $self->{state} = MD_ATTLIST_STATE;
5273     $self->{kwd} = chr $self->{nc};
5274    
5275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5276     $self->{line_prev} = $self->{line};
5277     $self->{column_prev} = $self->{column};
5278     $self->{column}++;
5279     $self->{nc}
5280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5281     } else {
5282     $self->{set_nc}->($self);
5283     }
5284    
5285     redo A;
5286     } elsif ($self->{nc} == 0x004E) { # N
5287     $self->{state} = MD_NOTATION_STATE;
5288     $self->{kwd} = chr $self->{nc};
5289    
5290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5291     $self->{line_prev} = $self->{line};
5292     $self->{column_prev} = $self->{column};
5293     $self->{column}++;
5294     $self->{nc}
5295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5296     } else {
5297     $self->{set_nc}->($self);
5298     }
5299    
5300     redo A;
5301     } else {
5302     #
5303     }
5304    
5305     ## XML5: No parse error.
5306     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5307     line => $self->{line_prev},
5308     column => $self->{column_prev} - 1);
5309     ## Reconsume.
5310     $self->{state} = BOGUS_COMMENT_STATE;
5311     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5312     redo A;
5313     } elsif ($self->{state} == MD_E_STATE) {
5314     if ($self->{nc} == 0x004E) { # N
5315     $self->{state} = MD_ENTITY_STATE;
5316     $self->{kwd} .= chr $self->{nc};
5317    
5318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319     $self->{line_prev} = $self->{line};
5320     $self->{column_prev} = $self->{column};
5321     $self->{column}++;
5322     $self->{nc}
5323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324     } else {
5325     $self->{set_nc}->($self);
5326     }
5327    
5328     redo A;
5329     } elsif ($self->{nc} == 0x004C) { # L
5330     ## XML5: <!ELEMENT> not supported.
5331     $self->{state} = MD_ELEMENT_STATE;
5332     $self->{kwd} .= chr $self->{nc};
5333    
5334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5335     $self->{line_prev} = $self->{line};
5336     $self->{column_prev} = $self->{column};
5337     $self->{column}++;
5338     $self->{nc}
5339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5340     } else {
5341     $self->{set_nc}->($self);
5342     }
5343    
5344     redo A;
5345     } else {
5346     ## XML5: No parse error.
5347     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5348     line => $self->{line_prev},
5349     column => $self->{column_prev} - 2
5350     + 1 * ($self->{nc} == -1));
5351     ## Reconsume.
5352     $self->{state} = BOGUS_COMMENT_STATE;
5353     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5354     redo A;
5355     }
5356     } elsif ($self->{state} == MD_ENTITY_STATE) {
5357     if ($self->{nc} == {
5358     'EN' => 0x0054, # T
5359     'ENT' => 0x0049, # I
5360     'ENTI' => 0x0054, # T
5361     }->{$self->{kwd}}) {
5362     ## Stay in the state.
5363     $self->{kwd} .= chr $self->{nc};
5364    
5365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5366     $self->{line_prev} = $self->{line};
5367     $self->{column_prev} = $self->{column};
5368     $self->{column}++;
5369     $self->{nc}
5370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5371     } else {
5372     $self->{set_nc}->($self);
5373     }
5374    
5375     redo A;
5376     } elsif ($self->{kwd} eq 'ENTIT' and
5377     $self->{nc} == 0x0059) { # Y
5378     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
5379     line => $self->{line_prev},
5380     column => $self->{column_prev} - 6};
5381     $self->{state} = DOCTYPE_MD_STATE;
5382    
5383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384     $self->{line_prev} = $self->{line};
5385     $self->{column_prev} = $self->{column};
5386     $self->{column}++;
5387     $self->{nc}
5388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389     } else {
5390     $self->{set_nc}->($self);
5391     }
5392    
5393     redo A;
5394     } else {
5395     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5396     line => $self->{line_prev},
5397     column => $self->{column_prev} - 1
5398     - (length $self->{kwd})
5399     + 1 * ($self->{nc} == -1));
5400     $self->{state} = BOGUS_COMMENT_STATE;
5401     ## Reconsume.
5402     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5403     redo A;
5404     }
5405     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5406     if ($self->{nc} == {
5407     'EL' => 0x0045, # E
5408     'ELE' => 0x004D, # M
5409     'ELEM' => 0x0045, # E
5410     'ELEME' => 0x004E, # N
5411     }->{$self->{kwd}}) {
5412     ## Stay in the state.
5413     $self->{kwd} .= chr $self->{nc};
5414    
5415     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5416     $self->{line_prev} = $self->{line};
5417     $self->{column_prev} = $self->{column};
5418     $self->{column}++;
5419     $self->{nc}
5420     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5421     } else {
5422     $self->{set_nc}->($self);
5423     }
5424    
5425     redo A;
5426     } elsif ($self->{kwd} eq 'ELEMEN' and
5427     $self->{nc} == 0x0054) { # T
5428     $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5429     line => $self->{line_prev},
5430     column => $self->{column_prev} - 6};
5431     $self->{state} = DOCTYPE_MD_STATE;
5432    
5433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5434     $self->{line_prev} = $self->{line};
5435     $self->{column_prev} = $self->{column};
5436     $self->{column}++;
5437     $self->{nc}
5438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5439     } else {
5440     $self->{set_nc}->($self);
5441     }
5442    
5443     redo A;
5444     } else {
5445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5446     line => $self->{line_prev},
5447     column => $self->{column_prev} - 1
5448     - (length $self->{kwd})
5449     + 1 * ($self->{nc} == -1));
5450     $self->{state} = BOGUS_COMMENT_STATE;
5451     ## Reconsume.
5452     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5453     redo A;
5454     }
5455     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5456     if ($self->{nc} == {
5457     'A' => 0x0054, # T
5458     'AT' => 0x0054, # T
5459     'ATT' => 0x004C, # L
5460     'ATTL' => 0x0049, # I
5461     'ATTLI' => 0x0053, # S
5462     }->{$self->{kwd}}) {
5463     ## Stay in the state.
5464     $self->{kwd} .= chr $self->{nc};
5465    
5466     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5467     $self->{line_prev} = $self->{line};
5468     $self->{column_prev} = $self->{column};
5469     $self->{column}++;
5470     $self->{nc}
5471     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5472     } else {
5473     $self->{set_nc}->($self);
5474     }
5475    
5476     redo A;
5477     } elsif ($self->{kwd} eq 'ATTLIS' and
5478     $self->{nc} == 0x0054) { # T
5479     $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5480     line => $self->{line_prev},
5481     column => $self->{column_prev} - 6};
5482     $self->{state} = DOCTYPE_MD_STATE;
5483    
5484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5485     $self->{line_prev} = $self->{line};
5486     $self->{column_prev} = $self->{column};
5487     $self->{column}++;
5488     $self->{nc}
5489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5490     } else {
5491     $self->{set_nc}->($self);
5492     }
5493    
5494     redo A;
5495     } else {
5496     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5497     line => $self->{line_prev},
5498     column => $self->{column_prev} - 1
5499     - (length $self->{kwd})
5500     + 1 * ($self->{nc} == -1));
5501     $self->{state} = BOGUS_COMMENT_STATE;
5502     ## Reconsume.
5503     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5504     redo A;
5505     }
5506     } elsif ($self->{state} == MD_NOTATION_STATE) {
5507     if ($self->{nc} == {
5508     'N' => 0x004F, # O
5509     'NO' => 0x0054, # T
5510     'NOT' => 0x0041, # A
5511     'NOTA' => 0x0054, # T
5512     'NOTAT' => 0x0049, # I
5513     'NOTATI' => 0x004F, # O
5514     }->{$self->{kwd}}) {
5515     ## Stay in the state.
5516     $self->{kwd} .= chr $self->{nc};
5517    
5518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5519     $self->{line_prev} = $self->{line};
5520     $self->{column_prev} = $self->{column};
5521     $self->{column}++;
5522     $self->{nc}
5523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5524     } else {
5525     $self->{set_nc}->($self);
5526     }
5527    
5528     redo A;
5529     } elsif ($self->{kwd} eq 'NOTATIO' and
5530     $self->{nc} == 0x004E) { # N
5531     $self->{ct} = {type => NOTATION_TOKEN, name => '',
5532     line => $self->{line_prev},
5533     column => $self->{column_prev} - 6};
5534     $self->{state} = DOCTYPE_MD_STATE;
5535    
5536     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5537     $self->{line_prev} = $self->{line};
5538     $self->{column_prev} = $self->{column};
5539     $self->{column}++;
5540     $self->{nc}
5541     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5542     } else {
5543     $self->{set_nc}->($self);
5544     }
5545    
5546     redo A;
5547     } else {
5548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5549     line => $self->{line_prev},
5550     column => $self->{column_prev} - 1
5551     - (length $self->{kwd})
5552     + 1 * ($self->{nc} == -1));
5553     $self->{state} = BOGUS_COMMENT_STATE;
5554     ## Reconsume.
5555     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5556     redo A;
5557     }
5558     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5559     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5560     ## "DOCTYPE NOTATION state".
5561    
5562     if ($is_space->{$self->{nc}}) {
5563     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5564     $self->{state} = BEFORE_MD_NAME_STATE;
5565    
5566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5567     $self->{line_prev} = $self->{line};
5568     $self->{column_prev} = $self->{column};
5569     $self->{column}++;
5570     $self->{nc}
5571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5572     } else {
5573     $self->{set_nc}->($self);
5574     }
5575    
5576     redo A;
5577     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5578     $self->{nc} == 0x0025) { # %
5579     ## XML5: Switch to the "DOCTYPE bogus comment state".
5580     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5581     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5582    
5583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5584     $self->{line_prev} = $self->{line};
5585     $self->{column_prev} = $self->{column};
5586     $self->{column}++;
5587     $self->{nc}
5588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5589     } else {
5590     $self->{set_nc}->($self);
5591     }
5592    
5593     redo A;
5594     } elsif ($self->{nc} == -1) {
5595     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5597     ## Reconsume.
5598     redo A;
5599     } elsif ($self->{nc} == 0x003E) { # >
5600     ## XML5: Switch to the "DOCTYPE bogus comment state".
5601     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5602     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5603    
5604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5605     $self->{line_prev} = $self->{line};
5606     $self->{column_prev} = $self->{column};
5607     $self->{column}++;
5608     $self->{nc}
5609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5610     } else {
5611     $self->{set_nc}->($self);
5612     }
5613    
5614     redo A;
5615     } else {
5616     ## XML5: Switch to the "DOCTYPE bogus comment state".
5617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5618     $self->{state} = BEFORE_MD_NAME_STATE;
5619     redo A;
5620     }
5621     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
5622     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
5623     ## before state", "DOCTYPE ATTLIST name before state".
5624    
5625     if ($is_space->{$self->{nc}}) {
5626     ## Stay in the state.
5627    
5628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5629     $self->{line_prev} = $self->{line};
5630     $self->{column_prev} = $self->{column};
5631     $self->{column}++;
5632     $self->{nc}
5633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5634     } else {
5635     $self->{set_nc}->($self);
5636     }
5637    
5638     redo A;
5639     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5640     $self->{nc} == 0x0025) { # %
5641     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5642    
5643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5644     $self->{line_prev} = $self->{line};
5645     $self->{column_prev} = $self->{column};
5646     $self->{column}++;
5647     $self->{nc}
5648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5649     } else {
5650     $self->{set_nc}->($self);
5651     }
5652    
5653     redo A;
5654     } elsif ($self->{nc} == 0x003E) { # >
5655     ## XML5: Same as "Anything else".
5656     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5657     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5658    
5659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5660     $self->{line_prev} = $self->{line};
5661     $self->{column_prev} = $self->{column};
5662     $self->{column}++;
5663     $self->{nc}
5664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5665     } else {
5666     $self->{set_nc}->($self);
5667     }
5668    
5669     redo A;
5670     } elsif ($self->{nc} == -1) {
5671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5672     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5673     ## Reconsume.
5674     redo A;
5675     } else {
5676     ## XML5: [ATTLIST] Not defined yet.
5677     $self->{ct}->{name} .= chr $self->{nc};
5678     $self->{state} = MD_NAME_STATE;
5679    
5680     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5681     $self->{line_prev} = $self->{line};
5682     $self->{column_prev} = $self->{column};
5683     $self->{column}++;
5684     $self->{nc}
5685     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5686     } else {
5687     $self->{set_nc}->($self);
5688     }
5689    
5690     redo A;
5691     }
5692     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
5693     if ($is_space->{$self->{nc}}) {
5694     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
5695     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
5696     $self->{state} = BEFORE_MD_NAME_STATE;
5697 wakaba 1.8
5698 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5699     $self->{line_prev} = $self->{line};
5700     $self->{column_prev} = $self->{column};
5701     $self->{column}++;
5702     $self->{nc}
5703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5704     } else {
5705     $self->{set_nc}->($self);
5706     }
5707    
5708     redo A;
5709     } elsif ($self->{nc} == 0x003E) { # >
5710     ## XML5: Same as "Anything else".
5711     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5712     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5713    
5714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5715     $self->{line_prev} = $self->{line};
5716     $self->{column_prev} = $self->{column};
5717     $self->{column}++;
5718     $self->{nc}
5719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5720     } else {
5721     $self->{set_nc}->($self);
5722     }
5723    
5724     redo A;
5725     } elsif ($self->{nc} == -1) {
5726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
5727     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5728     ## Reconsume.
5729     redo A;
5730     } else {
5731     ## XML5: No parse error.
5732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
5733     $self->{state} = BOGUS_COMMENT_STATE;
5734     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5735     ## Reconsume.
5736     redo A;
5737     }
5738     } elsif ($self->{state} == MD_NAME_STATE) {
5739     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
5740    
5741     if ($is_space->{$self->{nc}}) {
5742     ## TODO:
5743     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
5744    
5745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746     $self->{line_prev} = $self->{line};
5747     $self->{column_prev} = $self->{column};
5748     $self->{column}++;
5749     $self->{nc}
5750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751     } else {
5752     $self->{set_nc}->($self);
5753     }
5754    
5755     redo A;
5756     } elsif ($self->{nc} == 0x003E) { # >
5757     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
5758     #
5759     } else {
5760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md body'); ## TODO: type
5761     }
5762     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5763    
5764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5765     $self->{line_prev} = $self->{line};
5766     $self->{column_prev} = $self->{column};
5767     $self->{column}++;
5768     $self->{nc}
5769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5770     } else {
5771     $self->{set_nc}->($self);
5772     }
5773    
5774     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
5775     redo A;
5776     } elsif ($self->{nc} == -1) {
5777     ## XML5: [ATTLIST] No parse error.
5778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
5779     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5780     ## Reconsume.
5781     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
5782     redo A;
5783     } else {
5784     ## XML5: [ATTLIST] Not defined yet.
5785     $self->{ct}->{name} .= chr $self->{nc};
5786     ## Stay in the state.
5787    
5788     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5789     $self->{line_prev} = $self->{line};
5790     $self->{column_prev} = $self->{column};
5791     $self->{column}++;
5792     $self->{nc}
5793     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5794     } else {
5795     $self->{set_nc}->($self);
5796     }
5797    
5798     redo A;
5799     }
5800     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
5801     if ($is_space->{$self->{nc}}) {
5802     ## Stay in the state.
5803    
5804     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5805     $self->{line_prev} = $self->{line};
5806     $self->{column_prev} = $self->{column};
5807     $self->{column}++;
5808     $self->{nc}
5809     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5810     } else {
5811     $self->{set_nc}->($self);
5812     }
5813    
5814     redo A;
5815     } elsif ($self->{nc} == 0x003E) { # >
5816     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5817    
5818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5819     $self->{line_prev} = $self->{line};
5820     $self->{column_prev} = $self->{column};
5821     $self->{column}++;
5822     $self->{nc}
5823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5824     } else {
5825     $self->{set_nc}->($self);
5826     }
5827    
5828     return ($self->{ct}); # ATTLIST
5829     redo A;
5830     } elsif ($self->{nc} == -1) {
5831     ## XML5: No parse error.
5832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5833     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5834     redo A;
5835     } else {
5836     ## XML5: Not defined yet.
5837    
5838     ## TODO: ...
5839    
5840     $self->{state} = BOGUS_COMMENT_STATE;
5841     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5842     ## Reconsume.
5843     redo A;
5844     }
5845    
5846 wakaba 1.1 } else {
5847     die "$0: $self->{state}: Unknown state";
5848     }
5849     } # A
5850    
5851     die "$0: _get_next_token: unexpected case";
5852     } # _get_next_token
5853    
5854     1;
5855 wakaba 1.14 ## $Date: 2008/10/16 03:39:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24