/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (hide annotations) (download)
Sat Oct 18 08:05:29 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.14: +1352 -52 lines
++ whatpm/t/xml/ChangeLog	18 Oct 2008 08:05:22 -0000
2008-10-18  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat: Tests added.

++ whatpm/Whatpm/ChangeLog	18 Oct 2008 08:03:10 -0000
2008-10-18  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (text_content): Moved to Node from Element.  Setter
	implemented.
	(allowed_tokens, default_type, declared_type): Implemented.

++ whatpm/Whatpm/HTML/ChangeLog	18 Oct 2008 08:04:10 -0000
2008-10-18  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: <!ATTLIST> in the internal subset of an XML
	document, is now fully implemented.

	* Dumper.pm (dumptree): Output allowed tokens and default value
	always.

++ whatpm/Whatpm/XML/ChangeLog	18 Oct 2008 08:05:03 -0000
2008-10-18  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): <!ATTLIST> node construction
	implemented.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.15 our $VERSION=do{my @r=(q$Revision: 1.14 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Export setup.  Every token type constant can be imported by name,
## or all of them at once through the |:token| tag.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## The one list of exportable token type names; both export
  ## structures below are built from it so they cannot drift apart.
  my @token_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_names;
  our %EXPORT_TAGS = (token => [@token_names]);
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric codes stored in a token's |type| field.  Each name is a
## compile-time constant taking no arguments.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## Bit flags saying which kinds of markup are recognized in data.
use constant {
  CM_ENTITY         => 1 << 0, # & markup in data
  CM_LIMITED_MARKUP => 1 << 1, # < markup in data (limited)
  CM_FULL_MARKUP    => 1 << 2, # < markup in data (any)
};

## Content models, each a combination of the flags above.  These are
## declared one per statement because they refer to the flag constants,
## which must already exist when each declaration is compiled.
use constant PLAINTEXT_CONTENT_MODEL => 0;
use constant CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP;
use constant RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP;
use constant PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP;
87    
88     ## Tokenizer states
89    
## Numeric identifiers for the tokenizer's states, compared against
## |$self->{state}|.  Values 1 and 12 are unassigned: the states that
## once used them (ENTITY_DATA_STATE and ENTITY_IN_ATTRIBUTE_VALUE_STATE)
## are not defined.
use constant {
  DATA_STATE                                    => 0,
  # ENTITY_DATA_STATE                           => 1 (not defined)
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  # ENTITY_IN_ATTRIBUTE_VALUE_STATE             => 12 (not defined)
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                 => 44,
  ENTITY_HASH_STATE            => 45,
  NCR_NUM_STATE                => 46,
  HEXREF_X_STATE               => 47,
  HEXREF_HEX_STATE             => 48,
  ENTITY_NAME_STATE            => 49,
  PCDATA_STATE                 => 50, # "data state" in the spec
};

## XML-only states
use constant {
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
};
180 wakaba 1.8
181 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
182     ## list and descriptions)
183    
## Both constants are bit 11 (2048); the two names carry the same value.
use constant IN_FOREIGN_CONTENT_IM => 1 << 11; # 0b100000000000
use constant FOREIGN_EL            => 1 << 11; # 0b1_00000000000
186    
187     ## Character reference mappings
188    
## Replacement table keyed by code point: each key maps to the code
## point that is to be used in its place.
my $charref_map = {};

## CARRIAGE RETURN is replaced by LINE FEED.
$charref_map->{0x0D} = 0x000A;

## 0x80..0x9F, in ascending order, eight entries per row.
## NOTE(review): these look like the Windows-1252 assignments - confirm.
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
);

## Code points replaced by U+FFFD: listed controls, the surrogate
## range, and part of the U+FDD0 block.
$charref_map->{$_} = 0xFFFD
    for 0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
        0xD800 .. 0xDFFF,
        0xFDD0 .. 0xFDDF; ## ISSUE: 0xFDEF

## The last two code points (xFFFE and xFFFF) of each of the 17 planes
## are replaced by U+FFFD as well.
for my $plane (0x00 .. 0x10) {
  my $plane_base = $plane << 16;
  $charref_map->{$plane_base | $_} = 0xFFFD for 0xFFFE, 0xFFFF;
}
232    
233     ## Implementations MUST act as if they used the state machine in the spec.
234    
## Reset the tokenizer fields of |$self| to their initial values and
## obtain the first input character.
##
## Argument: the parser object (a hash reference).  Its |set_nc| field
## must be a code reference; it is invoked once with |$self| as its
## only argument.
##
## The value of the last statement is the new, empty token queue.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied (position 0, length
  ## 0), so there can be no buffered character to take: the first
  ## character always comes from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens waiting to be returned
  # $self->{escape}
} # _initialize_tokenizer
273    
274     ## A token has:
275     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
276 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
277 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
278     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
279 wakaba 1.11 ## ->{target} (PI_TOKEN)
280 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
281     ## ->{sysid} (DOCTYPE_TOKEN)
282     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
283     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
284     ## ->{name}
285     ## ->{value}
286     ## ->{has_reference} == 1 or 0
287 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
288     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
289 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
290 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
291 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
292    
293 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
294     ## A token's own |->{self_closing}| is used to save the value of
295     ## |$self->{self_closing}| while the token is pushed back to the stack.
296    
297     ## Emitted token MUST immediately be handled by the tree construction state.
298    
299     ## Before each step, UA MAY check to see if either one of the scripts in
300     ## "list of scripts that will execute as soon as possible" or the first
301     ## script in the "list of scripts that will execute asynchronously",
302     ## has completed loading. If one has, then it MUST be executed
303     ## and removed from the list.
304    
305     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
306     ## (This requirement was dropped from HTML5 spec, unfortunately.)
307    
## Lookup table of code points treated as space characters: a key is
## present (with a true value) only for those code points.
## LINE TABULATION (0x000B) and CARRIAGE RETURN (0x000D) have no entry.
my $is_space = {};
$is_space->{$_} = 1
    for 0x0009, # CHARACTER TABULATION (HT)
        0x000A, # LINE FEED (LF)
        0x000C, # FORM FEED (FF) ## XML5: Not a space character.
        0x0020; # SPACE (SP)
316    
317     sub _get_next_token ($) {
318     my $self = shift;
319    
320     if ($self->{self_closing}) {
321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
322     ## NOTE: The |self_closing| flag is only set by start tag token.
323     ## In addition, when a start tag token is emitted, it is always set to
324     ## |ct|.
325     delete $self->{self_closing};
326     }
327    
328     if (@{$self->{token}}) {
329     $self->{self_closing} = $self->{token}->[0]->{self_closing};
330     return shift @{$self->{token}};
331     }
332    
333     A: {
334     if ($self->{state} == PCDATA_STATE) {
335     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
336    
337     if ($self->{nc} == 0x0026) { # &
338    
339     ## NOTE: In the spec, the tokenizer is switched to the
340     ## "entity data state". In this implementation, the tokenizer
341     ## is switched to the |ENTITY_STATE|, which is an implementation
342     ## of the "consume a character reference" algorithm.
343     $self->{entity_add} = -1;
344     $self->{prev_state} = DATA_STATE;
345     $self->{state} = ENTITY_STATE;
346    
347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
348     $self->{line_prev} = $self->{line};
349     $self->{column_prev} = $self->{column};
350     $self->{column}++;
351     $self->{nc}
352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
353     } else {
354     $self->{set_nc}->($self);
355     }
356    
357     redo A;
358     } elsif ($self->{nc} == 0x003C) { # <
359    
360     $self->{state} = TAG_OPEN_STATE;
361    
362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
363     $self->{line_prev} = $self->{line};
364     $self->{column_prev} = $self->{column};
365     $self->{column}++;
366     $self->{nc}
367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
368     } else {
369     $self->{set_nc}->($self);
370     }
371    
372     redo A;
373     } elsif ($self->{nc} == -1) {
374    
375     return ({type => END_OF_FILE_TOKEN,
376     line => $self->{line}, column => $self->{column}});
377     last A; ## TODO: ok?
378     } else {
379    
380     #
381     }
382    
383     # Anything else
384     my $token = {type => CHARACTER_TOKEN,
385     data => chr $self->{nc},
386     line => $self->{line}, column => $self->{column},
387     };
388     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
389    
390     ## Stay in the state.
391    
392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
393     $self->{line_prev} = $self->{line};
394     $self->{column_prev} = $self->{column};
395     $self->{column}++;
396     $self->{nc}
397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
398     } else {
399     $self->{set_nc}->($self);
400     }
401    
402     return ($token);
403     redo A;
404     } elsif ($self->{state} == DATA_STATE) {
405     $self->{s_kwd} = '' unless defined $self->{s_kwd};
406     if ($self->{nc} == 0x0026) { # &
407     $self->{s_kwd} = '';
408     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
409     not $self->{escape}) {
410    
411     ## NOTE: In the spec, the tokenizer is switched to the
412     ## "entity data state". In this implementation, the tokenizer
413     ## is switched to the |ENTITY_STATE|, which is an implementation
414     ## of the "consume a character reference" algorithm.
415     $self->{entity_add} = -1;
416     $self->{prev_state} = DATA_STATE;
417     $self->{state} = ENTITY_STATE;
418    
419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
420     $self->{line_prev} = $self->{line};
421     $self->{column_prev} = $self->{column};
422     $self->{column}++;
423     $self->{nc}
424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
425     } else {
426     $self->{set_nc}->($self);
427     }
428    
429     redo A;
430     } else {
431    
432     #
433     }
434     } elsif ($self->{nc} == 0x002D) { # -
435     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
436 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
437 wakaba 1.1
438     $self->{escape} = 1; # unless $self->{escape};
439     $self->{s_kwd} = '--';
440     #
441 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
442 wakaba 1.1
443     $self->{s_kwd} = '--';
444     #
445 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
446    
447     $self->{s_kwd} .= '-';
448     #
449 wakaba 1.1 } else {
450    
451 wakaba 1.5 $self->{s_kwd} = '-';
452 wakaba 1.1 #
453     }
454     }
455    
456     #
457     } elsif ($self->{nc} == 0x0021) { # !
458     if (length $self->{s_kwd}) {
459    
460     $self->{s_kwd} .= '!';
461     #
462     } else {
463    
464     #$self->{s_kwd} = '';
465     #
466     }
467     #
468     } elsif ($self->{nc} == 0x003C) { # <
469     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
470     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
471     not $self->{escape})) {
472    
473     $self->{state} = TAG_OPEN_STATE;
474    
475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
476     $self->{line_prev} = $self->{line};
477     $self->{column_prev} = $self->{column};
478     $self->{column}++;
479     $self->{nc}
480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
481     } else {
482     $self->{set_nc}->($self);
483     }
484    
485     redo A;
486     } else {
487    
488     $self->{s_kwd} = '';
489     #
490     }
491     } elsif ($self->{nc} == 0x003E) { # >
492     if ($self->{escape} and
493     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
494     if ($self->{s_kwd} eq '--') {
495    
496     delete $self->{escape};
497 wakaba 1.5 #
498 wakaba 1.1 } else {
499    
500 wakaba 1.5 #
501 wakaba 1.1 }
502 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
503    
504     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
505     line => $self->{line_prev},
506     column => $self->{column_prev} - 1);
507     #
508 wakaba 1.1 } else {
509    
510 wakaba 1.5 #
511 wakaba 1.1 }
512    
513     $self->{s_kwd} = '';
514     #
515 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
516     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
517    
518     $self->{s_kwd} .= ']';
519     } elsif ($self->{s_kwd} eq ']]') {
520    
521     #
522     } else {
523    
524     $self->{s_kwd} = '';
525     }
526     #
527 wakaba 1.1 } elsif ($self->{nc} == -1) {
528    
529     $self->{s_kwd} = '';
530     return ({type => END_OF_FILE_TOKEN,
531     line => $self->{line}, column => $self->{column}});
532     last A; ## TODO: ok?
533     } else {
534    
535     $self->{s_kwd} = '';
536     #
537     }
538    
539     # Anything else
540     my $token = {type => CHARACTER_TOKEN,
541     data => chr $self->{nc},
542     line => $self->{line}, column => $self->{column},
543     };
544 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
545 wakaba 1.1 length $token->{data})) {
546     $self->{s_kwd} = '';
547     }
548    
549     ## Stay in the data state.
550 wakaba 1.5 if (not $self->{is_xml} and
551     $self->{content_model} == PCDATA_CONTENT_MODEL) {
552 wakaba 1.1
553     $self->{state} = PCDATA_STATE;
554     } else {
555    
556     ## Stay in the state.
557     }
558    
559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
560     $self->{line_prev} = $self->{line};
561     $self->{column_prev} = $self->{column};
562     $self->{column}++;
563     $self->{nc}
564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
565     } else {
566     $self->{set_nc}->($self);
567     }
568    
569     return ($token);
570     redo A;
571     } elsif ($self->{state} == TAG_OPEN_STATE) {
572 wakaba 1.10 ## XML5: "tag state".
573    
574 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
575     if ($self->{nc} == 0x002F) { # /
576    
577    
578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
579     $self->{line_prev} = $self->{line};
580     $self->{column_prev} = $self->{column};
581     $self->{column}++;
582     $self->{nc}
583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
584     } else {
585     $self->{set_nc}->($self);
586     }
587    
588     $self->{state} = CLOSE_TAG_OPEN_STATE;
589     redo A;
590     } elsif ($self->{nc} == 0x0021) { # !
591    
592 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
593 wakaba 1.1 #
594     } else {
595    
596 wakaba 1.12 $self->{s_kwd} = '';
597 wakaba 1.1 #
598     }
599    
600     ## reconsume
601     $self->{state} = DATA_STATE;
602     return ({type => CHARACTER_TOKEN, data => '<',
603     line => $self->{line_prev},
604     column => $self->{column_prev},
605     });
606     redo A;
607     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
608     if ($self->{nc} == 0x0021) { # !
609    
610     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
611    
612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
613     $self->{line_prev} = $self->{line};
614     $self->{column_prev} = $self->{column};
615     $self->{column}++;
616     $self->{nc}
617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
618     } else {
619     $self->{set_nc}->($self);
620     }
621    
622     redo A;
623     } elsif ($self->{nc} == 0x002F) { # /
624    
625     $self->{state} = CLOSE_TAG_OPEN_STATE;
626    
627     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
628     $self->{line_prev} = $self->{line};
629     $self->{column_prev} = $self->{column};
630     $self->{column}++;
631     $self->{nc}
632     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
633     } else {
634     $self->{set_nc}->($self);
635     }
636    
637     redo A;
638     } elsif (0x0041 <= $self->{nc} and
639     $self->{nc} <= 0x005A) { # A..Z
640    
641     $self->{ct}
642     = {type => START_TAG_TOKEN,
643 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
644 wakaba 1.1 line => $self->{line_prev},
645     column => $self->{column_prev}};
646     $self->{state} = TAG_NAME_STATE;
647    
648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
649     $self->{line_prev} = $self->{line};
650     $self->{column_prev} = $self->{column};
651     $self->{column}++;
652     $self->{nc}
653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
654     } else {
655     $self->{set_nc}->($self);
656     }
657    
658     redo A;
659     } elsif (0x0061 <= $self->{nc} and
660     $self->{nc} <= 0x007A) { # a..z
661    
662     $self->{ct} = {type => START_TAG_TOKEN,
663     tag_name => chr ($self->{nc}),
664     line => $self->{line_prev},
665     column => $self->{column_prev}};
666     $self->{state} = TAG_NAME_STATE;
667    
668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
669     $self->{line_prev} = $self->{line};
670     $self->{column_prev} = $self->{column};
671     $self->{column}++;
672     $self->{nc}
673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
674     } else {
675     $self->{set_nc}->($self);
676     }
677    
678     redo A;
679     } elsif ($self->{nc} == 0x003E) { # >
680    
681     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
682     line => $self->{line_prev},
683     column => $self->{column_prev});
684     $self->{state} = DATA_STATE;
685 wakaba 1.5 $self->{s_kwd} = '';
686 wakaba 1.1
687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
688     $self->{line_prev} = $self->{line};
689     $self->{column_prev} = $self->{column};
690     $self->{column}++;
691     $self->{nc}
692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
693     } else {
694     $self->{set_nc}->($self);
695     }
696    
697    
698     return ({type => CHARACTER_TOKEN, data => '<>',
699     line => $self->{line_prev},
700     column => $self->{column_prev},
701     });
702    
703     redo A;
704     } elsif ($self->{nc} == 0x003F) { # ?
705 wakaba 1.8 if ($self->{is_xml}) {
706    
707     $self->{state} = PI_STATE;
708    
709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
710     $self->{line_prev} = $self->{line};
711     $self->{column_prev} = $self->{column};
712     $self->{column}++;
713     $self->{nc}
714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
715     } else {
716     $self->{set_nc}->($self);
717     }
718    
719     redo A;
720     } else {
721    
722     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
723     line => $self->{line_prev},
724     column => $self->{column_prev});
725     $self->{state} = BOGUS_COMMENT_STATE;
726     $self->{ct} = {type => COMMENT_TOKEN, data => '',
727     line => $self->{line_prev},
728     column => $self->{column_prev},
729     };
730     ## $self->{nc} is intentionally left as is
731     redo A;
732     }
733 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
734 wakaba 1.1
735     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
736     line => $self->{line_prev},
737     column => $self->{column_prev});
738     $self->{state} = DATA_STATE;
739 wakaba 1.5 $self->{s_kwd} = '';
740 wakaba 1.1 ## reconsume
741    
742     return ({type => CHARACTER_TOKEN, data => '<',
743     line => $self->{line_prev},
744     column => $self->{column_prev},
745     });
746    
747     redo A;
748 wakaba 1.9 } else {
749     ## XML5: "<:" is a parse error.
750    
751     $self->{ct} = {type => START_TAG_TOKEN,
752     tag_name => chr ($self->{nc}),
753     line => $self->{line_prev},
754     column => $self->{column_prev}};
755     $self->{state} = TAG_NAME_STATE;
756    
757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
758     $self->{line_prev} = $self->{line};
759     $self->{column_prev} = $self->{column};
760     $self->{column}++;
761     $self->{nc}
762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
763     } else {
764     $self->{set_nc}->($self);
765     }
766    
767     redo A;
768 wakaba 1.1 }
769     } else {
770     die "$0: $self->{content_model} in tag open";
771     }
772     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
773     ## NOTE: The "close tag open state" in the spec is implemented as
774     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
775    
776 wakaba 1.10 ## XML5: "end tag state".
777    
778 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
779     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
780     if (defined $self->{last_stag_name}) {
781     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
782 wakaba 1.12 $self->{kwd} = '';
783 wakaba 1.1 ## Reconsume.
784     redo A;
785     } else {
786     ## No start tag token has ever been emitted
787     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
788    
789     $self->{state} = DATA_STATE;
790 wakaba 1.5 $self->{s_kwd} = '';
791 wakaba 1.1 ## Reconsume.
792     return ({type => CHARACTER_TOKEN, data => '</',
793     line => $l, column => $c,
794     });
795     redo A;
796     }
797     }
798    
799     if (0x0041 <= $self->{nc} and
800     $self->{nc} <= 0x005A) { # A..Z
801    
802     $self->{ct}
803     = {type => END_TAG_TOKEN,
804 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
805 wakaba 1.1 line => $l, column => $c};
806     $self->{state} = TAG_NAME_STATE;
807    
808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
809     $self->{line_prev} = $self->{line};
810     $self->{column_prev} = $self->{column};
811     $self->{column}++;
812     $self->{nc}
813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
814     } else {
815     $self->{set_nc}->($self);
816     }
817    
818     redo A;
819     } elsif (0x0061 <= $self->{nc} and
820     $self->{nc} <= 0x007A) { # a..z
821    
822     $self->{ct} = {type => END_TAG_TOKEN,
823     tag_name => chr ($self->{nc}),
824     line => $l, column => $c};
825     $self->{state} = TAG_NAME_STATE;
826    
827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
828     $self->{line_prev} = $self->{line};
829     $self->{column_prev} = $self->{column};
830     $self->{column}++;
831     $self->{nc}
832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
833     } else {
834     $self->{set_nc}->($self);
835     }
836    
837     redo A;
838     } elsif ($self->{nc} == 0x003E) { # >
839     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
840     line => $self->{line_prev}, ## "<" in "</>"
841     column => $self->{column_prev} - 1);
842     $self->{state} = DATA_STATE;
843 wakaba 1.5 $self->{s_kwd} = '';
844 wakaba 1.10 if ($self->{is_xml}) {
845    
846     ## XML5: No parse error.
847    
848     ## NOTE: This parser raises a parse error, since it supports
849     ## XML1, not XML5.
850    
851     ## NOTE: A short end tag token.
852     my $ct = {type => END_TAG_TOKEN,
853     tag_name => '',
854     line => $self->{line_prev},
855     column => $self->{column_prev} - 1,
856     };
857    
858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
859     $self->{line_prev} = $self->{line};
860     $self->{column_prev} = $self->{column};
861     $self->{column}++;
862     $self->{nc}
863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
864     } else {
865     $self->{set_nc}->($self);
866     }
867    
868     return ($ct);
869     } else {
870    
871    
872 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
873     $self->{line_prev} = $self->{line};
874     $self->{column_prev} = $self->{column};
875     $self->{column}++;
876     $self->{nc}
877     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
878     } else {
879     $self->{set_nc}->($self);
880     }
881    
882 wakaba 1.10 }
883 wakaba 1.1 redo A;
884     } elsif ($self->{nc} == -1) {
885    
886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
887 wakaba 1.5 $self->{s_kwd} = '';
888 wakaba 1.1 $self->{state} = DATA_STATE;
889     # reconsume
890    
891     return ({type => CHARACTER_TOKEN, data => '</',
892     line => $l, column => $c,
893     });
894    
895     redo A;
896 wakaba 1.10 } elsif (not $self->{is_xml} or
897     $is_space->{$self->{nc}}) {
898 wakaba 1.1
899 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
900     line => $self->{line_prev}, # "<" of "</"
901     column => $self->{column_prev} - 1);
902 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
903     $self->{ct} = {type => COMMENT_TOKEN, data => '',
904     line => $self->{line_prev}, # "<" of "</"
905     column => $self->{column_prev} - 1,
906     };
907     ## NOTE: $self->{nc} is intentionally left as is.
908     ## Although the "anything else" case of the spec does not
909     ## explicitly state that the next input character is to be
910     ## reconsumed, it will be included in the |data| of the comment
911     ## token generated from the bogus end tag, as defined in the
912     ## "bogus comment state" entry.
913     redo A;
914 wakaba 1.10 } else {
915     ## XML5: "</:" is a parse error.
916    
917     $self->{ct} = {type => END_TAG_TOKEN,
918     tag_name => chr ($self->{nc}),
919     line => $l, column => $c};
920     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
921    
922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
923     $self->{line_prev} = $self->{line};
924     $self->{column_prev} = $self->{column};
925     $self->{column}++;
926     $self->{nc}
927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
928     } else {
929     $self->{set_nc}->($self);
930     }
931    
932     redo A;
933 wakaba 1.1 }
934     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
935 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
936 wakaba 1.1 if (length $ch) {
937     my $CH = $ch;
938     $ch =~ tr/a-z/A-Z/;
939     my $nch = chr $self->{nc};
940     if ($nch eq $ch or $nch eq $CH) {
941    
942     ## Stay in the state.
943 wakaba 1.12 $self->{kwd} .= $nch;
944 wakaba 1.1
945     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
946     $self->{line_prev} = $self->{line};
947     $self->{column_prev} = $self->{column};
948     $self->{column}++;
949     $self->{nc}
950     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
951     } else {
952     $self->{set_nc}->($self);
953     }
954    
955     redo A;
956     } else {
957    
958     $self->{state} = DATA_STATE;
959 wakaba 1.5 $self->{s_kwd} = '';
960 wakaba 1.1 ## Reconsume.
961     return ({type => CHARACTER_TOKEN,
962 wakaba 1.12 data => '</' . $self->{kwd},
963 wakaba 1.1 line => $self->{line_prev},
964 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
965 wakaba 1.1 });
966     redo A;
967     }
968     } else { # after "<{tag-name}"
969     unless ($is_space->{$self->{nc}} or
970     {
971     0x003E => 1, # >
972     0x002F => 1, # /
973     -1 => 1, # EOF
974     }->{$self->{nc}}) {
975    
976     ## Reconsume.
977     $self->{state} = DATA_STATE;
978 wakaba 1.5 $self->{s_kwd} = '';
979 wakaba 1.1 return ({type => CHARACTER_TOKEN,
980 wakaba 1.12 data => '</' . $self->{kwd},
981 wakaba 1.1 line => $self->{line_prev},
982 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
983 wakaba 1.1 });
984     redo A;
985     } else {
986    
987     $self->{ct}
988     = {type => END_TAG_TOKEN,
989     tag_name => $self->{last_stag_name},
990     line => $self->{line_prev},
991 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
992 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
993     ## Reconsume.
994     redo A;
995     }
996     }
997     } elsif ($self->{state} == TAG_NAME_STATE) {
998     if ($is_space->{$self->{nc}}) {
999    
1000     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1001    
1002     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1003     $self->{line_prev} = $self->{line};
1004     $self->{column_prev} = $self->{column};
1005     $self->{column}++;
1006     $self->{nc}
1007     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1008     } else {
1009     $self->{set_nc}->($self);
1010     }
1011    
1012     redo A;
1013     } elsif ($self->{nc} == 0x003E) { # >
1014     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1015    
1016     $self->{last_stag_name} = $self->{ct}->{tag_name};
1017     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1018     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1019     #if ($self->{ct}->{attributes}) {
1020     # ## NOTE: This should never be reached.
1021     # !!! cp (36);
1022     # !!! parse-error (type => 'end tag attribute');
1023     #} else {
1024    
1025     #}
1026     } else {
1027     die "$0: $self->{ct}->{type}: Unknown token type";
1028     }
1029     $self->{state} = DATA_STATE;
1030 wakaba 1.5 $self->{s_kwd} = '';
1031 wakaba 1.1
1032     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1033     $self->{line_prev} = $self->{line};
1034     $self->{column_prev} = $self->{column};
1035     $self->{column}++;
1036     $self->{nc}
1037     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1038     } else {
1039     $self->{set_nc}->($self);
1040     }
1041    
1042    
1043     return ($self->{ct}); # start tag or end tag
1044    
1045     redo A;
1046     } elsif (0x0041 <= $self->{nc} and
1047     $self->{nc} <= 0x005A) { # A..Z
1048    
1049 wakaba 1.4 $self->{ct}->{tag_name}
1050     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1051 wakaba 1.1 # start tag or end tag
1052     ## Stay in this state
1053    
1054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1055     $self->{line_prev} = $self->{line};
1056     $self->{column_prev} = $self->{column};
1057     $self->{column}++;
1058     $self->{nc}
1059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1060     } else {
1061     $self->{set_nc}->($self);
1062     }
1063    
1064     redo A;
1065     } elsif ($self->{nc} == -1) {
1066     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1067     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1068    
1069     $self->{last_stag_name} = $self->{ct}->{tag_name};
1070     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1071     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1072     #if ($self->{ct}->{attributes}) {
1073     # ## NOTE: This state should never be reached.
1074     # !!! cp (40);
1075     # !!! parse-error (type => 'end tag attribute');
1076     #} else {
1077    
1078     #}
1079     } else {
1080     die "$0: $self->{ct}->{type}: Unknown token type";
1081     }
1082     $self->{state} = DATA_STATE;
1083 wakaba 1.5 $self->{s_kwd} = '';
1084 wakaba 1.1 # reconsume
1085    
1086     return ($self->{ct}); # start tag or end tag
1087    
1088     redo A;
1089     } elsif ($self->{nc} == 0x002F) { # /
1090    
1091     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1092    
1093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1094     $self->{line_prev} = $self->{line};
1095     $self->{column_prev} = $self->{column};
1096     $self->{column}++;
1097     $self->{nc}
1098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1099     } else {
1100     $self->{set_nc}->($self);
1101     }
1102    
1103     redo A;
1104     } else {
1105    
1106     $self->{ct}->{tag_name} .= chr $self->{nc};
1107     # start tag or end tag
1108     ## Stay in the state
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     }
1122     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1123 wakaba 1.11 ## XML5: "Tag attribute name before state".
1124    
1125 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1126    
1127     ## Stay in the state
1128    
1129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1130     $self->{line_prev} = $self->{line};
1131     $self->{column_prev} = $self->{column};
1132     $self->{column}++;
1133     $self->{nc}
1134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1135     } else {
1136     $self->{set_nc}->($self);
1137     }
1138    
1139     redo A;
1140     } elsif ($self->{nc} == 0x003E) { # >
1141     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1142    
1143     $self->{last_stag_name} = $self->{ct}->{tag_name};
1144     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1145     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1146     if ($self->{ct}->{attributes}) {
1147    
1148     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1149     } else {
1150    
1151     }
1152     } else {
1153     die "$0: $self->{ct}->{type}: Unknown token type";
1154     }
1155     $self->{state} = DATA_STATE;
1156 wakaba 1.5 $self->{s_kwd} = '';
1157 wakaba 1.1
1158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1159     $self->{line_prev} = $self->{line};
1160     $self->{column_prev} = $self->{column};
1161     $self->{column}++;
1162     $self->{nc}
1163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1164     } else {
1165     $self->{set_nc}->($self);
1166     }
1167    
1168    
1169     return ($self->{ct}); # start tag or end tag
1170    
1171     redo A;
1172     } elsif (0x0041 <= $self->{nc} and
1173     $self->{nc} <= 0x005A) { # A..Z
1174    
1175     $self->{ca}
1176 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1177 wakaba 1.1 value => '',
1178     line => $self->{line}, column => $self->{column}};
1179     $self->{state} = ATTRIBUTE_NAME_STATE;
1180    
1181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1182     $self->{line_prev} = $self->{line};
1183     $self->{column_prev} = $self->{column};
1184     $self->{column}++;
1185     $self->{nc}
1186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1187     } else {
1188     $self->{set_nc}->($self);
1189     }
1190    
1191     redo A;
1192     } elsif ($self->{nc} == 0x002F) { # /
1193    
1194     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1195    
1196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1197     $self->{line_prev} = $self->{line};
1198     $self->{column_prev} = $self->{column};
1199     $self->{column}++;
1200     $self->{nc}
1201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1202     } else {
1203     $self->{set_nc}->($self);
1204     }
1205    
1206     redo A;
1207     } elsif ($self->{nc} == -1) {
1208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1209     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1210    
1211     $self->{last_stag_name} = $self->{ct}->{tag_name};
1212     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1213     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1214     if ($self->{ct}->{attributes}) {
1215    
1216     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1217     } else {
1218    
1219     }
1220     } else {
1221     die "$0: $self->{ct}->{type}: Unknown token type";
1222     }
1223     $self->{state} = DATA_STATE;
1224 wakaba 1.5 $self->{s_kwd} = '';
1225 wakaba 1.1 # reconsume
1226    
1227     return ($self->{ct}); # start tag or end tag
1228    
1229     redo A;
1230     } else {
1231     if ({
1232     0x0022 => 1, # "
1233     0x0027 => 1, # '
1234     0x003D => 1, # =
1235     }->{$self->{nc}}) {
1236    
1237 wakaba 1.11 ## XML5: Not a parse error.
1238 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1239     } else {
1240    
1241 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1242 wakaba 1.1 }
1243     $self->{ca}
1244     = {name => chr ($self->{nc}),
1245     value => '',
1246     line => $self->{line}, column => $self->{column}};
1247     $self->{state} = ATTRIBUTE_NAME_STATE;
1248    
1249     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1250     $self->{line_prev} = $self->{line};
1251     $self->{column_prev} = $self->{column};
1252     $self->{column}++;
1253     $self->{nc}
1254     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1255     } else {
1256     $self->{set_nc}->($self);
1257     }
1258    
1259     redo A;
1260     }
1261     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1262 wakaba 1.11 ## XML5: "Tag attribute name state".
1263    
1264 wakaba 1.1 my $before_leave = sub {
1265     if (exists $self->{ct}->{attributes} # start tag or end tag
1266     ->{$self->{ca}->{name}}) { # MUST
1267    
1268     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1269     ## Discard $self->{ca} # MUST
1270     } else {
1271    
1272     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1273     = $self->{ca};
1274 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1275 wakaba 1.1 }
1276     }; # $before_leave
1277    
1278     if ($is_space->{$self->{nc}}) {
1279    
1280     $before_leave->();
1281     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1282    
1283     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1284     $self->{line_prev} = $self->{line};
1285     $self->{column_prev} = $self->{column};
1286     $self->{column}++;
1287     $self->{nc}
1288     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1289     } else {
1290     $self->{set_nc}->($self);
1291     }
1292    
1293     redo A;
1294     } elsif ($self->{nc} == 0x003D) { # =
1295    
1296     $before_leave->();
1297     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1298    
1299     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1300     $self->{line_prev} = $self->{line};
1301     $self->{column_prev} = $self->{column};
1302     $self->{column}++;
1303     $self->{nc}
1304     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1305     } else {
1306     $self->{set_nc}->($self);
1307     }
1308    
1309     redo A;
1310     } elsif ($self->{nc} == 0x003E) { # >
1311 wakaba 1.11 if ($self->{is_xml}) {
1312    
1313     ## XML5: Not a parse error.
1314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1315     } else {
1316    
1317     }
1318    
1319 wakaba 1.1 $before_leave->();
1320     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321    
1322     $self->{last_stag_name} = $self->{ct}->{tag_name};
1323     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324    
1325     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1326     if ($self->{ct}->{attributes}) {
1327     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1328     }
1329     } else {
1330     die "$0: $self->{ct}->{type}: Unknown token type";
1331     }
1332     $self->{state} = DATA_STATE;
1333 wakaba 1.5 $self->{s_kwd} = '';
1334 wakaba 1.1
1335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1336     $self->{line_prev} = $self->{line};
1337     $self->{column_prev} = $self->{column};
1338     $self->{column}++;
1339     $self->{nc}
1340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1341     } else {
1342     $self->{set_nc}->($self);
1343     }
1344    
1345    
1346     return ($self->{ct}); # start tag or end tag
1347    
1348     redo A;
1349     } elsif (0x0041 <= $self->{nc} and
1350     $self->{nc} <= 0x005A) { # A..Z
1351    
1352 wakaba 1.4 $self->{ca}->{name}
1353     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1354 wakaba 1.1 ## Stay in the state
1355    
1356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1357     $self->{line_prev} = $self->{line};
1358     $self->{column_prev} = $self->{column};
1359     $self->{column}++;
1360     $self->{nc}
1361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1362     } else {
1363     $self->{set_nc}->($self);
1364     }
1365    
1366     redo A;
1367     } elsif ($self->{nc} == 0x002F) { # /
1368 wakaba 1.11 if ($self->{is_xml}) {
1369    
1370     ## XML5: Not a parse error.
1371     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1372     } else {
1373    
1374     }
1375 wakaba 1.1
1376     $before_leave->();
1377     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1378    
1379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1380     $self->{line_prev} = $self->{line};
1381     $self->{column_prev} = $self->{column};
1382     $self->{column}++;
1383     $self->{nc}
1384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1385     } else {
1386     $self->{set_nc}->($self);
1387     }
1388    
1389     redo A;
1390     } elsif ($self->{nc} == -1) {
1391     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1392     $before_leave->();
1393     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1394    
1395     $self->{last_stag_name} = $self->{ct}->{tag_name};
1396     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1397     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1398     if ($self->{ct}->{attributes}) {
1399    
1400     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1401     } else {
1402     ## NOTE: This state should never be reached.
1403    
1404     }
1405     } else {
1406     die "$0: $self->{ct}->{type}: Unknown token type";
1407     }
1408     $self->{state} = DATA_STATE;
1409 wakaba 1.5 $self->{s_kwd} = '';
1410 wakaba 1.1 # reconsume
1411    
1412     return ($self->{ct}); # start tag or end tag
1413    
1414     redo A;
1415     } else {
1416     if ($self->{nc} == 0x0022 or # "
1417     $self->{nc} == 0x0027) { # '
1418    
1419 wakaba 1.11 ## XML5: Not a parse error.
1420 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1421     } else {
1422    
1423     }
1424     $self->{ca}->{name} .= chr ($self->{nc});
1425     ## Stay in the state
1426    
1427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1428     $self->{line_prev} = $self->{line};
1429     $self->{column_prev} = $self->{column};
1430     $self->{column}++;
1431     $self->{nc}
1432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1433     } else {
1434     $self->{set_nc}->($self);
1435     }
1436    
1437     redo A;
1438     }
1439     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1440 wakaba 1.11 ## XML5: "Tag attribute name after state".
1441    
1442 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1443    
1444     ## Stay in the state
1445    
1446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1447     $self->{line_prev} = $self->{line};
1448     $self->{column_prev} = $self->{column};
1449     $self->{column}++;
1450     $self->{nc}
1451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1452     } else {
1453     $self->{set_nc}->($self);
1454     }
1455    
1456     redo A;
1457     } elsif ($self->{nc} == 0x003D) { # =
1458    
1459     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1460    
1461     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1462     $self->{line_prev} = $self->{line};
1463     $self->{column_prev} = $self->{column};
1464     $self->{column}++;
1465     $self->{nc}
1466     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1467     } else {
1468     $self->{set_nc}->($self);
1469     }
1470    
1471     redo A;
1472     } elsif ($self->{nc} == 0x003E) { # >
1473 wakaba 1.11 if ($self->{is_xml}) {
1474    
1475     ## XML5: Not a parse error.
1476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1477     } else {
1478    
1479     }
1480    
1481 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1482    
1483     $self->{last_stag_name} = $self->{ct}->{tag_name};
1484     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1485     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1486     if ($self->{ct}->{attributes}) {
1487    
1488     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1489     } else {
1490     ## NOTE: This state should never be reached.
1491    
1492     }
1493     } else {
1494     die "$0: $self->{ct}->{type}: Unknown token type";
1495     }
1496     $self->{state} = DATA_STATE;
1497 wakaba 1.5 $self->{s_kwd} = '';
1498 wakaba 1.1
1499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1500     $self->{line_prev} = $self->{line};
1501     $self->{column_prev} = $self->{column};
1502     $self->{column}++;
1503     $self->{nc}
1504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1505     } else {
1506     $self->{set_nc}->($self);
1507     }
1508    
1509    
1510     return ($self->{ct}); # start tag or end tag
1511    
1512     redo A;
1513     } elsif (0x0041 <= $self->{nc} and
1514     $self->{nc} <= 0x005A) { # A..Z
1515    
1516     $self->{ca}
1517 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1518 wakaba 1.1 value => '',
1519     line => $self->{line}, column => $self->{column}};
1520     $self->{state} = ATTRIBUTE_NAME_STATE;
1521    
1522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1523     $self->{line_prev} = $self->{line};
1524     $self->{column_prev} = $self->{column};
1525     $self->{column}++;
1526     $self->{nc}
1527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1528     } else {
1529     $self->{set_nc}->($self);
1530     }
1531    
1532     redo A;
1533     } elsif ($self->{nc} == 0x002F) { # /
1534 wakaba 1.11 if ($self->{is_xml}) {
1535    
1536     ## XML5: Not a parse error.
1537     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1538     } else {
1539    
1540     }
1541 wakaba 1.1
1542     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1543    
1544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1545     $self->{line_prev} = $self->{line};
1546     $self->{column_prev} = $self->{column};
1547     $self->{column}++;
1548     $self->{nc}
1549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1550     } else {
1551     $self->{set_nc}->($self);
1552     }
1553    
1554     redo A;
1555     } elsif ($self->{nc} == -1) {
1556     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1557     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1558    
1559     $self->{last_stag_name} = $self->{ct}->{tag_name};
1560     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1561     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1562     if ($self->{ct}->{attributes}) {
1563    
1564     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1565     } else {
1566     ## NOTE: This state should never be reached.
1567    
1568     }
1569     } else {
1570     die "$0: $self->{ct}->{type}: Unknown token type";
1571     }
1572 wakaba 1.5 $self->{s_kwd} = '';
1573 wakaba 1.1 $self->{state} = DATA_STATE;
1574     # reconsume
1575    
1576     return ($self->{ct}); # start tag or end tag
1577    
1578     redo A;
1579     } else {
1580 wakaba 1.11 if ($self->{is_xml}) {
1581    
1582     ## XML5: Not a parse error.
1583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1584     } else {
1585    
1586     }
1587    
1588 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1589     $self->{nc} == 0x0027) { # '
1590    
1591 wakaba 1.11 ## XML5: Not a parse error.
1592 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1593     } else {
1594    
1595     }
1596     $self->{ca}
1597     = {name => chr ($self->{nc}),
1598     value => '',
1599     line => $self->{line}, column => $self->{column}};
1600     $self->{state} = ATTRIBUTE_NAME_STATE;
1601    
1602     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1603     $self->{line_prev} = $self->{line};
1604     $self->{column_prev} = $self->{column};
1605     $self->{column}++;
1606     $self->{nc}
1607     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1608     } else {
1609     $self->{set_nc}->($self);
1610     }
1611    
1612     redo A;
1613     }
1614     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1615 wakaba 1.11 ## XML5: "Tag attribute value before state".
1616    
1617 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1618    
1619     ## Stay in the state
1620    
1621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1622     $self->{line_prev} = $self->{line};
1623     $self->{column_prev} = $self->{column};
1624     $self->{column}++;
1625     $self->{nc}
1626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1627     } else {
1628     $self->{set_nc}->($self);
1629     }
1630    
1631     redo A;
1632     } elsif ($self->{nc} == 0x0022) { # "
1633    
1634     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1635    
1636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1637     $self->{line_prev} = $self->{line};
1638     $self->{column_prev} = $self->{column};
1639     $self->{column}++;
1640     $self->{nc}
1641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1642     } else {
1643     $self->{set_nc}->($self);
1644     }
1645    
1646     redo A;
1647     } elsif ($self->{nc} == 0x0026) { # &
1648    
1649     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1650     ## reconsume
1651     redo A;
1652     } elsif ($self->{nc} == 0x0027) { # '
1653    
1654     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1655    
1656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1657     $self->{line_prev} = $self->{line};
1658     $self->{column_prev} = $self->{column};
1659     $self->{column}++;
1660     $self->{nc}
1661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1662     } else {
1663     $self->{set_nc}->($self);
1664     }
1665    
1666     redo A;
1667     } elsif ($self->{nc} == 0x003E) { # >
1668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1669     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1670    
1671     $self->{last_stag_name} = $self->{ct}->{tag_name};
1672     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1673     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1674     if ($self->{ct}->{attributes}) {
1675    
1676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1677     } else {
1678     ## NOTE: This state should never be reached.
1679    
1680     }
1681     } else {
1682     die "$0: $self->{ct}->{type}: Unknown token type";
1683     }
1684     $self->{state} = DATA_STATE;
1685 wakaba 1.5 $self->{s_kwd} = '';
1686 wakaba 1.1
1687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688     $self->{line_prev} = $self->{line};
1689     $self->{column_prev} = $self->{column};
1690     $self->{column}++;
1691     $self->{nc}
1692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693     } else {
1694     $self->{set_nc}->($self);
1695     }
1696    
1697    
1698     return ($self->{ct}); # start tag or end tag
1699    
1700     redo A;
1701     } elsif ($self->{nc} == -1) {
1702     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1703     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1704    
1705     $self->{last_stag_name} = $self->{ct}->{tag_name};
1706     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1707     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1708     if ($self->{ct}->{attributes}) {
1709    
1710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1711     } else {
1712     ## NOTE: This state should never be reached.
1713    
1714     }
1715     } else {
1716     die "$0: $self->{ct}->{type}: Unknown token type";
1717     }
1718     $self->{state} = DATA_STATE;
1719 wakaba 1.5 $self->{s_kwd} = '';
1720 wakaba 1.1 ## reconsume
1721    
1722     return ($self->{ct}); # start tag or end tag
1723    
1724     redo A;
1725     } else {
1726     if ($self->{nc} == 0x003D) { # =
1727    
1728 wakaba 1.11 ## XML5: Not a parse error.
1729 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1730 wakaba 1.11 } elsif ($self->{is_xml}) {
1731    
1732     ## XML5: No parse error.
1733     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1734 wakaba 1.1 } else {
1735    
1736     }
1737     $self->{ca}->{value} .= chr ($self->{nc});
1738     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1739    
1740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1741     $self->{line_prev} = $self->{line};
1742     $self->{column_prev} = $self->{column};
1743     $self->{column}++;
1744     $self->{nc}
1745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1746     } else {
1747     $self->{set_nc}->($self);
1748     }
1749    
1750     redo A;
1751     }
1752     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1753 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1754     ## ATTLIST attribute value double quoted state".
1755 wakaba 1.11
1756 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1757 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1758    
1759     ## XML5: "DOCTYPE ATTLIST name after state".
1760     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1761     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1762     } else {
1763    
1764     ## XML5: "Tag attribute name before state".
1765     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1766     }
1767 wakaba 1.1
1768     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1769     $self->{line_prev} = $self->{line};
1770     $self->{column_prev} = $self->{column};
1771     $self->{column}++;
1772     $self->{nc}
1773     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1774     } else {
1775     $self->{set_nc}->($self);
1776     }
1777    
1778     redo A;
1779     } elsif ($self->{nc} == 0x0026) { # &
1780    
1781 wakaba 1.11 ## XML5: Not defined yet.
1782    
1783 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1784     ## "entity in attribute value state". In this implementation, the
1785     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1786     ## implementation of the "consume a character reference" algorithm.
1787     $self->{prev_state} = $self->{state};
1788     $self->{entity_add} = 0x0022; # "
1789     $self->{state} = ENTITY_STATE;
1790    
1791     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1792     $self->{line_prev} = $self->{line};
1793     $self->{column_prev} = $self->{column};
1794     $self->{column}++;
1795     $self->{nc}
1796     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1797     } else {
1798     $self->{set_nc}->($self);
1799     }
1800    
1801     redo A;
1802     } elsif ($self->{nc} == -1) {
1803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1804     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1805    
1806     $self->{last_stag_name} = $self->{ct}->{tag_name};
1807 wakaba 1.15
1808     $self->{state} = DATA_STATE;
1809     $self->{s_kwd} = '';
1810     ## reconsume
1811     return ($self->{ct}); # start tag
1812     redo A;
1813 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1814     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1815     if ($self->{ct}->{attributes}) {
1816    
1817     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1818     } else {
1819     ## NOTE: This state should never be reached.
1820    
1821     }
1822 wakaba 1.15
1823     $self->{state} = DATA_STATE;
1824     $self->{s_kwd} = '';
1825     ## reconsume
1826     return ($self->{ct}); # end tag
1827     redo A;
1828     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1829     ## XML5: No parse error above; not defined yet.
1830     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1831     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1832     ## Reconsume.
1833     return ($self->{ct}); # ATTLIST
1834     redo A;
1835 wakaba 1.1 } else {
1836     die "$0: $self->{ct}->{type}: Unknown token type";
1837     }
1838     } else {
1839 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1840 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1841    
1842     ## XML5: Not a parse error.
1843     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1844     } else {
1845    
1846     }
1847 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1848     $self->{read_until}->($self->{ca}->{value},
1849 wakaba 1.11 q["&<],
1850 wakaba 1.1 length $self->{ca}->{value});
1851    
1852     ## Stay in the state
1853    
1854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1855     $self->{line_prev} = $self->{line};
1856     $self->{column_prev} = $self->{column};
1857     $self->{column}++;
1858     $self->{nc}
1859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1860     } else {
1861     $self->{set_nc}->($self);
1862     }
1863    
1864     redo A;
1865     }
1866     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1867 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1868     ## ATTLIST attribute value single quoted state".
1869 wakaba 1.11
1870 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1871 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1872    
1873     ## XML5: "DOCTYPE ATTLIST name after state".
1874     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1875     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1876     } else {
1877    
1878     ## XML5: "Before attribute name state" (sic).
1879     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1880     }
1881 wakaba 1.1
1882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1883     $self->{line_prev} = $self->{line};
1884     $self->{column_prev} = $self->{column};
1885     $self->{column}++;
1886     $self->{nc}
1887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1888     } else {
1889     $self->{set_nc}->($self);
1890     }
1891    
1892     redo A;
1893     } elsif ($self->{nc} == 0x0026) { # &
1894    
1895 wakaba 1.11 ## XML5: Not defined yet.
1896    
1897 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1898     ## "entity in attribute value state". In this implementation, the
1899     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1900     ## implementation of the "consume a character reference" algorithm.
1901     $self->{entity_add} = 0x0027; # '
1902     $self->{prev_state} = $self->{state};
1903     $self->{state} = ENTITY_STATE;
1904    
1905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1906     $self->{line_prev} = $self->{line};
1907     $self->{column_prev} = $self->{column};
1908     $self->{column}++;
1909     $self->{nc}
1910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1911     } else {
1912     $self->{set_nc}->($self);
1913     }
1914    
1915     redo A;
1916     } elsif ($self->{nc} == -1) {
1917     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1918     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1919    
1920     $self->{last_stag_name} = $self->{ct}->{tag_name};
1921 wakaba 1.15
1922     $self->{state} = DATA_STATE;
1923     $self->{s_kwd} = '';
1924     ## reconsume
1925     return ($self->{ct}); # start tag
1926     redo A;
1927 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1928     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1929     if ($self->{ct}->{attributes}) {
1930    
1931     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1932     } else {
1933     ## NOTE: This state should never be reached.
1934    
1935     }
1936 wakaba 1.15
1937     $self->{state} = DATA_STATE;
1938     $self->{s_kwd} = '';
1939     ## reconsume
1940     return ($self->{ct}); # end tag
1941     redo A;
1942     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1943     ## XML5: No parse error above; not defined yet.
1944     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1945     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1946     ## Reconsume.
1947     return ($self->{ct}); # ATTLIST
1948     redo A;
1949 wakaba 1.1 } else {
1950     die "$0: $self->{ct}->{type}: Unknown token type";
1951     }
1952     } else {
1953 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1954 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1955    
1956     ## XML5: Not a parse error.
1957     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1958     } else {
1959    
1960     }
1961 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1962     $self->{read_until}->($self->{ca}->{value},
1963 wakaba 1.11 q['&<],
1964 wakaba 1.1 length $self->{ca}->{value});
1965    
1966     ## Stay in the state
1967    
1968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1969     $self->{line_prev} = $self->{line};
1970     $self->{column_prev} = $self->{column};
1971     $self->{column}++;
1972     $self->{nc}
1973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1974     } else {
1975     $self->{set_nc}->($self);
1976     }
1977    
1978     redo A;
1979     }
1980     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1981 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1982    
1983 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1984 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1985    
1986     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1987     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1988     } else {
1989    
1990     ## XML5: "Tag attribute name before state".
1991     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1992     }
1993 wakaba 1.1
1994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1995     $self->{line_prev} = $self->{line};
1996     $self->{column_prev} = $self->{column};
1997     $self->{column}++;
1998     $self->{nc}
1999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2000     } else {
2001     $self->{set_nc}->($self);
2002     }
2003    
2004     redo A;
2005     } elsif ($self->{nc} == 0x0026) { # &
2006    
2007 wakaba 1.11
2008     ## XML5: Not defined yet.
2009    
2010 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2011     ## "entity in attribute value state". In this implementation, the
2012     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2013     ## implementation of the "consume a character reference" algorithm.
2014     $self->{entity_add} = -1;
2015     $self->{prev_state} = $self->{state};
2016     $self->{state} = ENTITY_STATE;
2017    
2018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2019     $self->{line_prev} = $self->{line};
2020     $self->{column_prev} = $self->{column};
2021     $self->{column}++;
2022     $self->{nc}
2023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2024     } else {
2025     $self->{set_nc}->($self);
2026     }
2027    
2028     redo A;
2029     } elsif ($self->{nc} == 0x003E) { # >
2030     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2031    
2032     $self->{last_stag_name} = $self->{ct}->{tag_name};
2033 wakaba 1.15
2034     $self->{state} = DATA_STATE;
2035     $self->{s_kwd} = '';
2036    
2037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2038     $self->{line_prev} = $self->{line};
2039     $self->{column_prev} = $self->{column};
2040     $self->{column}++;
2041     $self->{nc}
2042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2043     } else {
2044     $self->{set_nc}->($self);
2045     }
2046    
2047     return ($self->{ct}); # start tag
2048     redo A;
2049 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2050     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2051     if ($self->{ct}->{attributes}) {
2052    
2053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2054     } else {
2055     ## NOTE: This state should never be reached.
2056    
2057     }
2058 wakaba 1.15
2059     $self->{state} = DATA_STATE;
2060     $self->{s_kwd} = '';
2061    
2062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2063     $self->{line_prev} = $self->{line};
2064     $self->{column_prev} = $self->{column};
2065     $self->{column}++;
2066     $self->{nc}
2067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2068     } else {
2069     $self->{set_nc}->($self);
2070     }
2071    
2072     return ($self->{ct}); # end tag
2073     redo A;
2074     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2075     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2076     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2077    
2078 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2079     $self->{line_prev} = $self->{line};
2080     $self->{column_prev} = $self->{column};
2081     $self->{column}++;
2082     $self->{nc}
2083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2084     } else {
2085     $self->{set_nc}->($self);
2086     }
2087    
2088 wakaba 1.15 return ($self->{ct}); # ATTLIST
2089     redo A;
2090     } else {
2091     die "$0: $self->{ct}->{type}: Unknown token type";
2092     }
2093 wakaba 1.1 } elsif ($self->{nc} == -1) {
2094     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2095    
2096 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2097 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2098 wakaba 1.15
2099     $self->{state} = DATA_STATE;
2100     $self->{s_kwd} = '';
2101     ## reconsume
2102     return ($self->{ct}); # start tag
2103     redo A;
2104 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2105 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2106 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2107     if ($self->{ct}->{attributes}) {
2108    
2109     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2110     } else {
2111     ## NOTE: This state should never be reached.
2112    
2113     }
2114 wakaba 1.15
2115     $self->{state} = DATA_STATE;
2116     $self->{s_kwd} = '';
2117     ## reconsume
2118     return ($self->{ct}); # end tag
2119     redo A;
2120     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2121     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2122     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2123     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2124     ## Reconsume.
2125     return ($self->{ct}); # ATTLIST
2126     redo A;
2127 wakaba 1.1 } else {
2128     die "$0: $self->{ct}->{type}: Unknown token type";
2129     }
2130     } else {
2131     if ({
2132     0x0022 => 1, # "
2133     0x0027 => 1, # '
2134     0x003D => 1, # =
2135     }->{$self->{nc}}) {
2136    
2137 wakaba 1.11 ## XML5: Not a parse error.
2138 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2139     } else {
2140    
2141     }
2142     $self->{ca}->{value} .= chr ($self->{nc});
2143     $self->{read_until}->($self->{ca}->{value},
2144     q["'=& >],
2145     length $self->{ca}->{value});
2146    
2147     ## Stay in the state
2148    
2149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2150     $self->{line_prev} = $self->{line};
2151     $self->{column_prev} = $self->{column};
2152     $self->{column}++;
2153     $self->{nc}
2154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2155     } else {
2156     $self->{set_nc}->($self);
2157     }
2158    
2159     redo A;
2160     }
2161     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2162     if ($is_space->{$self->{nc}}) {
2163    
2164     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2165    
2166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167     $self->{line_prev} = $self->{line};
2168     $self->{column_prev} = $self->{column};
2169     $self->{column}++;
2170     $self->{nc}
2171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172     } else {
2173     $self->{set_nc}->($self);
2174     }
2175    
2176     redo A;
2177     } elsif ($self->{nc} == 0x003E) { # >
2178     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2179    
2180     $self->{last_stag_name} = $self->{ct}->{tag_name};
2181     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2182     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2183     if ($self->{ct}->{attributes}) {
2184    
2185     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2186     } else {
2187     ## NOTE: This state should never be reached.
2188    
2189     }
2190     } else {
2191     die "$0: $self->{ct}->{type}: Unknown token type";
2192     }
2193     $self->{state} = DATA_STATE;
2194 wakaba 1.5 $self->{s_kwd} = '';
2195 wakaba 1.1
2196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2197     $self->{line_prev} = $self->{line};
2198     $self->{column_prev} = $self->{column};
2199     $self->{column}++;
2200     $self->{nc}
2201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2202     } else {
2203     $self->{set_nc}->($self);
2204     }
2205    
2206    
2207     return ($self->{ct}); # start tag or end tag
2208    
2209     redo A;
2210     } elsif ($self->{nc} == 0x002F) { # /
2211    
2212     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2213    
2214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2215     $self->{line_prev} = $self->{line};
2216     $self->{column_prev} = $self->{column};
2217     $self->{column}++;
2218     $self->{nc}
2219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2220     } else {
2221     $self->{set_nc}->($self);
2222     }
2223    
2224     redo A;
2225     } elsif ($self->{nc} == -1) {
2226     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2227     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2228    
2229     $self->{last_stag_name} = $self->{ct}->{tag_name};
2230     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2231     if ($self->{ct}->{attributes}) {
2232    
2233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2234     } else {
2235     ## NOTE: This state should never be reached.
2236    
2237     }
2238     } else {
2239     die "$0: $self->{ct}->{type}: Unknown token type";
2240     }
2241     $self->{state} = DATA_STATE;
2242 wakaba 1.5 $self->{s_kwd} = '';
2243 wakaba 1.1 ## Reconsume.
2244     return ($self->{ct}); # start tag or end tag
2245     redo A;
2246     } else {
2247    
2248     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2249     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2250     ## reconsume
2251     redo A;
2252     }
2253     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2254 wakaba 1.11 ## XML5: "Empty tag state".
2255    
2256 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2257     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2258    
2259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2260     ## TODO: Different type than slash in start tag
2261     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2262     if ($self->{ct}->{attributes}) {
2263    
2264     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2265     } else {
2266    
2267     }
2268     ## TODO: Test |<title></title/>|
2269     } else {
2270    
2271     $self->{self_closing} = 1;
2272     }
2273    
2274     $self->{state} = DATA_STATE;
2275 wakaba 1.5 $self->{s_kwd} = '';
2276 wakaba 1.1
2277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278     $self->{line_prev} = $self->{line};
2279     $self->{column_prev} = $self->{column};
2280     $self->{column}++;
2281     $self->{nc}
2282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2283     } else {
2284     $self->{set_nc}->($self);
2285     }
2286    
2287    
2288     return ($self->{ct}); # start tag or end tag
2289    
2290     redo A;
2291     } elsif ($self->{nc} == -1) {
2292     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2293     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2294    
2295     $self->{last_stag_name} = $self->{ct}->{tag_name};
2296     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2297     if ($self->{ct}->{attributes}) {
2298    
2299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2300     } else {
2301     ## NOTE: This state should never be reached.
2302    
2303     }
2304     } else {
2305     die "$0: $self->{ct}->{type}: Unknown token type";
2306     }
2307 wakaba 1.11 ## XML5: "Tag attribute name before state".
2308 wakaba 1.1 $self->{state} = DATA_STATE;
2309 wakaba 1.5 $self->{s_kwd} = '';
2310 wakaba 1.1 ## Reconsume.
2311     return ($self->{ct}); # start tag or end tag
2312     redo A;
2313     } else {
2314    
2315     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2316     ## TODO: This error type is wrong.
2317     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2318     ## Reconsume.
2319     redo A;
2320     }
2321     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2322 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2323    
2324 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2325     ## consumes characters on a one-by-one basis.
2326    
2327     if ($self->{nc} == 0x003E) { # >
2328 wakaba 1.13 if ($self->{in_subset}) {
2329    
2330     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2331     } else {
2332    
2333     $self->{state} = DATA_STATE;
2334     $self->{s_kwd} = '';
2335     }
2336 wakaba 1.1
2337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2338     $self->{line_prev} = $self->{line};
2339     $self->{column_prev} = $self->{column};
2340     $self->{column}++;
2341     $self->{nc}
2342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2343     } else {
2344     $self->{set_nc}->($self);
2345     }
2346    
2347    
2348     return ($self->{ct}); # comment
2349     redo A;
2350     } elsif ($self->{nc} == -1) {
2351 wakaba 1.13 if ($self->{in_subset}) {
2352    
2353     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2354     } else {
2355    
2356     $self->{state} = DATA_STATE;
2357     $self->{s_kwd} = '';
2358     }
2359 wakaba 1.1 ## reconsume
2360    
2361     return ($self->{ct}); # comment
2362     redo A;
2363     } else {
2364    
2365     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2366     $self->{read_until}->($self->{ct}->{data},
2367     q[>],
2368     length $self->{ct}->{data});
2369    
2370     ## Stay in the state.
2371    
2372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2373     $self->{line_prev} = $self->{line};
2374     $self->{column_prev} = $self->{column};
2375     $self->{column}++;
2376     $self->{nc}
2377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2378     } else {
2379     $self->{set_nc}->($self);
2380     }
2381    
2382     redo A;
2383     }
2384     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2385 wakaba 1.14 ## XML5: "Markup declaration state".
2386 wakaba 1.1
2387     if ($self->{nc} == 0x002D) { # -
2388    
2389     $self->{state} = MD_HYPHEN_STATE;
2390    
2391     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2392     $self->{line_prev} = $self->{line};
2393     $self->{column_prev} = $self->{column};
2394     $self->{column}++;
2395     $self->{nc}
2396     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2397     } else {
2398     $self->{set_nc}->($self);
2399     }
2400    
2401     redo A;
2402     } elsif ($self->{nc} == 0x0044 or # D
2403     $self->{nc} == 0x0064) { # d
2404     ## ASCII case-insensitive.
2405    
2406     $self->{state} = MD_DOCTYPE_STATE;
2407 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2408 wakaba 1.1
2409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410     $self->{line_prev} = $self->{line};
2411     $self->{column_prev} = $self->{column};
2412     $self->{column}++;
2413     $self->{nc}
2414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415     } else {
2416     $self->{set_nc}->($self);
2417     }
2418    
2419     redo A;
2420 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2421     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2422     $self->{is_xml}) and
2423 wakaba 1.1 $self->{nc} == 0x005B) { # [
2424    
2425     $self->{state} = MD_CDATA_STATE;
2426 wakaba 1.12 $self->{kwd} = '[';
2427 wakaba 1.1
2428     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2429     $self->{line_prev} = $self->{line};
2430     $self->{column_prev} = $self->{column};
2431     $self->{column}++;
2432     $self->{nc}
2433     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2434     } else {
2435     $self->{set_nc}->($self);
2436     }
2437    
2438     redo A;
2439     } else {
2440    
2441     }
2442    
2443     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2444     line => $self->{line_prev},
2445     column => $self->{column_prev} - 1);
2446     ## Reconsume.
2447     $self->{state} = BOGUS_COMMENT_STATE;
2448     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2449     line => $self->{line_prev},
2450     column => $self->{column_prev} - 1,
2451     };
2452     redo A;
2453     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2454     if ($self->{nc} == 0x002D) { # -
2455    
2456     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2457     line => $self->{line_prev},
2458     column => $self->{column_prev} - 2,
2459     };
2460 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2461 wakaba 1.1
2462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2463     $self->{line_prev} = $self->{line};
2464     $self->{column_prev} = $self->{column};
2465     $self->{column}++;
2466     $self->{nc}
2467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2468     } else {
2469     $self->{set_nc}->($self);
2470     }
2471    
2472     redo A;
2473     } else {
2474    
2475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2476     line => $self->{line_prev},
2477     column => $self->{column_prev} - 2);
2478     $self->{state} = BOGUS_COMMENT_STATE;
2479     ## Reconsume.
2480     $self->{ct} = {type => COMMENT_TOKEN,
2481     data => '-',
2482     line => $self->{line_prev},
2483     column => $self->{column_prev} - 2,
2484     };
2485     redo A;
2486     }
2487     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2488     ## ASCII case-insensitive.
2489     if ($self->{nc} == [
2490     undef,
2491     0x004F, # O
2492     0x0043, # C
2493     0x0054, # T
2494     0x0059, # Y
2495     0x0050, # P
2496 wakaba 1.12 ]->[length $self->{kwd}] or
2497 wakaba 1.1 $self->{nc} == [
2498     undef,
2499     0x006F, # o
2500     0x0063, # c
2501     0x0074, # t
2502     0x0079, # y
2503     0x0070, # p
2504 wakaba 1.12 ]->[length $self->{kwd}]) {
2505 wakaba 1.1
2506     ## Stay in the state.
2507 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2508 wakaba 1.1
2509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2510     $self->{line_prev} = $self->{line};
2511     $self->{column_prev} = $self->{column};
2512     $self->{column}++;
2513     $self->{nc}
2514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2515     } else {
2516     $self->{set_nc}->($self);
2517     }
2518    
2519     redo A;
2520 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2521 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2522     $self->{nc} == 0x0065)) { # e
2523 wakaba 1.12 if ($self->{is_xml} and
2524     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2525 wakaba 1.10
2526     ## XML5: case-sensitive.
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2528     text => 'DOCTYPE',
2529     line => $self->{line_prev},
2530     column => $self->{column_prev} - 5);
2531     } else {
2532    
2533     }
2534 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2535     $self->{ct} = {type => DOCTYPE_TOKEN,
2536     quirks => 1,
2537     line => $self->{line_prev},
2538     column => $self->{column_prev} - 7,
2539     };
2540    
2541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2542     $self->{line_prev} = $self->{line};
2543     $self->{column_prev} = $self->{column};
2544     $self->{column}++;
2545     $self->{nc}
2546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2547     } else {
2548     $self->{set_nc}->($self);
2549     }
2550    
2551     redo A;
2552     } else {
2553    
2554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2555     line => $self->{line_prev},
2556 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2557 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2558     ## Reconsume.
2559     $self->{ct} = {type => COMMENT_TOKEN,
2560 wakaba 1.12 data => $self->{kwd},
2561 wakaba 1.1 line => $self->{line_prev},
2562 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2563 wakaba 1.1 };
2564     redo A;
2565     }
2566     } elsif ($self->{state} == MD_CDATA_STATE) {
2567     if ($self->{nc} == {
2568     '[' => 0x0043, # C
2569     '[C' => 0x0044, # D
2570     '[CD' => 0x0041, # A
2571     '[CDA' => 0x0054, # T
2572     '[CDAT' => 0x0041, # A
2573 wakaba 1.12 }->{$self->{kwd}}) {
2574 wakaba 1.1
2575     ## Stay in the state.
2576 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2577 wakaba 1.1
2578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2579     $self->{line_prev} = $self->{line};
2580     $self->{column_prev} = $self->{column};
2581     $self->{column}++;
2582     $self->{nc}
2583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2584     } else {
2585     $self->{set_nc}->($self);
2586     }
2587    
2588     redo A;
2589 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2590 wakaba 1.1 $self->{nc} == 0x005B) { # [
2591 wakaba 1.6 if ($self->{is_xml} and
2592     not $self->{tainted} and
2593     @{$self->{open_elements} or []} == 0) {
2594 wakaba 1.8
2595 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2596     line => $self->{line_prev},
2597     column => $self->{column_prev} - 7);
2598     $self->{tainted} = 1;
2599 wakaba 1.8 } else {
2600    
2601 wakaba 1.6 }
2602    
2603 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2604     data => '',
2605     line => $self->{line_prev},
2606     column => $self->{column_prev} - 7};
2607     $self->{state} = CDATA_SECTION_STATE;
2608    
2609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2610     $self->{line_prev} = $self->{line};
2611     $self->{column_prev} = $self->{column};
2612     $self->{column}++;
2613     $self->{nc}
2614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2615     } else {
2616     $self->{set_nc}->($self);
2617     }
2618    
2619     redo A;
2620     } else {
2621    
2622     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2623     line => $self->{line_prev},
2624 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2625 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2626     ## Reconsume.
2627     $self->{ct} = {type => COMMENT_TOKEN,
2628 wakaba 1.12 data => $self->{kwd},
2629 wakaba 1.1 line => $self->{line_prev},
2630 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2631 wakaba 1.1 };
2632     redo A;
2633     }
2634     } elsif ($self->{state} == COMMENT_START_STATE) {
2635     if ($self->{nc} == 0x002D) { # -
2636    
2637     $self->{state} = COMMENT_START_DASH_STATE;
2638    
2639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2640     $self->{line_prev} = $self->{line};
2641     $self->{column_prev} = $self->{column};
2642     $self->{column}++;
2643     $self->{nc}
2644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2645     } else {
2646     $self->{set_nc}->($self);
2647     }
2648    
2649     redo A;
2650     } elsif ($self->{nc} == 0x003E) { # >
2651     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2652 wakaba 1.13 if ($self->{in_subset}) {
2653    
2654     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2655     } else {
2656    
2657     $self->{state} = DATA_STATE;
2658     $self->{s_kwd} = '';
2659     }
2660 wakaba 1.1
2661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662     $self->{line_prev} = $self->{line};
2663     $self->{column_prev} = $self->{column};
2664     $self->{column}++;
2665     $self->{nc}
2666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667     } else {
2668     $self->{set_nc}->($self);
2669     }
2670    
2671    
2672     return ($self->{ct}); # comment
2673    
2674     redo A;
2675     } elsif ($self->{nc} == -1) {
2676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2677 wakaba 1.13 if ($self->{in_subset}) {
2678    
2679     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2680     } else {
2681    
2682     $self->{state} = DATA_STATE;
2683     $self->{s_kwd} = '';
2684     }
2685 wakaba 1.1 ## reconsume
2686    
2687     return ($self->{ct}); # comment
2688    
2689     redo A;
2690     } else {
2691    
2692     $self->{ct}->{data} # comment
2693     .= chr ($self->{nc});
2694     $self->{state} = COMMENT_STATE;
2695    
2696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2697     $self->{line_prev} = $self->{line};
2698     $self->{column_prev} = $self->{column};
2699     $self->{column}++;
2700     $self->{nc}
2701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2702     } else {
2703     $self->{set_nc}->($self);
2704     }
2705    
2706     redo A;
2707     }
2708     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2709     if ($self->{nc} == 0x002D) { # -
2710    
2711     $self->{state} = COMMENT_END_STATE;
2712    
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723     redo A;
2724     } elsif ($self->{nc} == 0x003E) { # >
2725     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2726 wakaba 1.13 if ($self->{in_subset}) {
2727    
2728     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2729     } else {
2730    
2731     $self->{state} = DATA_STATE;
2732     $self->{s_kwd} = '';
2733     }
2734 wakaba 1.1
2735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2736     $self->{line_prev} = $self->{line};
2737     $self->{column_prev} = $self->{column};
2738     $self->{column}++;
2739     $self->{nc}
2740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2741     } else {
2742     $self->{set_nc}->($self);
2743     }
2744    
2745    
2746     return ($self->{ct}); # comment
2747    
2748     redo A;
2749     } elsif ($self->{nc} == -1) {
2750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2751 wakaba 1.13 if ($self->{in_subset}) {
2752    
2753     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2754     } else {
2755    
2756     $self->{state} = DATA_STATE;
2757     $self->{s_kwd} = '';
2758     }
2759 wakaba 1.1 ## reconsume
2760    
2761     return ($self->{ct}); # comment
2762    
2763     redo A;
2764     } else {
2765    
2766     $self->{ct}->{data} # comment
2767     .= '-' . chr ($self->{nc});
2768     $self->{state} = COMMENT_STATE;
2769    
2770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2771     $self->{line_prev} = $self->{line};
2772     $self->{column_prev} = $self->{column};
2773     $self->{column}++;
2774     $self->{nc}
2775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2776     } else {
2777     $self->{set_nc}->($self);
2778     }
2779    
2780     redo A;
2781     }
2782     } elsif ($self->{state} == COMMENT_STATE) {
2783 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2784    
2785 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2786    
2787     $self->{state} = COMMENT_END_DASH_STATE;
2788    
2789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2790     $self->{line_prev} = $self->{line};
2791     $self->{column_prev} = $self->{column};
2792     $self->{column}++;
2793     $self->{nc}
2794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2795     } else {
2796     $self->{set_nc}->($self);
2797     }
2798    
2799     redo A;
2800     } elsif ($self->{nc} == -1) {
2801     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2802 wakaba 1.13 if ($self->{in_subset}) {
2803    
2804     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2805     } else {
2806    
2807     $self->{state} = DATA_STATE;
2808     $self->{s_kwd} = '';
2809     }
2810 wakaba 1.1 ## reconsume
2811    
2812     return ($self->{ct}); # comment
2813    
2814     redo A;
2815     } else {
2816    
2817     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2818     $self->{read_until}->($self->{ct}->{data},
2819     q[-],
2820     length $self->{ct}->{data});
2821    
2822     ## Stay in the state
2823    
2824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2825     $self->{line_prev} = $self->{line};
2826     $self->{column_prev} = $self->{column};
2827     $self->{column}++;
2828     $self->{nc}
2829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2830     } else {
2831     $self->{set_nc}->($self);
2832     }
2833    
2834     redo A;
2835     }
2836     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2837 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2838 wakaba 1.10
2839 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2840    
2841     $self->{state} = COMMENT_END_STATE;
2842    
2843     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2844     $self->{line_prev} = $self->{line};
2845     $self->{column_prev} = $self->{column};
2846     $self->{column}++;
2847     $self->{nc}
2848     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2849     } else {
2850     $self->{set_nc}->($self);
2851     }
2852    
2853     redo A;
2854     } elsif ($self->{nc} == -1) {
2855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2856 wakaba 1.13 if ($self->{in_subset}) {
2857    
2858     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2859     } else {
2860    
2861     $self->{state} = DATA_STATE;
2862     $self->{s_kwd} = '';
2863     }
2864 wakaba 1.1 ## reconsume
2865    
2866     return ($self->{ct}); # comment
2867    
2868     redo A;
2869     } else {
2870    
2871     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2872     $self->{state} = COMMENT_STATE;
2873    
2874     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2875     $self->{line_prev} = $self->{line};
2876     $self->{column_prev} = $self->{column};
2877     $self->{column}++;
2878     $self->{nc}
2879     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2880     } else {
2881     $self->{set_nc}->($self);
2882     }
2883    
2884     redo A;
2885     }
2886     } elsif ($self->{state} == COMMENT_END_STATE) {
2887 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2888    
2889 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2890 wakaba 1.13 if ($self->{in_subset}) {
2891    
2892     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2893     } else {
2894    
2895     $self->{state} = DATA_STATE;
2896     $self->{s_kwd} = '';
2897     }
2898 wakaba 1.1
2899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2900     $self->{line_prev} = $self->{line};
2901     $self->{column_prev} = $self->{column};
2902     $self->{column}++;
2903     $self->{nc}
2904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2905     } else {
2906     $self->{set_nc}->($self);
2907     }
2908    
2909    
2910     return ($self->{ct}); # comment
2911    
2912     redo A;
2913     } elsif ($self->{nc} == 0x002D) { # -
2914    
2915 wakaba 1.10 ## XML5: Not a parse error.
2916 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2917     line => $self->{line_prev},
2918     column => $self->{column_prev});
2919     $self->{ct}->{data} .= '-'; # comment
2920     ## Stay in the state
2921    
2922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2923     $self->{line_prev} = $self->{line};
2924     $self->{column_prev} = $self->{column};
2925     $self->{column}++;
2926     $self->{nc}
2927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2928     } else {
2929     $self->{set_nc}->($self);
2930     }
2931    
2932     redo A;
2933     } elsif ($self->{nc} == -1) {
2934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2935 wakaba 1.13 if ($self->{in_subset}) {
2936    
2937     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2938     } else {
2939    
2940     $self->{state} = DATA_STATE;
2941     $self->{s_kwd} = '';
2942     }
2943 wakaba 1.1 ## reconsume
2944    
2945     return ($self->{ct}); # comment
2946    
2947     redo A;
2948     } else {
2949    
2950 wakaba 1.10 ## XML5: Not a parse error.
2951 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2952     line => $self->{line_prev},
2953     column => $self->{column_prev});
2954     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2955     $self->{state} = COMMENT_STATE;
2956    
2957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2958     $self->{line_prev} = $self->{line};
2959     $self->{column_prev} = $self->{column};
2960     $self->{column}++;
2961     $self->{nc}
2962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2963     } else {
2964     $self->{set_nc}->($self);
2965     }
2966    
2967     redo A;
2968     }
2969     } elsif ($self->{state} == DOCTYPE_STATE) {
2970     if ($is_space->{$self->{nc}}) {
2971    
2972     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     } else {
2986    
2987 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2988 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2989     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990     ## reconsume
2991     redo A;
2992     }
2993     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2994 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2995    
2996 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2997    
2998     ## Stay in the state
2999    
3000     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3001     $self->{line_prev} = $self->{line};
3002     $self->{column_prev} = $self->{column};
3003     $self->{column}++;
3004     $self->{nc}
3005     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3006     } else {
3007     $self->{set_nc}->($self);
3008     }
3009    
3010     redo A;
3011     } elsif ($self->{nc} == 0x003E) { # >
3012    
3013 wakaba 1.12 ## XML5: No parse error.
3014 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3015     $self->{state} = DATA_STATE;
3016 wakaba 1.5 $self->{s_kwd} = '';
3017 wakaba 1.1
3018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3019     $self->{line_prev} = $self->{line};
3020     $self->{column_prev} = $self->{column};
3021     $self->{column}++;
3022     $self->{nc}
3023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3024     } else {
3025     $self->{set_nc}->($self);
3026     }
3027    
3028    
3029     return ($self->{ct}); # DOCTYPE (quirks)
3030    
3031     redo A;
3032     } elsif ($self->{nc} == -1) {
3033    
3034     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3035     $self->{state} = DATA_STATE;
3036 wakaba 1.5 $self->{s_kwd} = '';
3037 wakaba 1.1 ## reconsume
3038    
3039     return ($self->{ct}); # DOCTYPE (quirks)
3040    
3041     redo A;
3042 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3043    
3044     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3045     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3046 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3047     $self->{in_subset} = 1;
3048 wakaba 1.12
3049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3050     $self->{line_prev} = $self->{line};
3051     $self->{column_prev} = $self->{column};
3052     $self->{column}++;
3053     $self->{nc}
3054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3055     } else {
3056     $self->{set_nc}->($self);
3057     }
3058    
3059 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3060 wakaba 1.12 redo A;
3061 wakaba 1.1 } else {
3062    
3063     $self->{ct}->{name} = chr $self->{nc};
3064     delete $self->{ct}->{quirks};
3065     $self->{state} = DOCTYPE_NAME_STATE;
3066    
3067     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3068     $self->{line_prev} = $self->{line};
3069     $self->{column_prev} = $self->{column};
3070     $self->{column}++;
3071     $self->{nc}
3072     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3073     } else {
3074     $self->{set_nc}->($self);
3075     }
3076    
3077     redo A;
3078     }
3079     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3080 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3081    
3082     ## ISSUE: Redundant "First," in the spec.
3083    
3084 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3085    
3086     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3087    
3088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3089     $self->{line_prev} = $self->{line};
3090     $self->{column_prev} = $self->{column};
3091     $self->{column}++;
3092     $self->{nc}
3093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3094     } else {
3095     $self->{set_nc}->($self);
3096     }
3097    
3098     redo A;
3099     } elsif ($self->{nc} == 0x003E) { # >
3100    
3101     $self->{state} = DATA_STATE;
3102 wakaba 1.5 $self->{s_kwd} = '';
3103 wakaba 1.1
3104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105     $self->{line_prev} = $self->{line};
3106     $self->{column_prev} = $self->{column};
3107     $self->{column}++;
3108     $self->{nc}
3109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110     } else {
3111     $self->{set_nc}->($self);
3112     }
3113    
3114    
3115     return ($self->{ct}); # DOCTYPE
3116    
3117     redo A;
3118     } elsif ($self->{nc} == -1) {
3119    
3120     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3121     $self->{state} = DATA_STATE;
3122 wakaba 1.5 $self->{s_kwd} = '';
3123 wakaba 1.1 ## reconsume
3124    
3125     $self->{ct}->{quirks} = 1;
3126     return ($self->{ct}); # DOCTYPE
3127    
3128     redo A;
3129 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3130    
3131     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3132 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3133     $self->{in_subset} = 1;
3134 wakaba 1.12
3135     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3136     $self->{line_prev} = $self->{line};
3137     $self->{column_prev} = $self->{column};
3138     $self->{column}++;
3139     $self->{nc}
3140     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3141     } else {
3142     $self->{set_nc}->($self);
3143     }
3144    
3145 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3146 wakaba 1.12 redo A;
3147 wakaba 1.1 } else {
3148    
3149     $self->{ct}->{name}
3150     .= chr ($self->{nc}); # DOCTYPE
3151     ## Stay in the state
3152    
3153     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3154     $self->{line_prev} = $self->{line};
3155     $self->{column_prev} = $self->{column};
3156     $self->{column}++;
3157     $self->{nc}
3158     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3159     } else {
3160     $self->{set_nc}->($self);
3161     }
3162    
3163     redo A;
3164     }
3165     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3166 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3167     ## state", but implemented differently.
3168    
3169 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3170    
3171     ## Stay in the state
3172    
3173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3174     $self->{line_prev} = $self->{line};
3175     $self->{column_prev} = $self->{column};
3176     $self->{column}++;
3177     $self->{nc}
3178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3179     } else {
3180     $self->{set_nc}->($self);
3181     }
3182    
3183     redo A;
3184     } elsif ($self->{nc} == 0x003E) { # >
3185    
3186     $self->{state} = DATA_STATE;
3187 wakaba 1.5 $self->{s_kwd} = '';
3188 wakaba 1.1
3189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3190     $self->{line_prev} = $self->{line};
3191     $self->{column_prev} = $self->{column};
3192     $self->{column}++;
3193     $self->{nc}
3194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3195     } else {
3196     $self->{set_nc}->($self);
3197     }
3198    
3199    
3200     return ($self->{ct}); # DOCTYPE
3201    
3202     redo A;
3203     } elsif ($self->{nc} == -1) {
3204    
3205     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3206     $self->{state} = DATA_STATE;
3207 wakaba 1.5 $self->{s_kwd} = '';
3208 wakaba 1.1 ## reconsume
3209    
3210     $self->{ct}->{quirks} = 1;
3211     return ($self->{ct}); # DOCTYPE
3212    
3213     redo A;
3214     } elsif ($self->{nc} == 0x0050 or # P
3215     $self->{nc} == 0x0070) { # p
3216 wakaba 1.12
3217 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3218 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3219 wakaba 1.1
3220     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3221     $self->{line_prev} = $self->{line};
3222     $self->{column_prev} = $self->{column};
3223     $self->{column}++;
3224     $self->{nc}
3225     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3226     } else {
3227     $self->{set_nc}->($self);
3228     }
3229    
3230     redo A;
3231     } elsif ($self->{nc} == 0x0053 or # S
3232     $self->{nc} == 0x0073) { # s
3233 wakaba 1.12
3234 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3235 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3236    
3237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238     $self->{line_prev} = $self->{line};
3239     $self->{column_prev} = $self->{column};
3240     $self->{column}++;
3241     $self->{nc}
3242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243     } else {
3244     $self->{set_nc}->($self);
3245     }
3246    
3247     redo A;
3248     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3249    
3250     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3251     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3252 wakaba 1.13 $self->{in_subset} = 1;
3253 wakaba 1.1
3254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3255     $self->{line_prev} = $self->{line};
3256     $self->{column_prev} = $self->{column};
3257     $self->{column}++;
3258     $self->{nc}
3259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3260     } else {
3261     $self->{set_nc}->($self);
3262     }
3263    
3264 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3265 wakaba 1.1 redo A;
3266     } else {
3267    
3268     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
3269     $self->{ct}->{quirks} = 1;
3270    
3271     $self->{state} = BOGUS_DOCTYPE_STATE;
3272    
3273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3274     $self->{line_prev} = $self->{line};
3275     $self->{column_prev} = $self->{column};
3276     $self->{column}++;
3277     $self->{nc}
3278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3279     } else {
3280     $self->{set_nc}->($self);
3281     }
3282    
3283     redo A;
3284     }
3285     } elsif ($self->{state} == PUBLIC_STATE) {
3286     ## ASCII case-insensitive
3287     if ($self->{nc} == [
3288     undef,
3289     0x0055, # U
3290     0x0042, # B
3291     0x004C, # L
3292     0x0049, # I
3293 wakaba 1.12 ]->[length $self->{kwd}] or
3294 wakaba 1.1 $self->{nc} == [
3295     undef,
3296     0x0075, # u
3297     0x0062, # b
3298     0x006C, # l
3299     0x0069, # i
3300 wakaba 1.12 ]->[length $self->{kwd}]) {
3301 wakaba 1.1
3302     ## Stay in the state.
3303 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3304 wakaba 1.1
3305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3306     $self->{line_prev} = $self->{line};
3307     $self->{column_prev} = $self->{column};
3308     $self->{column}++;
3309     $self->{nc}
3310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3311     } else {
3312     $self->{set_nc}->($self);
3313     }
3314    
3315     redo A;
3316 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3317 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3318     $self->{nc} == 0x0063)) { # c
3319 wakaba 1.12 if ($self->{is_xml} and
3320     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3321    
3322     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3323     text => 'PUBLIC',
3324     line => $self->{line_prev},
3325     column => $self->{column_prev} - 4);
3326     } else {
3327    
3328     }
3329 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3330    
3331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332     $self->{line_prev} = $self->{line};
3333     $self->{column_prev} = $self->{column};
3334     $self->{column}++;
3335     $self->{nc}
3336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3337     } else {
3338     $self->{set_nc}->($self);
3339     }
3340    
3341     redo A;
3342     } else {
3343    
3344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3345     line => $self->{line_prev},
3346 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3347 wakaba 1.1 $self->{ct}->{quirks} = 1;
3348    
3349     $self->{state} = BOGUS_DOCTYPE_STATE;
3350     ## Reconsume.
3351     redo A;
3352     }
3353     } elsif ($self->{state} == SYSTEM_STATE) {
3354     ## ASCII case-insensitive
3355     if ($self->{nc} == [
3356     undef,
3357     0x0059, # Y
3358     0x0053, # S
3359     0x0054, # T
3360     0x0045, # E
3361 wakaba 1.12 ]->[length $self->{kwd}] or
3362 wakaba 1.1 $self->{nc} == [
3363     undef,
3364     0x0079, # y
3365     0x0073, # s
3366     0x0074, # t
3367     0x0065, # e
3368 wakaba 1.12 ]->[length $self->{kwd}]) {
3369 wakaba 1.1
3370     ## Stay in the state.
3371 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3372 wakaba 1.1
3373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3374     $self->{line_prev} = $self->{line};
3375     $self->{column_prev} = $self->{column};
3376     $self->{column}++;
3377     $self->{nc}
3378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3379     } else {
3380     $self->{set_nc}->($self);
3381     }
3382    
3383     redo A;
3384 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3385 wakaba 1.1 ($self->{nc} == 0x004D or # M
3386     $self->{nc} == 0x006D)) { # m
3387 wakaba 1.12 if ($self->{is_xml} and
3388     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3389    
3390     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3391     text => 'SYSTEM',
3392     line => $self->{line_prev},
3393     column => $self->{column_prev} - 4);
3394     } else {
3395    
3396     }
3397 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3398    
3399     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3400     $self->{line_prev} = $self->{line};
3401     $self->{column_prev} = $self->{column};
3402     $self->{column}++;
3403     $self->{nc}
3404     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3405     } else {
3406     $self->{set_nc}->($self);
3407     }
3408    
3409     redo A;
3410     } else {
3411    
3412     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3413     line => $self->{line_prev},
3414 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3415 wakaba 1.1 $self->{ct}->{quirks} = 1;
3416    
3417     $self->{state} = BOGUS_DOCTYPE_STATE;
3418     ## Reconsume.
3419     redo A;
3420     }
3421     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3422     if ($is_space->{$self->{nc}}) {
3423    
3424     ## Stay in the state
3425    
3426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3427     $self->{line_prev} = $self->{line};
3428     $self->{column_prev} = $self->{column};
3429     $self->{column}++;
3430     $self->{nc}
3431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3432     } else {
3433     $self->{set_nc}->($self);
3434     }
3435    
3436     redo A;
3437     } elsif ($self->{nc} eq 0x0022) { # "
3438    
3439     $self->{ct}->{pubid} = ''; # DOCTYPE
3440     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3441    
3442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3443     $self->{line_prev} = $self->{line};
3444     $self->{column_prev} = $self->{column};
3445     $self->{column}++;
3446     $self->{nc}
3447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3448     } else {
3449     $self->{set_nc}->($self);
3450     }
3451    
3452     redo A;
3453     } elsif ($self->{nc} eq 0x0027) { # '
3454    
3455     $self->{ct}->{pubid} = ''; # DOCTYPE
3456     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3457    
3458     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3459     $self->{line_prev} = $self->{line};
3460     $self->{column_prev} = $self->{column};
3461     $self->{column}++;
3462     $self->{nc}
3463     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3464     } else {
3465     $self->{set_nc}->($self);
3466     }
3467    
3468     redo A;
3469     } elsif ($self->{nc} eq 0x003E) { # >
3470    
3471     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3472    
3473     $self->{state} = DATA_STATE;
3474 wakaba 1.5 $self->{s_kwd} = '';
3475 wakaba 1.1
3476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3477     $self->{line_prev} = $self->{line};
3478     $self->{column_prev} = $self->{column};
3479     $self->{column}++;
3480     $self->{nc}
3481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3482     } else {
3483     $self->{set_nc}->($self);
3484     }
3485    
3486    
3487     $self->{ct}->{quirks} = 1;
3488     return ($self->{ct}); # DOCTYPE
3489    
3490     redo A;
3491     } elsif ($self->{nc} == -1) {
3492    
3493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3494    
3495     $self->{state} = DATA_STATE;
3496 wakaba 1.5 $self->{s_kwd} = '';
3497 wakaba 1.1 ## reconsume
3498    
3499     $self->{ct}->{quirks} = 1;
3500     return ($self->{ct}); # DOCTYPE
3501    
3502     redo A;
3503 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3504    
3505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3506     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3507     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3508 wakaba 1.13 $self->{in_subset} = 1;
3509 wakaba 1.12
3510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3511     $self->{line_prev} = $self->{line};
3512     $self->{column_prev} = $self->{column};
3513     $self->{column}++;
3514     $self->{nc}
3515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3516     } else {
3517     $self->{set_nc}->($self);
3518     }
3519    
3520 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3521 wakaba 1.12 redo A;
3522 wakaba 1.1 } else {
3523    
3524     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3525     $self->{ct}->{quirks} = 1;
3526    
3527     $self->{state} = BOGUS_DOCTYPE_STATE;
3528    
3529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3530     $self->{line_prev} = $self->{line};
3531     $self->{column_prev} = $self->{column};
3532     $self->{column}++;
3533     $self->{nc}
3534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3535     } else {
3536     $self->{set_nc}->($self);
3537     }
3538    
3539     redo A;
3540     }
3541     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3542     if ($self->{nc} == 0x0022) { # "
3543    
3544     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3545    
3546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3547     $self->{line_prev} = $self->{line};
3548     $self->{column_prev} = $self->{column};
3549     $self->{column}++;
3550     $self->{nc}
3551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3552     } else {
3553     $self->{set_nc}->($self);
3554     }
3555    
3556     redo A;
3557     } elsif ($self->{nc} == 0x003E) { # >
3558    
3559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3560    
3561     $self->{state} = DATA_STATE;
3562 wakaba 1.5 $self->{s_kwd} = '';
3563 wakaba 1.1
3564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3565     $self->{line_prev} = $self->{line};
3566     $self->{column_prev} = $self->{column};
3567     $self->{column}++;
3568     $self->{nc}
3569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3570     } else {
3571     $self->{set_nc}->($self);
3572     }
3573    
3574    
3575     $self->{ct}->{quirks} = 1;
3576     return ($self->{ct}); # DOCTYPE
3577    
3578     redo A;
3579     } elsif ($self->{nc} == -1) {
3580    
3581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3582    
3583     $self->{state} = DATA_STATE;
3584 wakaba 1.5 $self->{s_kwd} = '';
3585 wakaba 1.1 ## reconsume
3586    
3587     $self->{ct}->{quirks} = 1;
3588     return ($self->{ct}); # DOCTYPE
3589    
3590     redo A;
3591     } else {
3592    
3593     $self->{ct}->{pubid} # DOCTYPE
3594     .= chr $self->{nc};
3595     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3596     length $self->{ct}->{pubid});
3597    
3598     ## Stay in the state
3599    
3600     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3601     $self->{line_prev} = $self->{line};
3602     $self->{column_prev} = $self->{column};
3603     $self->{column}++;
3604     $self->{nc}
3605     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3606     } else {
3607     $self->{set_nc}->($self);
3608     }
3609    
3610     redo A;
3611     }
3612     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3613     if ($self->{nc} == 0x0027) { # '
3614    
3615     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3616    
3617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3618     $self->{line_prev} = $self->{line};
3619     $self->{column_prev} = $self->{column};
3620     $self->{column}++;
3621     $self->{nc}
3622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3623     } else {
3624     $self->{set_nc}->($self);
3625     }
3626    
3627     redo A;
3628     } elsif ($self->{nc} == 0x003E) { # >
3629    
3630     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3631    
3632     $self->{state} = DATA_STATE;
3633 wakaba 1.5 $self->{s_kwd} = '';
3634 wakaba 1.1
3635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3636     $self->{line_prev} = $self->{line};
3637     $self->{column_prev} = $self->{column};
3638     $self->{column}++;
3639     $self->{nc}
3640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3641     } else {
3642     $self->{set_nc}->($self);
3643     }
3644    
3645    
3646     $self->{ct}->{quirks} = 1;
3647     return ($self->{ct}); # DOCTYPE
3648    
3649     redo A;
3650     } elsif ($self->{nc} == -1) {
3651    
3652     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3653    
3654     $self->{state} = DATA_STATE;
3655 wakaba 1.5 $self->{s_kwd} = '';
3656 wakaba 1.1 ## reconsume
3657    
3658     $self->{ct}->{quirks} = 1;
3659     return ($self->{ct}); # DOCTYPE
3660    
3661     redo A;
3662     } else {
3663    
3664     $self->{ct}->{pubid} # DOCTYPE
3665     .= chr $self->{nc};
3666     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3667     length $self->{ct}->{pubid});
3668    
3669     ## Stay in the state
3670    
3671     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3672     $self->{line_prev} = $self->{line};
3673     $self->{column_prev} = $self->{column};
3674     $self->{column}++;
3675     $self->{nc}
3676     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3677     } else {
3678     $self->{set_nc}->($self);
3679     }
3680    
3681     redo A;
3682     }
3683     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3684     if ($is_space->{$self->{nc}}) {
3685    
3686     ## Stay in the state
3687    
3688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3689     $self->{line_prev} = $self->{line};
3690     $self->{column_prev} = $self->{column};
3691     $self->{column}++;
3692     $self->{nc}
3693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3694     } else {
3695     $self->{set_nc}->($self);
3696     }
3697    
3698     redo A;
3699     } elsif ($self->{nc} == 0x0022) { # "
3700    
3701     $self->{ct}->{sysid} = ''; # DOCTYPE
3702     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3703    
3704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3705     $self->{line_prev} = $self->{line};
3706     $self->{column_prev} = $self->{column};
3707     $self->{column}++;
3708     $self->{nc}
3709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3710     } else {
3711     $self->{set_nc}->($self);
3712     }
3713    
3714     redo A;
3715     } elsif ($self->{nc} == 0x0027) { # '
3716    
3717     $self->{ct}->{sysid} = ''; # DOCTYPE
3718     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3719    
3720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3721     $self->{line_prev} = $self->{line};
3722     $self->{column_prev} = $self->{column};
3723     $self->{column}++;
3724     $self->{nc}
3725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3726     } else {
3727     $self->{set_nc}->($self);
3728     }
3729    
3730     redo A;
3731     } elsif ($self->{nc} == 0x003E) { # >
3732 wakaba 1.12 if ($self->{is_xml}) {
3733    
3734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3735     } else {
3736    
3737     }
3738 wakaba 1.1 $self->{state} = DATA_STATE;
3739 wakaba 1.5 $self->{s_kwd} = '';
3740 wakaba 1.1
3741     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3742     $self->{line_prev} = $self->{line};
3743     $self->{column_prev} = $self->{column};
3744     $self->{column}++;
3745     $self->{nc}
3746     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3747     } else {
3748     $self->{set_nc}->($self);
3749     }
3750    
3751    
3752     return ($self->{ct}); # DOCTYPE
3753    
3754     redo A;
3755     } elsif ($self->{nc} == -1) {
3756    
3757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3758    
3759     $self->{state} = DATA_STATE;
3760 wakaba 1.5 $self->{s_kwd} = '';
3761 wakaba 1.1 ## reconsume
3762    
3763     $self->{ct}->{quirks} = 1;
3764     return ($self->{ct}); # DOCTYPE
3765    
3766     redo A;
3767 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3768    
3769     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3771     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3772 wakaba 1.13 $self->{in_subset} = 1;
3773 wakaba 1.12
3774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3775     $self->{line_prev} = $self->{line};
3776     $self->{column_prev} = $self->{column};
3777     $self->{column}++;
3778     $self->{nc}
3779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3780     } else {
3781     $self->{set_nc}->($self);
3782     }
3783    
3784 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3785 wakaba 1.12 redo A;
3786 wakaba 1.1 } else {
3787    
3788     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3789     $self->{ct}->{quirks} = 1;
3790    
3791     $self->{state} = BOGUS_DOCTYPE_STATE;
3792    
3793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3794     $self->{line_prev} = $self->{line};
3795     $self->{column_prev} = $self->{column};
3796     $self->{column}++;
3797     $self->{nc}
3798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3799     } else {
3800     $self->{set_nc}->($self);
3801     }
3802    
3803     redo A;
3804     }
3805     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3806     if ($is_space->{$self->{nc}}) {
3807    
3808     ## Stay in the state
3809    
3810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3811     $self->{line_prev} = $self->{line};
3812     $self->{column_prev} = $self->{column};
3813     $self->{column}++;
3814     $self->{nc}
3815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3816     } else {
3817     $self->{set_nc}->($self);
3818     }
3819    
3820     redo A;
3821     } elsif ($self->{nc} == 0x0022) { # "
3822    
3823     $self->{ct}->{sysid} = ''; # DOCTYPE
3824     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3825    
3826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3827     $self->{line_prev} = $self->{line};
3828     $self->{column_prev} = $self->{column};
3829     $self->{column}++;
3830     $self->{nc}
3831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3832     } else {
3833     $self->{set_nc}->($self);
3834     }
3835    
3836     redo A;
3837     } elsif ($self->{nc} == 0x0027) { # '
3838    
3839     $self->{ct}->{sysid} = ''; # DOCTYPE
3840     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3841    
3842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3843     $self->{line_prev} = $self->{line};
3844     $self->{column_prev} = $self->{column};
3845     $self->{column}++;
3846     $self->{nc}
3847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3848     } else {
3849     $self->{set_nc}->($self);
3850     }
3851    
3852     redo A;
3853     } elsif ($self->{nc} == 0x003E) { # >
3854    
3855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856     $self->{state} = DATA_STATE;
3857 wakaba 1.5 $self->{s_kwd} = '';
3858 wakaba 1.1
3859     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3860     $self->{line_prev} = $self->{line};
3861     $self->{column_prev} = $self->{column};
3862     $self->{column}++;
3863     $self->{nc}
3864     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3865     } else {
3866     $self->{set_nc}->($self);
3867     }
3868    
3869    
3870     $self->{ct}->{quirks} = 1;
3871     return ($self->{ct}); # DOCTYPE
3872    
3873     redo A;
3874     } elsif ($self->{nc} == -1) {
3875    
3876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877    
3878     $self->{state} = DATA_STATE;
3879 wakaba 1.5 $self->{s_kwd} = '';
3880 wakaba 1.1 ## reconsume
3881    
3882     $self->{ct}->{quirks} = 1;
3883     return ($self->{ct}); # DOCTYPE
3884    
3885     redo A;
3886 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3887    
3888     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3889    
3890     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3891     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3892 wakaba 1.13 $self->{in_subset} = 1;
3893 wakaba 1.12
3894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3895     $self->{line_prev} = $self->{line};
3896     $self->{column_prev} = $self->{column};
3897     $self->{column}++;
3898     $self->{nc}
3899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3900     } else {
3901     $self->{set_nc}->($self);
3902     }
3903    
3904 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3905 wakaba 1.12 redo A;
3906 wakaba 1.1 } else {
3907    
3908     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3909     $self->{ct}->{quirks} = 1;
3910    
3911     $self->{state} = BOGUS_DOCTYPE_STATE;
3912    
3913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3914     $self->{line_prev} = $self->{line};
3915     $self->{column_prev} = $self->{column};
3916     $self->{column}++;
3917     $self->{nc}
3918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3919     } else {
3920     $self->{set_nc}->($self);
3921     }
3922    
3923     redo A;
3924     }
3925     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3926     if ($self->{nc} == 0x0022) { # "
3927    
3928     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3929    
3930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3931     $self->{line_prev} = $self->{line};
3932     $self->{column_prev} = $self->{column};
3933     $self->{column}++;
3934     $self->{nc}
3935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3936     } else {
3937     $self->{set_nc}->($self);
3938     }
3939    
3940     redo A;
3941 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3942 wakaba 1.1
3943     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3944    
3945     $self->{state} = DATA_STATE;
3946 wakaba 1.5 $self->{s_kwd} = '';
3947 wakaba 1.1
3948     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3949     $self->{line_prev} = $self->{line};
3950     $self->{column_prev} = $self->{column};
3951     $self->{column}++;
3952     $self->{nc}
3953     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3954     } else {
3955     $self->{set_nc}->($self);
3956     }
3957    
3958    
3959     $self->{ct}->{quirks} = 1;
3960     return ($self->{ct}); # DOCTYPE
3961    
3962     redo A;
3963     } elsif ($self->{nc} == -1) {
3964    
3965     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3966    
3967     $self->{state} = DATA_STATE;
3968 wakaba 1.5 $self->{s_kwd} = '';
3969 wakaba 1.1 ## reconsume
3970    
3971     $self->{ct}->{quirks} = 1;
3972     return ($self->{ct}); # DOCTYPE
3973    
3974     redo A;
3975     } else {
3976    
3977     $self->{ct}->{sysid} # DOCTYPE
3978     .= chr $self->{nc};
3979     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3980     length $self->{ct}->{sysid});
3981    
3982     ## Stay in the state
3983    
3984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3985     $self->{line_prev} = $self->{line};
3986     $self->{column_prev} = $self->{column};
3987     $self->{column}++;
3988     $self->{nc}
3989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3990     } else {
3991     $self->{set_nc}->($self);
3992     }
3993    
3994     redo A;
3995     }
3996     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3997     if ($self->{nc} == 0x0027) { # '
3998    
3999     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4000    
4001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4002     $self->{line_prev} = $self->{line};
4003     $self->{column_prev} = $self->{column};
4004     $self->{column}++;
4005     $self->{nc}
4006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4007     } else {
4008     $self->{set_nc}->($self);
4009     }
4010    
4011     redo A;
4012 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4013 wakaba 1.1
4014     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4015    
4016     $self->{state} = DATA_STATE;
4017 wakaba 1.5 $self->{s_kwd} = '';
4018 wakaba 1.1
4019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4020     $self->{line_prev} = $self->{line};
4021     $self->{column_prev} = $self->{column};
4022     $self->{column}++;
4023     $self->{nc}
4024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4025     } else {
4026     $self->{set_nc}->($self);
4027     }
4028    
4029    
4030     $self->{ct}->{quirks} = 1;
4031     return ($self->{ct}); # DOCTYPE
4032    
4033     redo A;
4034     } elsif ($self->{nc} == -1) {
4035    
4036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4037    
4038     $self->{state} = DATA_STATE;
4039 wakaba 1.5 $self->{s_kwd} = '';
4040 wakaba 1.1 ## reconsume
4041    
4042     $self->{ct}->{quirks} = 1;
4043     return ($self->{ct}); # DOCTYPE
4044    
4045     redo A;
4046     } else {
4047    
4048     $self->{ct}->{sysid} # DOCTYPE
4049     .= chr $self->{nc};
4050     $self->{read_until}->($self->{ct}->{sysid}, q['>],
4051     length $self->{ct}->{sysid});
4052    
4053     ## Stay in the state
4054    
4055     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4056     $self->{line_prev} = $self->{line};
4057     $self->{column_prev} = $self->{column};
4058     $self->{column}++;
4059     $self->{nc}
4060     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4061     } else {
4062     $self->{set_nc}->($self);
4063     }
4064    
4065     redo A;
4066     }
4067     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4068     if ($is_space->{$self->{nc}}) {
4069    
4070     ## Stay in the state
4071    
4072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4073     $self->{line_prev} = $self->{line};
4074     $self->{column_prev} = $self->{column};
4075     $self->{column}++;
4076     $self->{nc}
4077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4078     } else {
4079     $self->{set_nc}->($self);
4080     }
4081    
4082     redo A;
4083     } elsif ($self->{nc} == 0x003E) { # >
4084    
4085     $self->{state} = DATA_STATE;
4086 wakaba 1.5 $self->{s_kwd} = '';
4087 wakaba 1.1
4088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4089     $self->{line_prev} = $self->{line};
4090     $self->{column_prev} = $self->{column};
4091     $self->{column}++;
4092     $self->{nc}
4093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4094     } else {
4095     $self->{set_nc}->($self);
4096     }
4097    
4098    
4099     return ($self->{ct}); # DOCTYPE
4100    
4101     redo A;
4102     } elsif ($self->{nc} == -1) {
4103    
4104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4105     $self->{state} = DATA_STATE;
4106 wakaba 1.5 $self->{s_kwd} = '';
4107 wakaba 1.1 ## reconsume
4108    
4109     $self->{ct}->{quirks} = 1;
4110     return ($self->{ct}); # DOCTYPE
4111    
4112     redo A;
4113 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4114    
4115     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4116     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4117 wakaba 1.13 $self->{in_subset} = 1;
4118 wakaba 1.12
4119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4120     $self->{line_prev} = $self->{line};
4121     $self->{column_prev} = $self->{column};
4122     $self->{column}++;
4123     $self->{nc}
4124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4125     } else {
4126     $self->{set_nc}->($self);
4127     }
4128    
4129 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4130 wakaba 1.12 redo A;
4131 wakaba 1.1 } else {
4132    
4133     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4134     #$self->{ct}->{quirks} = 1;
4135    
4136     $self->{state} = BOGUS_DOCTYPE_STATE;
4137    
4138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139     $self->{line_prev} = $self->{line};
4140     $self->{column_prev} = $self->{column};
4141     $self->{column}++;
4142     $self->{nc}
4143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144     } else {
4145     $self->{set_nc}->($self);
4146     }
4147    
4148     redo A;
4149     }
4150     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4151     if ($self->{nc} == 0x003E) { # >
4152    
4153     $self->{state} = DATA_STATE;
4154 wakaba 1.5 $self->{s_kwd} = '';
4155 wakaba 1.1
4156     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4157     $self->{line_prev} = $self->{line};
4158     $self->{column_prev} = $self->{column};
4159     $self->{column}++;
4160     $self->{nc}
4161     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4162     } else {
4163     $self->{set_nc}->($self);
4164     }
4165    
4166    
4167     return ($self->{ct}); # DOCTYPE
4168    
4169     redo A;
4170 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4171 wakaba 1.13
4172     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4173     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4174     $self->{in_subset} = 1;
4175    
4176 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4177     $self->{line_prev} = $self->{line};
4178     $self->{column_prev} = $self->{column};
4179     $self->{column}++;
4180     $self->{nc}
4181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4182     } else {
4183     $self->{set_nc}->($self);
4184     }
4185    
4186 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4187     redo A;
4188 wakaba 1.1 } elsif ($self->{nc} == -1) {
4189    
4190     $self->{state} = DATA_STATE;
4191 wakaba 1.5 $self->{s_kwd} = '';
4192 wakaba 1.1 ## reconsume
4193    
4194     return ($self->{ct}); # DOCTYPE
4195    
4196     redo A;
4197     } else {
4198    
4199     my $s = '';
4200 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4201 wakaba 1.1
4202     ## Stay in the state
4203    
4204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4205     $self->{line_prev} = $self->{line};
4206     $self->{column_prev} = $self->{column};
4207     $self->{column}++;
4208     $self->{nc}
4209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4210     } else {
4211     $self->{set_nc}->($self);
4212     }
4213    
4214     redo A;
4215     }
4216     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4217     ## NOTE: "CDATA section state" in the spec is jointly implemented
4218     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4219     ## and |CDATA_SECTION_MSE2_STATE|.
4220 wakaba 1.10
4221     ## XML5: "CDATA state".
4222 wakaba 1.1
4223     if ($self->{nc} == 0x005D) { # ]
4224    
4225     $self->{state} = CDATA_SECTION_MSE1_STATE;
4226    
4227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4228     $self->{line_prev} = $self->{line};
4229     $self->{column_prev} = $self->{column};
4230     $self->{column}++;
4231     $self->{nc}
4232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4233     } else {
4234     $self->{set_nc}->($self);
4235     }
4236    
4237     redo A;
4238     } elsif ($self->{nc} == -1) {
4239 wakaba 1.6 if ($self->{is_xml}) {
4240 wakaba 1.8
4241 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4242 wakaba 1.8 } else {
4243    
4244 wakaba 1.6 }
4245    
4246 wakaba 1.1 $self->{state} = DATA_STATE;
4247 wakaba 1.5 $self->{s_kwd} = '';
4248 wakaba 1.10 ## Reconsume.
4249 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4250    
4251     return ($self->{ct}); # character
4252     } else {
4253    
4254     ## No token to emit. $self->{ct} is discarded.
4255     }
4256     redo A;
4257     } else {
4258    
4259     $self->{ct}->{data} .= chr $self->{nc};
4260     $self->{read_until}->($self->{ct}->{data},
4261     q<]>,
4262     length $self->{ct}->{data});
4263    
4264     ## Stay in the state.
4265    
4266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4267     $self->{line_prev} = $self->{line};
4268     $self->{column_prev} = $self->{column};
4269     $self->{column}++;
4270     $self->{nc}
4271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4272     } else {
4273     $self->{set_nc}->($self);
4274     }
4275    
4276     redo A;
4277     }
4278    
4279     ## ISSUE: "text tokens" in spec.
4280     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4281 wakaba 1.10 ## XML5: "CDATA bracket state".
4282    
4283 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4284    
4285     $self->{state} = CDATA_SECTION_MSE2_STATE;
4286    
4287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4288     $self->{line_prev} = $self->{line};
4289     $self->{column_prev} = $self->{column};
4290     $self->{column}++;
4291     $self->{nc}
4292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4293     } else {
4294     $self->{set_nc}->($self);
4295     }
4296    
4297     redo A;
4298     } else {
4299    
4300 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4301 wakaba 1.1 $self->{ct}->{data} .= ']';
4302 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4303 wakaba 1.1 ## Reconsume.
4304     redo A;
4305     }
4306     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4307 wakaba 1.10 ## XML5: "CDATA end state".
4308    
4309 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4310     $self->{state} = DATA_STATE;
4311 wakaba 1.5 $self->{s_kwd} = '';
4312 wakaba 1.1
4313     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4314     $self->{line_prev} = $self->{line};
4315     $self->{column_prev} = $self->{column};
4316     $self->{column}++;
4317     $self->{nc}
4318     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4319     } else {
4320     $self->{set_nc}->($self);
4321     }
4322    
4323     if (length $self->{ct}->{data}) { # character
4324    
4325     return ($self->{ct}); # character
4326     } else {
4327    
4328     ## No token to emit. $self->{ct} is discarded.
4329     }
4330     redo A;
4331     } elsif ($self->{nc} == 0x005D) { # ]
4332     # character
4333     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4334     ## Stay in the state.
4335    
4336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337     $self->{line_prev} = $self->{line};
4338     $self->{column_prev} = $self->{column};
4339     $self->{column}++;
4340     $self->{nc}
4341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342     } else {
4343     $self->{set_nc}->($self);
4344     }
4345    
4346     redo A;
4347     } else {
4348    
4349     $self->{ct}->{data} .= ']]'; # character
4350     $self->{state} = CDATA_SECTION_STATE;
4351 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4352 wakaba 1.1 redo A;
4353     }
4354     } elsif ($self->{state} == ENTITY_STATE) {
4355     if ($is_space->{$self->{nc}} or
4356     {
4357     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4358     $self->{entity_add} => 1,
4359     }->{$self->{nc}}) {
4360    
4361     ## Don't consume
4362     ## No error
4363     ## Return nothing.
4364     #
4365     } elsif ($self->{nc} == 0x0023) { # #
4366    
4367     $self->{state} = ENTITY_HASH_STATE;
4368 wakaba 1.12 $self->{kwd} = '#';
4369 wakaba 1.1
4370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4371     $self->{line_prev} = $self->{line};
4372     $self->{column_prev} = $self->{column};
4373     $self->{column}++;
4374     $self->{nc}
4375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4376     } else {
4377     $self->{set_nc}->($self);
4378     }
4379    
4380     redo A;
4381     } elsif ((0x0041 <= $self->{nc} and
4382     $self->{nc} <= 0x005A) or # A..Z
4383     (0x0061 <= $self->{nc} and
4384     $self->{nc} <= 0x007A)) { # a..z
4385    
4386     require Whatpm::_NamedEntityList;
4387     $self->{state} = ENTITY_NAME_STATE;
4388 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4389     $self->{entity__value} = $self->{kwd};
4390 wakaba 1.1 $self->{entity__match} = 0;
4391    
4392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4393     $self->{line_prev} = $self->{line};
4394     $self->{column_prev} = $self->{column};
4395     $self->{column}++;
4396     $self->{nc}
4397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4398     } else {
4399     $self->{set_nc}->($self);
4400     }
4401    
4402     redo A;
4403     } else {
4404    
4405     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4406     ## Return nothing.
4407     #
4408     }
4409    
4410     ## NOTE: No character is consumed by the "consume a character
4411     ## reference" algorithm. In other words, there is an "&" character
4412     ## that does not introduce a character reference, which would be
4413     ## appended to the parent element or the attribute value in later
4414     ## process of the tokenizer.
4415    
4416     if ($self->{prev_state} == DATA_STATE) {
4417    
4418     $self->{state} = $self->{prev_state};
4419 wakaba 1.5 $self->{s_kwd} = '';
4420 wakaba 1.1 ## Reconsume.
4421     return ({type => CHARACTER_TOKEN, data => '&',
4422     line => $self->{line_prev},
4423     column => $self->{column_prev},
4424     });
4425     redo A;
4426     } else {
4427    
4428     $self->{ca}->{value} .= '&';
4429     $self->{state} = $self->{prev_state};
4430 wakaba 1.5 $self->{s_kwd} = '';
4431 wakaba 1.1 ## Reconsume.
4432     redo A;
4433     }
4434     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4435     if ($self->{nc} == 0x0078 or # x
4436     $self->{nc} == 0x0058) { # X
4437    
4438     $self->{state} = HEXREF_X_STATE;
4439 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4440 wakaba 1.1
4441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4442     $self->{line_prev} = $self->{line};
4443     $self->{column_prev} = $self->{column};
4444     $self->{column}++;
4445     $self->{nc}
4446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4447     } else {
4448     $self->{set_nc}->($self);
4449     }
4450    
4451     redo A;
4452     } elsif (0x0030 <= $self->{nc} and
4453     $self->{nc} <= 0x0039) { # 0..9
4454    
4455     $self->{state} = NCR_NUM_STATE;
4456 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4457 wakaba 1.1
4458     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4459     $self->{line_prev} = $self->{line};
4460     $self->{column_prev} = $self->{column};
4461     $self->{column}++;
4462     $self->{nc}
4463     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4464     } else {
4465     $self->{set_nc}->($self);
4466     }
4467    
4468     redo A;
4469     } else {
4470     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4471     line => $self->{line_prev},
4472     column => $self->{column_prev} - 1);
4473    
4474     ## NOTE: According to the spec algorithm, nothing is returned,
4475     ## and then "&#" is appended to the parent element or the attribute
4476     ## value in the later processing.
4477    
4478     if ($self->{prev_state} == DATA_STATE) {
4479    
4480     $self->{state} = $self->{prev_state};
4481 wakaba 1.5 $self->{s_kwd} = '';
4482 wakaba 1.1 ## Reconsume.
4483     return ({type => CHARACTER_TOKEN,
4484     data => '&#',
4485     line => $self->{line_prev},
4486     column => $self->{column_prev} - 1,
4487     });
4488     redo A;
4489     } else {
4490    
4491     $self->{ca}->{value} .= '&#';
4492     $self->{state} = $self->{prev_state};
4493 wakaba 1.5 $self->{s_kwd} = '';
4494 wakaba 1.1 ## Reconsume.
4495     redo A;
4496     }
4497     }
4498     } elsif ($self->{state} == NCR_NUM_STATE) {
4499     if (0x0030 <= $self->{nc} and
4500     $self->{nc} <= 0x0039) { # 0..9
4501    
4502 wakaba 1.12 $self->{kwd} *= 10;
4503     $self->{kwd} += $self->{nc} - 0x0030;
4504 wakaba 1.1
4505     ## Stay in the state.
4506    
4507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4508     $self->{line_prev} = $self->{line};
4509     $self->{column_prev} = $self->{column};
4510     $self->{column}++;
4511     $self->{nc}
4512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4513     } else {
4514     $self->{set_nc}->($self);
4515     }
4516    
4517     redo A;
4518     } elsif ($self->{nc} == 0x003B) { # ;
4519    
4520    
4521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4522     $self->{line_prev} = $self->{line};
4523     $self->{column_prev} = $self->{column};
4524     $self->{column}++;
4525     $self->{nc}
4526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4527     } else {
4528     $self->{set_nc}->($self);
4529     }
4530    
4531     #
4532     } else {
4533    
4534     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4535     ## Reconsume.
4536     #
4537     }
4538    
4539 wakaba 1.12 my $code = $self->{kwd};
4540 wakaba 1.1 my $l = $self->{line_prev};
4541     my $c = $self->{column_prev};
4542     if ($charref_map->{$code}) {
4543    
4544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4545     text => (sprintf 'U+%04X', $code),
4546     line => $l, column => $c);
4547     $code = $charref_map->{$code};
4548     } elsif ($code > 0x10FFFF) {
4549    
4550     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4551     text => (sprintf 'U-%08X', $code),
4552     line => $l, column => $c);
4553     $code = 0xFFFD;
4554     }
4555    
4556     if ($self->{prev_state} == DATA_STATE) {
4557    
4558     $self->{state} = $self->{prev_state};
4559 wakaba 1.5 $self->{s_kwd} = '';
4560 wakaba 1.1 ## Reconsume.
4561     return ({type => CHARACTER_TOKEN, data => chr $code,
4562 wakaba 1.7 has_reference => 1,
4563 wakaba 1.1 line => $l, column => $c,
4564     });
4565     redo A;
4566     } else {
4567    
4568     $self->{ca}->{value} .= chr $code;
4569     $self->{ca}->{has_reference} = 1;
4570     $self->{state} = $self->{prev_state};
4571 wakaba 1.5 $self->{s_kwd} = '';
4572 wakaba 1.1 ## Reconsume.
4573     redo A;
4574     }
4575     } elsif ($self->{state} == HEXREF_X_STATE) {
         ## Decides whether a hexadecimal character reference really
         ## starts here: the next character must be a hex digit.
         ## NOTE(review): presumably $self->{kwd} holds the "#x"/"#X"
         ## text consumed after "&" at this point -- confirm against the
         ## state that switches to HEXREF_X_STATE.
4576     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4577     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4578     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4579     # 0..9, A..F, a..f
4580    
         ## From here on |kwd| is reused as the numeric accumulator for
         ## the code point; the digit itself is handled by
         ## HEXREF_HEX_STATE because the character is not consumed here.
4581     $self->{state} = HEXREF_HEX_STATE;
4582 wakaba 1.12 $self->{kwd} = 0;
4583 wakaba 1.1 ## Reconsume.
4584     redo A;
4585     } else {
         ## Not a hex digit: report the error and fall back to emitting
         ## the text seen so far literally.
4586     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4587     line => $self->{line_prev},
4588     column => $self->{column_prev} - 2);
4589    
4590     ## NOTE: According to the spec algorithm, nothing is returned,
4591     ## and then "&#" followed by "X" or "x" is appended to the parent
4592     ## element or the attribute value in the later processing.
4593    
4594     if ($self->{prev_state} == DATA_STATE) {
4595    
         ## In content: return the literal "&" . kwd as a character
         ## token.  The |redo A| after |return| is never reached.
4596     $self->{state} = $self->{prev_state};
4597 wakaba 1.5 $self->{s_kwd} = '';
4598 wakaba 1.1 ## Reconsume.
4599     return ({type => CHARACTER_TOKEN,
4600 wakaba 1.12 data => '&' . $self->{kwd},
4601 wakaba 1.1 line => $self->{line_prev},
4602 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4603 wakaba 1.1 });
4604     redo A;
4605     } else {
4606    
         ## Otherwise: append the literal text to the value of
         ## $self->{ca} and resume the state that was interrupted.
4607 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4608 wakaba 1.1 $self->{state} = $self->{prev_state};
4609 wakaba 1.5 $self->{s_kwd} = '';
4610 wakaba 1.1 ## Reconsume.
4611     redo A;
4612     }
4613     }
4614     } elsif ($self->{state} == HEXREF_HEX_STATE) {
         ## Accumulates the hexadecimal digits of a character reference
         ## into $self->{kwd} (a number here), then converts the result
         ## into a character once a non-digit is seen.
4615     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4616     # 0..9
4617    
         ## kwd = kwd * 16 + digit value (0..9).
4618 wakaba 1.12 $self->{kwd} *= 0x10;
4619     $self->{kwd} += $self->{nc} - 0x0030;
4620 wakaba 1.1 ## Stay in the state.
4621    
         ## Advance to the next input character: take it from
         ## |char_buffer| when one is left there, otherwise call the
         ## |set_nc| callback.  The same block recurs below.
4622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4623     $self->{line_prev} = $self->{line};
4624     $self->{column_prev} = $self->{column};
4625     $self->{column}++;
4626     $self->{nc}
4627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4628     } else {
4629     $self->{set_nc}->($self);
4630     }
4631    
4632     redo A;
4633     } elsif (0x0061 <= $self->{nc} and
4634     $self->{nc} <= 0x0066) { # a..f
4635    
         ## "a" is 0x61, so nc - 0x60 + 9 yields 10..15.
4636 wakaba 1.12 $self->{kwd} *= 0x10;
4637     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4638 wakaba 1.1 ## Stay in the state.
4639    
4640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4641     $self->{line_prev} = $self->{line};
4642     $self->{column_prev} = $self->{column};
4643     $self->{column}++;
4644     $self->{nc}
4645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4646     } else {
4647     $self->{set_nc}->($self);
4648     }
4649    
4650     redo A;
4651     } elsif (0x0041 <= $self->{nc} and
4652     $self->{nc} <= 0x0046) { # A..F
4653    
         ## "A" is 0x41, so nc - 0x40 + 9 yields 10..15.
4654 wakaba 1.12 $self->{kwd} *= 0x10;
4655     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4656 wakaba 1.1 ## Stay in the state.
4657    
4658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4659     $self->{line_prev} = $self->{line};
4660     $self->{column_prev} = $self->{column};
4661     $self->{column}++;
4662     $self->{nc}
4663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4664     } else {
4665     $self->{set_nc}->($self);
4666     }
4667    
4668     redo A;
4669     } elsif ($self->{nc} == 0x003B) { # ;
4670    
         ## Terminating ";": consume it and fall through to the
         ## conversion code below.
4671    
4672     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673     $self->{line_prev} = $self->{line};
4674     $self->{column_prev} = $self->{column};
4675     $self->{column}++;
4676     $self->{nc}
4677     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678     } else {
4679     $self->{set_nc}->($self);
4680     }
4681    
4682     #
4683     } else {
4684    
         ## Reference not closed by ";": report it, leave the current
         ## character unconsumed, and still convert what was read.
4685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4686     line => $self->{line},
4687     column => $self->{column});
4688     ## Reconsume.
4689     #
4690     }
4691    
         ## Convert the accumulated number into a character.  $l/$c are
         ## the position used for error reports and for the token.
4692 wakaba 1.12 my $code = $self->{kwd};
4693 wakaba 1.1 my $l = $self->{line_prev};
4694     my $c = $self->{column_prev};
         ## Code points listed in $charref_map are reported as invalid
         ## and replaced by the mapped value; values above U+10FFFF are
         ## reported and replaced by U+FFFD.
4695     if ($charref_map->{$code}) {
4696    
4697     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4698     text => (sprintf 'U+%04X', $code),
4699     line => $l, column => $c);
4700     $code = $charref_map->{$code};
4701     } elsif ($code > 0x10FFFF) {
4702    
4703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4704     text => (sprintf 'U-%08X', $code),
4705     line => $l, column => $c);
4706     $code = 0xFFFD;
4707     }
4708    
4709     if ($self->{prev_state} == DATA_STATE) {
4710    
         ## In content: return the character as a token flagged with
         ## |has_reference|.  The |redo A| after |return| is never
         ## reached.
4711     $self->{state} = $self->{prev_state};
4712 wakaba 1.5 $self->{s_kwd} = '';
4713 wakaba 1.1 ## Reconsume.
4714     return ({type => CHARACTER_TOKEN, data => chr $code,
4715 wakaba 1.7 has_reference => 1,
4716 wakaba 1.1 line => $l, column => $c,
4717     });
4718     redo A;
4719     } else {
4720    
         ## Otherwise: append the character to the value of
         ## $self->{ca}, flag it, and resume the interrupted state.
4721     $self->{ca}->{value} .= chr $code;
4722     $self->{ca}->{has_reference} = 1;
4723     $self->{state} = $self->{prev_state};
4724 wakaba 1.5 $self->{s_kwd} = '';
4725 wakaba 1.1 ## Reconsume.
4726     redo A;
4727     }
4728     } elsif ($self->{state} == ENTITY_NAME_STATE) {
         ## Collects the name of a named character reference in
         ## $self->{kwd}, one character per pass, and remembers the
         ## longest name found in $EntityChar so far.
         ##
         ## $self->{entity__match} encodes the result:
         ##    1   kwd (ending in ";") is a key of $EntityChar;
         ##   -1   kwd (without ";") is a key of $EntityChar;
         ##   < -1 an earlier -1 match was followed by further
         ##        characters (the value is doubled for each of them);
         ##    0   nothing matched (doubling keeps it 0).
         ## NOTE(review): this assumes entity__match starts at 0 when
         ## the state is entered -- the initialisation is not in view.
4729 wakaba 1.12 if (length $self->{kwd} < 30 and
4730 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4731     ((0x0041 <= $self->{nc} and # A
4732     $self->{nc} <= 0x005A) or # Z
4733     (0x0061 <= $self->{nc} and # a
4734     $self->{nc} <= 0x007A) or # z
4735     (0x0030 <= $self->{nc} and # 0
4736     $self->{nc} <= 0x0039) or # 9
4737     $self->{nc} == 0x003B)) { # ;
4738     our $EntityChar;
4739 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4740     if (defined $EntityChar->{$self->{kwd}}) {
4741 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
4742    
         ## Complete match terminated by ";": consume the ";" and
         ## leave the loop (falls through to the code after the
         ## outer |if|).
4743 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4744 wakaba 1.1 $self->{entity__match} = 1;
4745    
4746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4747     $self->{line_prev} = $self->{line};
4748     $self->{column_prev} = $self->{column};
4749     $self->{column}++;
4750     $self->{nc}
4751     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4752     } else {
4753     $self->{set_nc}->($self);
4754     }
4755    
4756     #
4757     } else {
4758    
         ## Match without ";": remember it and keep reading, since
         ## a longer name may still match.
4759 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4760 wakaba 1.1 $self->{entity__match} = -1;
4761     ## Stay in the state.
4762    
4763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4764     $self->{line_prev} = $self->{line};
4765     $self->{column_prev} = $self->{column};
4766     $self->{column}++;
4767     $self->{nc}
4768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4769     } else {
4770     $self->{set_nc}->($self);
4771     }
4772    
4773     redo A;
4774     }
4775     } else {
4776    
         ## kwd is not (yet) a known name: keep the character as
         ## trailing text of the last match and mark the match as
         ## "followed by more characters" by doubling the flag.
4777     $self->{entity__value} .= chr $self->{nc};
4778     $self->{entity__match} *= 2;
4779     ## Stay in the state.
4780    
4781     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4782     $self->{line_prev} = $self->{line};
4783     $self->{column_prev} = $self->{column};
4784     $self->{column}++;
4785     $self->{nc}
4786     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4787     } else {
4788     $self->{set_nc}->($self);
4789     }
4790    
4791     redo A;
4792     }
4793     }
4794    
         ## The name has ended (non-name character, length limit, or a
         ## ";"-terminated match).  Work out the replacement text.
4795     my $data;
4796     my $has_ref;
4797     if ($self->{entity__match} > 0) {
4798    
4799     $data = $self->{entity__value};
4800     $has_ref = 1;
4801     #
4802     } elsif ($self->{entity__match} < 0) {
4803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
         ## In an attribute value, a ";"-less match that is followed
         ## by further name characters is not expanded; the literal
         ## text is kept instead.
4804     if ($self->{prev_state} != DATA_STATE and # in attribute
4805     $self->{entity__match} < -1) {
4806    
4807 wakaba 1.12 $data = '&' . $self->{kwd};
4808 wakaba 1.1 #
4809     } else {
4810    
4811     $data = $self->{entity__value};
4812     $has_ref = 1;
4813     #
4814     }
4815     } else {
4816    
         ## No known name at all: the "&" was a bare ampersand.
4817     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4818     line => $self->{line_prev},
4819 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
4820     $data = '&' . $self->{kwd};
4821 wakaba 1.1 #
4822     }
4823    
4824     ## NOTE: In these cases, when a character reference is found,
4825     ## it is consumed and a character token is returned, or, otherwise,
4826     ## nothing is consumed and returned, according to the spec algorithm.
4827     ## In this implementation, anything that has been examined by the
4828     ## tokenizer is appended to the parent element or the attribute value
4829     ## as string, either literal string when no character reference or
4830     ## entity-replaced string otherwise, in this stage, since any characters
4831     ## that would not be consumed are appended in the data state or in an
4832     ## appropriate attribute value state anyway.
4833    
4834     if ($self->{prev_state} == DATA_STATE) {
4835    
         ## The |redo A| after |return| is never reached.
4836     $self->{state} = $self->{prev_state};
4837 wakaba 1.5 $self->{s_kwd} = '';
4838 wakaba 1.1 ## Reconsume.
4839     return ({type => CHARACTER_TOKEN,
4840     data => $data,
4841 wakaba 1.7 has_reference => $has_ref,
4842 wakaba 1.1 line => $self->{line_prev},
4843 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
4844 wakaba 1.1 });
4845     redo A;
4846     } else {
4847    
4848     $self->{ca}->{value} .= $data;
4849     $self->{ca}->{has_reference} = 1 if $has_ref;
4850     $self->{state} = $self->{prev_state};
4851 wakaba 1.5 $self->{s_kwd} = '';
4852 wakaba 1.1 ## Reconsume.
4853     redo A;
4854     }
4855 wakaba 1.8
4856     ## XML-only states
4857    
4858     } elsif ($self->{state} == PI_STATE) {
4859 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
4860    
         ## First character after "<?" (DOCTYPE_TAG_STATE, for one,
         ## switches here after consuming the "?").  A white space, a
         ## second "?" or EOF means there is no target name, so the
         ## construct is turned into a bogus comment whose data starts
         ## with "?"; anything else starts a processing instruction.
4861 wakaba 1.8 if ($is_space->{$self->{nc}} or
4862 wakaba 1.14 $self->{nc} == 0x003F or # ?
4863 wakaba 1.8 $self->{nc} == -1) {
4864 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
4865     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
4866     ## "DOCTYPE pi state": Parse error, switch to the "data
4867     ## state".
         ## The column is moved back by one unless the current
         ## character is EOF (nc == -1).
4868 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4869     line => $self->{line_prev},
4870     column => $self->{column_prev}
4871     - 1 * ($self->{nc} != -1));
4872     $self->{state} = BOGUS_COMMENT_STATE;
4873     ## Reconsume.
4874     $self->{ct} = {type => COMMENT_TOKEN,
4875     data => '?',
4876     line => $self->{line_prev},
4877     column => $self->{column_prev}
4878     - 1 * ($self->{nc} != -1),
4879     };
4880     redo A;
4881     } else {
4882 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
         ## Start a PI token whose target begins with the current
         ## character, then read the rest in PI_TARGET_STATE.
4883 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
4884     target => chr $self->{nc},
4885     data => '',
4886     line => $self->{line_prev},
4887     column => $self->{column_prev} - 1,
4888     };
4889     $self->{state} = PI_TARGET_STATE;
4890    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).
4891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4892     $self->{line_prev} = $self->{line};
4893     $self->{column_prev} = $self->{column};
4894     $self->{column}++;
4895     $self->{nc}
4896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4897     } else {
4898     $self->{set_nc}->($self);
4899     }
4900    
4901     redo A;
4902     }
4903     } elsif ($self->{state} == PI_TARGET_STATE) {
         ## Reads the remaining characters of the PI target into
         ## $self->{ct}->{target}.  White space ends the target, "?"
         ## may start the closing "?>", EOF ends the PI prematurely.
4904     if ($is_space->{$self->{nc}}) {
4905     $self->{state} = PI_TARGET_AFTER_STATE;
4906    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
4907     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4908     $self->{line_prev} = $self->{line};
4909     $self->{column_prev} = $self->{column};
4910     $self->{column}++;
4911     $self->{nc}
4912     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4913     } else {
4914     $self->{set_nc}->($self);
4915     }
4916    
4917     redo A;
4918     } elsif ($self->{nc} == -1) {
         ## EOF inside the target: report it, go back to the internal
         ## subset state if |in_subset| is set (data state otherwise),
         ## and return the PI token as far as it was read.  The
         ## |redo A| after |return| is never reached.
4919     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4920 wakaba 1.13 if ($self->{in_subset}) {
4921     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4922     } else {
4923     $self->{state} = DATA_STATE;
4924     $self->{s_kwd} = '';
4925     }
4926 wakaba 1.8 ## Reconsume.
4927     return ($self->{ct}); # pi
4928     redo A;
4929     } elsif ($self->{nc} == 0x003F) { # ?
4930     $self->{state} = PI_AFTER_STATE;
4931    
4932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4933     $self->{line_prev} = $self->{line};
4934     $self->{column_prev} = $self->{column};
4935     $self->{column}++;
4936     $self->{nc}
4937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4938     } else {
4939     $self->{set_nc}->($self);
4940     }
4941    
4942     redo A;
4943     } else {
4944     ## XML5: typo ("tag name" -> "target")
4945     $self->{ct}->{target} .= chr $self->{nc}; # pi
4946    
4947     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4948     $self->{line_prev} = $self->{line};
4949     $self->{column_prev} = $self->{column};
4950     $self->{column}++;
4951     $self->{nc}
4952     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4953     } else {
4954     $self->{set_nc}->($self);
4955     }
4956    
4957     redo A;
4958     }
4959     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
         ## Skips the white space between the PI target and its data.
         ## The first non-space character is not consumed here; it is
         ## handed to PI_DATA_STATE.
4960     if ($is_space->{$self->{nc}}) {
4961     ## Stay in the state.
4962    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).
4963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964     $self->{line_prev} = $self->{line};
4965     $self->{column_prev} = $self->{column};
4966     $self->{column}++;
4967     $self->{nc}
4968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969     } else {
4970     $self->{set_nc}->($self);
4971     }
4972    
4973     redo A;
4974     } else {
4975     $self->{state} = PI_DATA_STATE;
4976     ## Reprocess.
4977     redo A;
4978     }
4979     } elsif ($self->{state} == PI_DATA_STATE) {
         ## Reads the PI data into $self->{ct}->{data} until a "?"
         ## (possible start of "?>") or EOF is found.
4980     if ($self->{nc} == 0x003F) { # ?
4981     $self->{state} = PI_DATA_AFTER_STATE;
4982    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
4983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4984     $self->{line_prev} = $self->{line};
4985     $self->{column_prev} = $self->{column};
4986     $self->{column}++;
4987     $self->{nc}
4988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4989     } else {
4990     $self->{set_nc}->($self);
4991     }
4992    
4993     redo A;
4994     } elsif ($self->{nc} == -1) {
         ## EOF inside the data: report it, go back to the internal
         ## subset state if |in_subset| is set (data state otherwise),
         ## and return the PI token as far as it was read.  The
         ## |redo A| after |return| is never reached.
4995     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4996 wakaba 1.13 if ($self->{in_subset}) {
4997 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
4998 wakaba 1.13 } else {
4999     $self->{state} = DATA_STATE;
5000     $self->{s_kwd} = '';
5001     }
5002 wakaba 1.8 ## Reprocess.
5003     return ($self->{ct}); # pi
5004     redo A;
5005     } else {
         ## Ordinary data character: append it, then let the
         ## |read_until| callback work on the data string.
         ## NOTE(review): presumably |read_until| appends following
         ## input up to (not including) "?" in bulk -- its definition
         ## is not in view, confirm.
5006     $self->{ct}->{data} .= chr $self->{nc}; # pi
5007     $self->{read_until}->($self->{ct}->{data}, q[?],
5008     length $self->{ct}->{data});
5009     ## Stay in the state.
5010    
5011     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5012     $self->{line_prev} = $self->{line};
5013     $self->{column_prev} = $self->{column};
5014     $self->{column}++;
5015     $self->{nc}
5016     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5017     } else {
5018     $self->{set_nc}->($self);
5019     }
5020    
5021     ## Reprocess.
5022     redo A;
5023     }
5024     } elsif ($self->{state} == PI_AFTER_STATE) {
5025 wakaba 1.14 ## XML5: Part of "Pi after state".
5026    
         ## Reached from PI_TARGET_STATE when a "?" directly follows
         ## the target.  ">" closes the PI; anything else means the "?"
         ## was data that followed the target without white space.
5027 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5028 wakaba 1.13 if ($self->{in_subset}) {
5029     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5030     } else {
5031     $self->{state} = DATA_STATE;
5032     $self->{s_kwd} = '';
5033     }
5034 wakaba 1.8
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5036     $self->{line_prev} = $self->{line};
5037     $self->{column_prev} = $self->{column};
5038     $self->{column}++;
5039     $self->{nc}
5040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5041     } else {
5042     $self->{set_nc}->($self);
5043     }
5044    
         ## The |redo A| after |return| is never reached.
5045     return ($self->{ct}); # pi
5046     redo A;
5047     } elsif ($self->{nc} == 0x003F) { # ?
         ## "??": the first "?" becomes data; the second one may still
         ## start "?>", which PI_DATA_AFTER_STATE decides.
5048     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5049     line => $self->{line_prev},
5050     column => $self->{column_prev}); ## XML5: no error
5051     $self->{ct}->{data} .= '?';
5052     $self->{state} = PI_DATA_AFTER_STATE;
5053    
5054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5055     $self->{line_prev} = $self->{line};
5056     $self->{column_prev} = $self->{column};
5057     $self->{column}++;
5058     $self->{nc}
5059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5060     } else {
5061     $self->{set_nc}->($self);
5062     }
5063    
5064     redo A;
5065     } else {
         ## The column is moved forward by one when the current
         ## character is EOF (nc == -1).
5066     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5067     line => $self->{line_prev},
5068     column => $self->{column_prev}
5069     + 1 * ($self->{nc} == -1)); ## XML5: no error
5070     $self->{ct}->{data} .= '?'; ## XML5: not appended
5071     $self->{state} = PI_DATA_STATE;
5072     ## Reprocess.
5073     redo A;
5074     }
5075     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5076 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5077    
         ## A "?" has been seen in (or right before) the PI data and has
         ## not been appended yet.  ">" closes the PI; another "?"
         ## appends one "?" and keeps waiting; anything else appends
         ## the pending "?" and goes back to PI_DATA_STATE.
5078 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5079 wakaba 1.13 if ($self->{in_subset}) {
5080     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5081     } else {
5082     $self->{state} = DATA_STATE;
5083     $self->{s_kwd} = '';
5084     }
5085 wakaba 1.8
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5087     $self->{line_prev} = $self->{line};
5088     $self->{column_prev} = $self->{column};
5089     $self->{column}++;
5090     $self->{nc}
5091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5092     } else {
5093     $self->{set_nc}->($self);
5094     }
5095    
         ## The |redo A| after |return| is never reached.
5096     return ($self->{ct}); # pi
5097     redo A;
5098     } elsif ($self->{nc} == 0x003F) { # ?
5099     $self->{ct}->{data} .= '?';
5100     ## Stay in the state.
5101    
5102     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5103     $self->{line_prev} = $self->{line};
5104     $self->{column_prev} = $self->{column};
5105     $self->{column}++;
5106     $self->{nc}
5107     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5108     } else {
5109     $self->{set_nc}->($self);
5110     }
5111    
5112     redo A;
5113     } else {
5114     $self->{ct}->{data} .= '?'; ## XML5: not appended
5115     $self->{state} = PI_DATA_STATE;
5116     ## Reprocess.
5117     redo A;
5118     }
5119 wakaba 1.12
5120     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
         ## Top level of the DOCTYPE internal subset.  Dispatches on
         ## the current character: "<" starts a declaration/PI/comment,
         ## "]" ends the subset, white space is skipped, EOF ends the
         ## DOCTYPE, and any other character is skipped with an error.
5121     if ($self->{nc} == 0x003C) { # <
5122 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5123 wakaba 1.12
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5124     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5125     $self->{line_prev} = $self->{line};
5126     $self->{column_prev} = $self->{column};
5127     $self->{column}++;
5128     $self->{nc}
5129     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5130     } else {
5131     $self->{set_nc}->($self);
5132     }
5133    
5134     redo A;
5135     } elsif ($self->{nc} == 0x0025) { # %
5136     ## XML5: Not defined yet.
5137    
5138     ## TODO:
         ## For now the "%" is only consumed; nothing else is done.
5139    
5140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5141     $self->{line_prev} = $self->{line};
5142     $self->{column_prev} = $self->{column};
5143     $self->{column}++;
5144     $self->{nc}
5145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5146     } else {
5147     $self->{set_nc}->($self);
5148     }
5149    
5150     redo A;
5151     } elsif ($self->{nc} == 0x005D) { # ]
         ## End of the subset: clear the |in_subset| flag that the PI
         ## states consult to choose the state they return to.
5152 wakaba 1.13 delete $self->{in_subset};
5153 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5154    
5155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5156     $self->{line_prev} = $self->{line};
5157     $self->{column_prev} = $self->{column};
5158     $self->{column}++;
5159     $self->{nc}
5160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5161     } else {
5162     $self->{set_nc}->($self);
5163     }
5164    
5165     redo A;
5166     } elsif ($is_space->{$self->{nc}}) {
5167     ## Stay in the state.
5168    
5169     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5170     $self->{line_prev} = $self->{line};
5171     $self->{column_prev} = $self->{column};
5172     $self->{column}++;
5173     $self->{nc}
5174     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5175     } else {
5176     $self->{set_nc}->($self);
5177     }
5178    
5179     redo A;
5180     } elsif ($self->{nc} == -1) {
         ## EOF inside the subset: report it, leave the subset, and
         ## return an END_OF_DOCTYPE_TOKEN; EOF itself is then handled
         ## in the data state.  The |redo A| after |return| is never
         ## reached.
5181     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5182 wakaba 1.13 delete $self->{in_subset};
5183 wakaba 1.12 $self->{state} = DATA_STATE;
5184     $self->{s_kwd} = '';
5185     ## Reconsume.
5186 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5187 wakaba 1.12 redo A;
5188     } else {
         ## Stray text: the error is reported only for the first such
         ## character; |internal_subset_tainted| suppresses repeats.
5189     unless ($self->{internal_subset_tainted}) {
5190     ## XML5: No parse error.
5191     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5192     $self->{internal_subset_tainted} = 1;
5193     }
5194     ## Stay in the state.
5195    
5196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5197     $self->{line_prev} = $self->{line};
5198     $self->{column_prev} = $self->{column};
5199     $self->{column}++;
5200     $self->{nc}
5201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5202     } else {
5203     $self->{set_nc}->($self);
5204     }
5205    
5206     redo A;
5207     }
5208     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
         ## After the "]" that closes the internal subset.  ">" ends
         ## the DOCTYPE normally; EOF ends it with an error; any other
         ## character is an error and is skipped, together with what
         ## follows, in BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE.
5209     if ($self->{nc} == 0x003E) { # >
5210     $self->{state} = DATA_STATE;
5211     $self->{s_kwd} = '';
5212    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5214     $self->{line_prev} = $self->{line};
5215     $self->{column_prev} = $self->{column};
5216     $self->{column}++;
5217     $self->{nc}
5218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5219     } else {
5220     $self->{set_nc}->($self);
5221     }
5222    
         ## The |redo A| after |return| is never reached.
5223 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5224 wakaba 1.12 redo A;
5225     } elsif ($self->{nc} == -1) {
5226     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5227     $self->{state} = DATA_STATE;
5228     $self->{s_kwd} = '';
5229     ## Reconsume.
5230 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5231 wakaba 1.12 redo A;
5232     } else {
5233     ## XML5: No parse error and stay in the state.
5234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5235    
5236 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5237    
5238     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5239     $self->{line_prev} = $self->{line};
5240     $self->{column_prev} = $self->{column};
5241     $self->{column}++;
5242     $self->{nc}
5243     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5244     } else {
5245     $self->{set_nc}->($self);
5246     }
5247    
5248     redo A;
5249     }
5250     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
         ## Error recovery after unexpected text following the "]" of
         ## the internal subset: characters are discarded silently
         ## until ">" or EOF, either of which ends the DOCTYPE.  No
         ## parse error is reported in this state.
5251     if ($self->{nc} == 0x003E) { # >
5252     $self->{state} = DATA_STATE;
5253     $self->{s_kwd} = '';
5254    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5256     $self->{line_prev} = $self->{line};
5257     $self->{column_prev} = $self->{column};
5258     $self->{column}++;
5259     $self->{nc}
5260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5261     } else {
5262     $self->{set_nc}->($self);
5263     }
5264    
         ## The |redo A| after |return| is never reached.
5265     return ({type => END_OF_DOCTYPE_TOKEN});
5266     redo A;
5267     } elsif ($self->{nc} == -1) {
5268     $self->{state} = DATA_STATE;
5269     $self->{s_kwd} = '';
5270     ## Reconsume.
5271     return ({type => END_OF_DOCTYPE_TOKEN});
5272     redo A;
5273     } else {
5274     ## Stay in the state.
5275    
5276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5277     $self->{line_prev} = $self->{line};
5278     $self->{column_prev} = $self->{column};
5279     $self->{column}++;
5280     $self->{nc}
5281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5282     } else {
5283     $self->{set_nc}->($self);
5284     }
5285    
5286     redo A;
5287     }
5288     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
         ## After a "<" inside the internal subset.  "!" starts a
         ## markup declaration or comment, "?" starts a processing
         ## instruction; anything else is an error.
5289     if ($self->{nc} == 0x0021) { # !
5290 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5291 wakaba 1.13
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5293     $self->{line_prev} = $self->{line};
5294     $self->{column_prev} = $self->{column};
5295     $self->{column}++;
5296     $self->{nc}
5297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5298     } else {
5299     $self->{set_nc}->($self);
5300     }
5301    
5302     redo A;
5303     } elsif ($self->{nc} == 0x003F) { # ?
5304     $self->{state} = PI_STATE;
5305    
5306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5307     $self->{line_prev} = $self->{line};
5308     $self->{column_prev} = $self->{column};
5309     $self->{column}++;
5310     $self->{nc}
5311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5312     } else {
5313     $self->{set_nc}->($self);
5314     }
5315    
5316     redo A;
5317     } elsif ($self->{nc} == -1) {
         ## NOTE(review): unlike the EOF branch of
         ## DOCTYPE_INTERNAL_SUBSET_STATE, this branch neither
         ## deletes $self->{in_subset} nor returns an
         ## END_OF_DOCTYPE_TOKEN before switching to the data state.
         ## Confirm whether that is intended.
5318     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5319     $self->{state} = DATA_STATE;
5320     $self->{s_kwd} = '';
5321     ## Reconsume.
5322     redo A;
5323     } else {
         ## Unknown construct: treat everything up to the end of the
         ## bogus comment as a comment that is thrown away.  The token
         ## carries no line/column.
5324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5325     line => $self->{line_prev},
5326     column => $self->{column_prev});
5327     $self->{state} = BOGUS_COMMENT_STATE;
5328     $self->{ct} = {type => COMMENT_TOKEN,
5329     data => '',
5330     }; ## NOTE: Will be discarded.
5331 wakaba 1.12
5332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5333     $self->{line_prev} = $self->{line};
5334     $self->{column_prev} = $self->{column};
5335     $self->{column}++;
5336     $self->{nc}
5337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5338     } else {
5339     $self->{set_nc}->($self);
5340     }
5341    
5342     redo A;
5343     }
5344 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5345     ## XML5: "DOCTYPE markup declaration state".
5346    
         ## After "<!" inside the internal subset.  The first character
         ## selects the keyword to be matched: "-" (comment), "E"
         ## (ENTITY or ELEMENT), "A" (ATTLIST) or "N" (NOTATION).  For
         ## the keyword cases $self->{kwd} is restarted with that
         ## character.  Anything else falls through to the bogus
         ## comment handling at the end of this arm.
5347     if ($self->{nc} == 0x002D) { # -
5348     $self->{state} = MD_HYPHEN_STATE;
5349    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5351     $self->{line_prev} = $self->{line};
5352     $self->{column_prev} = $self->{column};
5353     $self->{column}++;
5354     $self->{nc}
5355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5356     } else {
5357     $self->{set_nc}->($self);
5358     }
5359    
5360     redo A;
5361     } elsif ($self->{nc} == 0x0045) { # E
5362     $self->{state} = MD_E_STATE;
5363     $self->{kwd} = chr $self->{nc};
5364    
5365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5366     $self->{line_prev} = $self->{line};
5367     $self->{column_prev} = $self->{column};
5368     $self->{column}++;
5369     $self->{nc}
5370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5371     } else {
5372     $self->{set_nc}->($self);
5373     }
5374    
5375     redo A;
5376     } elsif ($self->{nc} == 0x0041) { # A
5377     $self->{state} = MD_ATTLIST_STATE;
5378     $self->{kwd} = chr $self->{nc};
5379    
5380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5381     $self->{line_prev} = $self->{line};
5382     $self->{column_prev} = $self->{column};
5383     $self->{column}++;
5384     $self->{nc}
5385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5386     } else {
5387     $self->{set_nc}->($self);
5388     }
5389    
5390     redo A;
5391     } elsif ($self->{nc} == 0x004E) { # N
5392     $self->{state} = MD_NOTATION_STATE;
5393     $self->{kwd} = chr $self->{nc};
5394    
5395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5396     $self->{line_prev} = $self->{line};
5397     $self->{column_prev} = $self->{column};
5398     $self->{column}++;
5399     $self->{nc}
5400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5401     } else {
5402     $self->{set_nc}->($self);
5403     }
5404    
5405     redo A;
5406     } else {
5407     #
5408     }
5409    
         ## Only reached from the empty |else| above: not a known
         ## declaration.  The current character is left for
         ## BOGUS_COMMENT_STATE, and the resulting comment is dropped.
5410     ## XML5: No parse error.
5411     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5412     line => $self->{line_prev},
5413     column => $self->{column_prev} - 1);
5414     ## Reconsume.
5415     $self->{state} = BOGUS_COMMENT_STATE;
5416     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5417     redo A;
5418     } elsif ($self->{state} == MD_E_STATE) {
         ## "<!E" has been read (DOCTYPE_MARKUP_DECLARATION_OPEN_STATE
         ## sets $self->{kwd} to "E" before switching here).  The
         ## second letter decides between ENTITY ("N") and ELEMENT
         ## ("L"); anything else makes the construct a bogus comment.
5419     if ($self->{nc} == 0x004E) { # N
5420     $self->{state} = MD_ENTITY_STATE;
5421     $self->{kwd} .= chr $self->{nc};
5422    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5424     $self->{line_prev} = $self->{line};
5425     $self->{column_prev} = $self->{column};
5426     $self->{column}++;
5427     $self->{nc}
5428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5429     } else {
5430     $self->{set_nc}->($self);
5431     }
5432    
5433     redo A;
5434     } elsif ($self->{nc} == 0x004C) { # L
5435     ## XML5: <!ELEMENT> not supported.
5436     $self->{state} = MD_ELEMENT_STATE;
5437     $self->{kwd} .= chr $self->{nc};
5438    
5439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5440     $self->{line_prev} = $self->{line};
5441     $self->{column_prev} = $self->{column};
5442     $self->{column}++;
5443     $self->{nc}
5444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5445     } else {
5446     $self->{set_nc}->($self);
5447     }
5448    
5449     redo A;
5450     } else {
         ## The column is moved back by two, or by one when the
         ## current character is EOF (nc == -1).
5451     ## XML5: No parse error.
5452     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5453     line => $self->{line_prev},
5454     column => $self->{column_prev} - 2
5455     + 1 * ($self->{nc} == -1));
5456     ## Reconsume.
5457     $self->{state} = BOGUS_COMMENT_STATE;
5458     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5459     redo A;
5460     }
5461     } elsif ($self->{state} == MD_ENTITY_STATE) {
         ## Matches the rest of the keyword "ENTITY" one letter at a
         ## time.  The anonymous hash maps the prefix read so far
         ## ($self->{kwd}) to the only character that may follow it;
         ## the final "Y" is handled by the |elsif| below.
         ## NOTE(review): for a prefix that is not a key (e.g. "ENTIT")
         ## the lookup yields undef, and |== undef| compares as |== 0|,
         ## so a current character of 0 would be accepted as a keyword
         ## letter.  Confirm whether nc can be 0 here.
5462     if ($self->{nc} == {
5463     'EN' => 0x0054, # T
5464     'ENT' => 0x0049, # I
5465     'ENTI' => 0x0054, # T
5466     }->{$self->{kwd}}) {
5467     ## Stay in the state.
5468     $self->{kwd} .= chr $self->{nc};
5469    
         ## Advance to the next input character (buffer first, then
         ## the |set_nc| callback).  The same block recurs below.
5470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5471     $self->{line_prev} = $self->{line};
5472     $self->{column_prev} = $self->{column};
5473     $self->{column}++;
5474     $self->{nc}
5475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5476     } else {
5477     $self->{set_nc}->($self);
5478     }
5479    
5480     redo A;
5481     } elsif ($self->{kwd} eq 'ENTIT' and
5482     $self->{nc} == 0x0059) { # Y
         ## Keyword complete: start a general entity token with an
         ## empty name and text, then continue in DOCTYPE_MD_STATE.
5483     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
5484     line => $self->{line_prev},
5485     column => $self->{column_prev} - 6};
5486     $self->{state} = DOCTYPE_MD_STATE;
5487    
5488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5489     $self->{line_prev} = $self->{line};
5490     $self->{column_prev} = $self->{column};
5491     $self->{column}++;
5492     $self->{nc}
5493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5494     } else {
5495     $self->{set_nc}->($self);
5496     }
5497    
5498     redo A;
5499     } else {
         ## Keyword mismatch: bogus comment.  The column is moved
         ## back by the keyword length plus one, adjusted by one when
         ## the current character is EOF (nc == -1).
5500     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5501     line => $self->{line_prev},
5502     column => $self->{column_prev} - 1
5503     - (length $self->{kwd})
5504     + 1 * ($self->{nc} == -1));
5505     $self->{state} = BOGUS_COMMENT_STATE;
5506     ## Reconsume.
5507     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5508     redo A;
5509     }
} elsif ($self->{state} == MD_ELEMENT_STATE) {
  ## Keyword matcher for the |<!ELEMENT| markup declaration.
  ##
  ## |$self->{kwd}| holds the keyword letters accepted so far.  Each
  ## pass through this state does exactly one of:
  ##   - accept the next expected letter and stay in this state;
  ##   - accept the final "T", create an element declaration token and
  ##     switch to |DOCTYPE_MD_STATE|; or
  ##   - report a 'bogus comment' error and reprocess the current
  ##     character in |BOGUS_COMMENT_STATE|.
  my $ch = $self->{nc};
  my $seen = $self->{kwd};
  my %next_letter = (
    'EL'    => 0x0045, # E
    'ELE'   => 0x004D, # M
    'ELEM'  => 0x0045, # E
    'ELEME' => 0x004E, # N
  );
  my $continues = ($ch == $next_letter{$seen});
  my $completes = (!$continues && $seen eq 'ELEMEN' && $ch == 0x0054); # T

  unless ($continues or $completes) {
    ## Not the keyword.  The reported column is derived from the
    ## previous character's column and the number of letters matched;
    ## one is added back when the current character is EOF (-1).
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'bogus comment',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 1
                               - (length $seen)
                               + ($ch == -1 ? 1 : 0));
    $self->{state} = BOGUS_COMMENT_STATE;
    ## Reconsume.
    $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
    redo A;
  }

  if ($continues) {
    ## Stay in the state.
    $self->{kwd} .= chr $ch;
  } else {
    ## Keyword complete.  |column_prev| is the column of the "N" that
    ## precedes the current "T"; "<!ELEMEN" is eight characters, so the
    ## opening "<" lies seven columns to the left.  (The offset used to
    ## be 6, which is only right for the six-letter |ENTITY| keyword
    ## and disagreed with the "- 1 - length kwd" formula of the error
    ## branch above.)
    $self->{ct} = {type => ELEMENT_TOKEN, name => '',
                   line => $self->{line_prev},
                   column => $self->{column_prev} - 7};
    $self->{state} = DOCTYPE_MD_STATE;
  }

  ## Consume the next input character: from the buffer when it still
  ## has data, otherwise through the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  redo A;
} elsif ($self->{state} == MD_ATTLIST_STATE) {
  ## Keyword matcher for the |<!ATTLIST| markup declaration.
  ##
  ## |$self->{kwd}| holds the keyword letters accepted so far.  Each
  ## pass through this state does exactly one of:
  ##   - accept the next expected letter and stay in this state;
  ##   - accept the final "T", create an attribute list declaration
  ##     token (with an empty |attrdefs| list) and switch to
  ##     |DOCTYPE_MD_STATE|; or
  ##   - report a 'bogus comment' error and reprocess the current
  ##     character in |BOGUS_COMMENT_STATE|.
  my $ch = $self->{nc};
  my $seen = $self->{kwd};
  my %next_letter = (
    'A'     => 0x0054, # T
    'AT'    => 0x0054, # T
    'ATT'   => 0x004C, # L
    'ATTL'  => 0x0049, # I
    'ATTLI' => 0x0053, # S
  );
  my $continues = ($ch == $next_letter{$seen});
  my $completes = (!$continues && $seen eq 'ATTLIS' && $ch == 0x0054); # T

  unless ($continues or $completes) {
    ## Not the keyword.  The reported column is derived from the
    ## previous character's column and the number of letters matched;
    ## one is added back when the current character is EOF (-1).
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'bogus comment',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 1
                               - (length $seen)
                               + ($ch == -1 ? 1 : 0));
    $self->{state} = BOGUS_COMMENT_STATE;
    ## Reconsume.
    $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
    redo A;
  }

  if ($continues) {
    ## Stay in the state.
    $self->{kwd} .= chr $ch;
  } else {
    ## Keyword complete.  |column_prev| is the column of the "S" that
    ## precedes the current "T"; "<!ATTLIS" is eight characters, so the
    ## opening "<" lies seven columns to the left.  (The offset used to
    ## be 6, which is only right for the six-letter |ENTITY| keyword
    ## and disagreed with the "- 1 - length kwd" formula of the error
    ## branch above.)
    $self->{ct} = {type => ATTLIST_TOKEN, name => '',
                   attrdefs => [],
                   line => $self->{line_prev},
                   column => $self->{column_prev} - 7};
    $self->{state} = DOCTYPE_MD_STATE;
  }

  ## Consume the next input character: from the buffer when it still
  ## has data, otherwise through the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  redo A;
} elsif ($self->{state} == MD_NOTATION_STATE) {
  ## Keyword matcher for the |<!NOTATION| markup declaration.
  ##
  ## |$self->{kwd}| holds the keyword letters accepted so far.  Each
  ## pass through this state does exactly one of:
  ##   - accept the next expected letter and stay in this state;
  ##   - accept the final "N", create a notation declaration token and
  ##     switch to |DOCTYPE_MD_STATE|; or
  ##   - report a 'bogus comment' error and reprocess the current
  ##     character in |BOGUS_COMMENT_STATE|.
  my $ch = $self->{nc};
  my $seen = $self->{kwd};
  my %next_letter = (
    'N'      => 0x004F, # O
    'NO'     => 0x0054, # T
    'NOT'    => 0x0041, # A
    'NOTA'   => 0x0054, # T
    'NOTAT'  => 0x0049, # I
    'NOTATI' => 0x004F, # O
  );
  my $continues = ($ch == $next_letter{$seen});
  my $completes = (!$continues && $seen eq 'NOTATIO' && $ch == 0x004E); # N

  unless ($continues or $completes) {
    ## Not the keyword.  The reported column is derived from the
    ## previous character's column and the number of letters matched;
    ## one is added back when the current character is EOF (-1).
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'bogus comment',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 1
                               - (length $seen)
                               + ($ch == -1 ? 1 : 0));
    $self->{state} = BOGUS_COMMENT_STATE;
    ## Reconsume.
    $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
    redo A;
  }

  if ($continues) {
    ## Stay in the state.
    $self->{kwd} .= chr $ch;
  } else {
    ## Keyword complete.  |column_prev| is the column of the "O" that
    ## precedes the current "N"; "<!NOTATIO" is nine characters, so the
    ## opening "<" lies eight columns to the left.  (The offset used to
    ## be 6, which is only right for the six-letter |ENTITY| keyword
    ## and disagreed with the "- 1 - length kwd" formula of the error
    ## branch above.)
    $self->{ct} = {type => NOTATION_TOKEN, name => '',
                   line => $self->{line_prev},
                   column => $self->{column_prev} - 8};
    $self->{state} = DOCTYPE_MD_STATE;
  }

  ## Consume the next input character: from the buffer when it still
  ## has data, otherwise through the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  redo A;
} elsif ($self->{state} == DOCTYPE_MD_STATE) {
  ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
  ## "DOCTYPE NOTATION state".
  ##
  ## Entered right after a markup declaration keyword.  White space is
  ## expected here; every other character is reported as an error.  The
  ## branch picks the error type (if any), the next state, and whether
  ## the current character is consumed or reprocessed.
  my $ch = $self->{nc};
  my ($complaint, $next_state);
  my $consume = 1;

  if ($is_space->{$ch}) {
    ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
    $next_state = BEFORE_MD_NAME_STATE;
  } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
           $ch == 0x0025) { # %
    ## XML5: Switch to the "DOCTYPE bogus comment state".
    $complaint = 'no space before md name'; ## TODO: type
    $next_state = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
  } elsif ($ch == -1) {
    $complaint = 'unclosed md'; ## TODO: type
    $next_state = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
    $consume = 0; ## Reconsume.
  } elsif ($ch == 0x003E) { # >
    ## XML5: Switch to the "DOCTYPE bogus comment state".
    $complaint = 'no md name'; ## TODO: type
    $next_state = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    ## XML5: Switch to the "DOCTYPE bogus comment state".
    $complaint = 'no space before md name'; ## TODO: type
    $next_state = BEFORE_MD_NAME_STATE;
    $consume = 0; ## Reconsume.
  }

  ## The error is reported before the state is switched.
  $self->{parse_error}->(level => $self->{level}->{must}, type => $complaint)
      if defined $complaint;
  $self->{state} = $next_state;

  if ($consume) {
    ## Consume the next input character: from the buffer when it still
    ## has data, otherwise through the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  redo A;
} elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
  ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
  ## before state", "DOCTYPE ATTLIST name before state".
  ##
  ## Skips white space in front of the declaration name.  The first
  ## other character starts the name, except "%" (general entity
  ## tokens only), ">" and EOF, which are handled separately below.
  my $ch = $self->{nc};
  my $consume = 1;

  if ($is_space->{$ch}) {
    ## Stay in the state.
  } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
           $ch == 0x0025) { # %
    $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
  } elsif ($ch == 0x003E or $ch == -1) { # > or EOF
    ## XML5: ">" is the same as "Anything else".
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => ($ch == -1 ? 'unclosed md'
                                              : 'no md name')); ## TODO: type
    ## XML5: "Data state" for EOF.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $consume = 0 if $ch == -1; ## Reconsume EOF.
  } else {
    ## XML5: [ATTLIST] Not defined yet.
    $self->{ct}->{name} .= chr $ch;
    $self->{state} = MD_NAME_STATE;
  }

  if ($consume) {
    ## Consume the next input character: from the buffer when it still
    ## has data, otherwise through the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  redo A;
} elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
  ## Entered after the "%" of an |<!ENTITY %| declaration.  White
  ## space turns the current token into a parameter entity token;
  ## anything else is an error.
  my $ch = $self->{nc};
  my $consume = 1;

  if ($is_space->{$ch}) {
    ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
    $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
    $self->{state} = BEFORE_MD_NAME_STATE;
  } elsif ($ch == 0x003E) { # >
    ## XML5: Same as "Anything else".
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no md name'); ## TODO: type
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    ## Both remaining cases reprocess the current character.
    $consume = 0; ## Reconsume.
    if ($ch == -1) {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed md');
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
    } else {
      ## XML5: No parse error.
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'no space after ENTITY percent'); ## TODO: type
      $self->{state} = BOGUS_COMMENT_STATE;
      $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
    }
  }

  if ($consume) {
    ## Consume the next input character: from the buffer when it still
    ## has data, otherwise through the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  redo A;
} elsif ($self->{state} == MD_NAME_STATE) {
  ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
  ##
  ## Accumulates the declaration name in |$self->{ct}->{name}| until
  ## white space, ">" or EOF.  On ">" and on EOF the current token is
  ## returned to the caller.
  my $ch = $self->{nc};
  my $consume = 1;
  my $emit = 0;

  if ($is_space->{$ch}) {
    ## TODO: (as written, every token type continues in the ATTLIST
    ## "name after" state)
    $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
  } elsif ($ch == 0x003E) { # >
    ## Only an ATTLIST declaration may end right after its name
    ## without an error being reported.
    unless ($self->{ct}->{type} == ATTLIST_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'no md body'); ## TODO: type
    }
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($ch == -1) {
    ## XML5: [ATTLIST] No parse error.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed md');
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
    $consume = 0; ## Reconsume.
    $emit = 1;
  } else {
    ## XML5: [ATTLIST] Not defined yet.
    $self->{ct}->{name} .= chr $ch;
    ## Stay in the state.
  }

  if ($consume) {
    ## Consume the next input character: from the buffer when it still
    ## has data, otherwise through the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # ELEMENT/ENTITY/ATTLIST/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
  ## After the declaration name: skips white space, ends the
  ## declaration on ">" or EOF (returning the current token), and
  ## otherwise begins a new attribute definition in |$self->{ca}|.
  my $ch = $self->{nc};
  my $consume = 1;
  my $emit = 0;

  if ($is_space->{$ch}) {
    ## Stay in the state.
  } elsif ($ch == 0x003E) { # >
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($ch == -1) {
    ## XML5: No parse error.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed md'); ## TODO: type
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
    $consume = 0; ## The EOF character is not consumed.
    $emit = 1;
  } else {
    ## XML5: Not defined yet.
    ## The attribute definition starts with the current character as
    ## its name and records the current line and column.
    $self->{ca} = {name => chr ($ch), # attrdef
                   tokens => [],
                   line => $self->{line}, column => $self->{column}};
    $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
  }

  if ($consume) {
    ## Consume the next input character: from the buffer when it still
    ## has data, otherwise through the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # ATTLIST
  redo A;
} elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
  ## Accumulates the attribute name in |$self->{ca}->{name}| until
  ## white space, ">", "(" or EOF.  On ">" and on EOF the current
  ## token is returned to the caller.
  ##
  ## Every branch of this state consumes the current character.
  ## NOTE(review): that includes the EOF branch, whereas the EOF
  ## branches of the preceding states reconsume -- confirm that
  ## advancing past EOF is intended.
  my $ch = $self->{nc};
  my $emit = 0;

  if ($is_space->{$ch}) {
    $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
  } elsif ($ch == 0x003E or $ch == -1) { # > or EOF
    ## XML5: ">" is the same as "anything else"; no parse error for EOF.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => ($ch == -1 ? 'unclosed md'
                                              : 'no attr type')); ## TODO: type
    ## XML5: "Data state" for EOF.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($ch == 0x0028) { # (
    ## XML5: Same as "anything else".
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no space before paren'); ## TODO: type
    $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
  } else {
    ## XML5: Not defined yet.
    $self->{ca}->{name} .= chr $ch;
    ## Stay in the state.
  }

  ## Consume the next input character: from the buffer when it still
  ## has data, otherwise through the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  return ($self->{ct}) if $emit; # ATTLIST
  redo A;
} elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
  ## After an attribute name: skips white space, then either starts
  ## the attribute type in |$self->{ca}->{type}|, starts the allowed
  ## token list on "(", or ends the declaration on ">" / EOF
  ## (returning the current token).
  ##
  ## Every branch of this state consumes the current character.
  ## NOTE(review): that includes the EOF branch, whereas the EOF
  ## branches of the earlier declaration states reconsume -- confirm
  ## that advancing past EOF is intended.
  my $ch = $self->{nc};
  my $emit = 0;

  if ($is_space->{$ch}) {
    ## Stay in the state.
  } elsif ($ch == 0x003E or $ch == -1) { # > or EOF
    ## XML5: ">" is the same as "anything else"; no parse error for EOF.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => ($ch == -1 ? 'unclosed md'
                                              : 'no attr type')); ## TODO: type
    ## XML5: "Data state" for EOF.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($ch == 0x0028) { # (
    ## XML5: Same as "anything else".
    $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
  } else {
    ## XML5: Not defined yet.
    $self->{ca}->{type} = chr $ch;
    $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
  }

  ## Consume the next input character: from the buffer when it still
  ## has data, otherwise through the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  return ($self->{ct}) if $emit; # ATTLIST
  redo A;
} elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
  ## Accumulates the attribute type in |$self->{ca}->{type}| until
  ## white space.  A "#", quotation mark or "(" directly after the
  ## type is accepted with a "no space before ..." error; ">" and EOF
  ## end the declaration and return the current token.
  ##
  ## Every branch of this state consumes the current character.
  ## NOTE(review): that includes the EOF branch, whereas the EOF
  ## branches of the earlier declaration states reconsume -- confirm
  ## that advancing past EOF is intended.
  my $ch = $self->{nc};
  my $emit = 0;

  if ($is_space->{$ch}) {
    $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
  } elsif ($ch == 0x0023 or   # #
           $ch == 0x0022 or   # "
           $ch == 0x0027) {   # '
    ## XML5: Same as "anything else".
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no space before default value'); ## TODO: type
    if ($ch == 0x0023) {
      $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
    } else {
      ## A quoted default value begins; start with an empty value.
      $self->{ca}->{value} = '';
      $self->{state} = $ch == 0x0022
          ? ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE
          : ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
    }
  } elsif ($ch == 0x003E or $ch == -1) { # > or EOF
    ## XML5: ">" is the same as "anything else"; no parse error for EOF.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => ($ch == -1 ? 'unclosed md'
                                              : 'no attr default')); ## TODO: type
    ## XML5: "Data state" for EOF.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($ch == 0x0028) { # (
    ## XML5: Same as "anything else".
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no space before paren'); ## TODO: type
    $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
  } else {
    ## XML5: Not defined yet.
    $self->{ca}->{type} .= chr $ch;
    ## Stay in the state.
  }

  ## Consume the next input character: from the buffer when it still
  ## has data, otherwise through the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  return ($self->{ct}) if $emit; # ATTLIST
  redo A;
} elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
  ## After an attribute type: skips white space, then dispatches on
  ## "(" (allowed token list), "#" (default declaration keyword) or a
  ## quotation mark (default value).  ">" and EOF end the declaration
  ## and return the current token; any other character is reprocessed
  ## as the start of an unquoted default value.
  ##
  ## NOTE(review): the EOF branch consumes the current character,
  ## whereas the EOF branches of the earlier declaration states
  ## reconsume -- confirm that advancing past EOF is intended.
  my $ch = $self->{nc};
  my $consume = 1;
  my $emit = 0;

  if ($is_space->{$ch}) {
    ## Stay in the state.
  } elsif ($ch == 0x0028) { # (
    ## XML5: Same as "anything else".
    $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
  } elsif ($ch == 0x0023) { # #
    $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
  } elsif ($ch == 0x0022 or   # "
           $ch == 0x0027) {   # '
    ## XML5: Same as "anything else".
    ## A quoted default value begins; start with an empty value.
    $self->{ca}->{value} = '';
    $self->{state} = $ch == 0x0022
        ? ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE
        : ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
  } elsif ($ch == 0x003E or $ch == -1) { # > or EOF
    ## XML5: ">" is the same as "anything else"; no parse error for EOF.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => ($ch == -1 ? 'unclosed md'
                                              : 'no attr default')); ## TODO: type
    ## XML5: "Data state" for EOF.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } else {
    ## XML5: Switch to the "DOCTYPE bogus comment state".
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unquoted attr value'); ## TODO: type
    $self->{ca}->{value} = '';
    $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
    $consume = 0; ## Reconsume.
  }

  if ($consume) {
    ## Consume the next input character: from the buffer when it still
    ## has data, otherwise through the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # ATTLIST
  redo A;
6374     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6375     if ($is_space->{$self->{nc}}) {
6376     ## Stay in the state.
6377    
6378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6379     $self->{line_prev} = $self->{line};
6380     $self->{column_prev} = $self->{column};
6381     $self->{column}++;
6382     $self->{nc}
6383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6384     } else {
6385     $self->{set_nc}->($self);
6386     }
6387    
6388     redo A;
6389     } elsif ($self->{nc} == 0x007C) { # |
6390     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6391     ## Stay in the state.
6392    
6393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6394     $self->{line_prev} = $self->{line};
6395     $self->{column_prev} = $self->{column};
6396     $self->{column}++;
6397     $self->{nc}
6398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6399     } else {
6400     $self->{set_nc}->($self);
6401     }
6402    
6403     redo A;
6404     } elsif ($self->{nc} == 0x0029) { # )
6405     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6406     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6407    
6408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6409     $self->{line_prev} = $self->{line};
6410     $self->{column_prev} = $self->{column};
6411     $self->{column}++;
6412     $self->{nc}
6413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6414     } else {
6415     $self->{set_nc}->($self);
6416     }
6417    
6418     redo A;
6419     } elsif ($self->{nc} == 0x003E) { # >
6420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6421     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6422    
6423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6424     $self->{line_prev} = $self->{line};
6425     $self->{column_prev} = $self->{column};
6426     $self->{column}++;
6427     $self->{nc}
6428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6429     } else {
6430     $self->{set_nc}->($self);
6431     }
6432    
6433     return ($self->{ct}); # ATTLIST
6434     redo A;
6435     } elsif ($self->{nc} == -1) {
6436     ## XML5: No parse error.
6437     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6438     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6439    
6440     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6441     $self->{line_prev} = $self->{line};
6442     $self->{column_prev} = $self->{column};
6443     $self->{column}++;
6444     $self->{nc}
6445     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6446     } else {
6447     $self->{set_nc}->($self);
6448     }
6449    
6450     return ($self->{ct});
6451     redo A;
6452     } else {
6453     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6454     $self->{state} = ALLOWED_TOKEN_STATE;
6455    
6456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6457     $self->{line_prev} = $self->{line};
6458     $self->{column_prev} = $self->{column};
6459     $self->{column}++;
6460     $self->{nc}
6461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6462     } else {
6463     $self->{set_nc}->($self);
6464     }
6465    
6466     redo A;
6467     }
6468     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6469     if ($is_space->{$self->{nc}}) {
6470     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6471    
6472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6473     $self->{line_prev} = $self->{line};
6474     $self->{column_prev} = $self->{column};
6475     $self->{column}++;
6476     $self->{nc}
6477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6478     } else {
6479     $self->{set_nc}->($self);
6480     }
6481    
6482     redo A;
6483     } elsif ($self->{nc} == 0x007C) { # |
6484     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6485    
6486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6487     $self->{line_prev} = $self->{line};
6488     $self->{column_prev} = $self->{column};
6489     $self->{column}++;
6490     $self->{nc}
6491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6492     } else {
6493     $self->{set_nc}->($self);
6494     }
6495    
6496     redo A;
6497     } elsif ($self->{nc} == 0x0029) { # )
6498     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6499    
6500     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6501     $self->{line_prev} = $self->{line};
6502     $self->{column_prev} = $self->{column};
6503     $self->{column}++;
6504     $self->{nc}
6505     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6506     } else {
6507     $self->{set_nc}->($self);
6508     }
6509    
6510     redo A;
6511     } elsif ($self->{nc} == 0x003E) { # >
6512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6513     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6514    
6515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6516     $self->{line_prev} = $self->{line};
6517     $self->{column_prev} = $self->{column};
6518     $self->{column}++;
6519     $self->{nc}
6520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6521     } else {
6522     $self->{set_nc}->($self);
6523     }
6524    
6525     return ($self->{ct}); # ATTLIST
6526     redo A;
6527     } elsif ($self->{nc} == -1) {
6528     ## XML5: No parse error.
6529     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6530     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6531    
6532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6533     $self->{line_prev} = $self->{line};
6534     $self->{column_prev} = $self->{column};
6535     $self->{column}++;
6536     $self->{nc}
6537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6538     } else {
6539     $self->{set_nc}->($self);
6540     }
6541    
6542     return ($self->{ct});
6543     redo A;
6544     } else {
6545     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6546     ## Stay in the state.
6547    
6548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6549     $self->{line_prev} = $self->{line};
6550     $self->{column_prev} = $self->{column};
6551     $self->{column}++;
6552     $self->{nc}
6553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6554     } else {
6555     $self->{set_nc}->($self);
6556     }
6557    
6558     redo A;
6559     }
6560     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6561     if ($is_space->{$self->{nc}}) {
6562     ## Stay in the state.
6563    
6564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6565     $self->{line_prev} = $self->{line};
6566     $self->{column_prev} = $self->{column};
6567     $self->{column}++;
6568     $self->{nc}
6569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6570     } else {
6571     $self->{set_nc}->($self);
6572     }
6573    
6574     redo A;
6575     } elsif ($self->{nc} == 0x007C) { # |
6576     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6577    
6578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6579     $self->{line_prev} = $self->{line};
6580     $self->{column_prev} = $self->{column};
6581     $self->{column}++;
6582     $self->{nc}
6583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6584     } else {
6585     $self->{set_nc}->($self);
6586     }
6587    
6588     redo A;
6589     } elsif ($self->{nc} == 0x0029) { # )
6590     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6591    
6592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6593     $self->{line_prev} = $self->{line};
6594     $self->{column_prev} = $self->{column};
6595     $self->{column}++;
6596     $self->{nc}
6597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6598     } else {
6599     $self->{set_nc}->($self);
6600     }
6601    
6602     redo A;
6603     } elsif ($self->{nc} == 0x003E) { # >
6604     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6605     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6606    
6607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6608     $self->{line_prev} = $self->{line};
6609     $self->{column_prev} = $self->{column};
6610     $self->{column}++;
6611     $self->{nc}
6612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6613     } else {
6614     $self->{set_nc}->($self);
6615     }
6616    
6617     return ($self->{ct}); # ATTLIST
6618     redo A;
6619     } elsif ($self->{nc} == -1) {
6620     ## XML5: No parse error.
6621     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6622     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6623    
6624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6625     $self->{line_prev} = $self->{line};
6626     $self->{column_prev} = $self->{column};
6627     $self->{column}++;
6628     $self->{nc}
6629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6630     } else {
6631     $self->{set_nc}->($self);
6632     }
6633    
6634     return ($self->{ct});
6635     redo A;
6636     } else {
6637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6638     line => $self->{line_prev},
6639     column => $self->{column_prev});
6640     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6641     $self->{state} = ALLOWED_TOKEN_STATE;
6642    
6643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6644     $self->{line_prev} = $self->{line};
6645     $self->{column_prev} = $self->{column};
6646     $self->{column}++;
6647     $self->{nc}
6648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6649     } else {
6650     $self->{set_nc}->($self);
6651     }
6652    
6653     redo A;
6654     }
6655     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
6656     if ($is_space->{$self->{nc}}) {
6657     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6658    
6659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6660     $self->{line_prev} = $self->{line};
6661     $self->{column_prev} = $self->{column};
6662     $self->{column}++;
6663     $self->{nc}
6664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6665     } else {
6666     $self->{set_nc}->($self);
6667     }
6668    
6669     redo A;
6670     } elsif ($self->{nc} == 0x0023) { # #
6671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6672     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6673    
6674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6675     $self->{line_prev} = $self->{line};
6676     $self->{column_prev} = $self->{column};
6677     $self->{column}++;
6678     $self->{nc}
6679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6680     } else {
6681     $self->{set_nc}->($self);
6682     }
6683    
6684     redo A;
6685     } elsif ($self->{nc} == 0x0022) { # "
6686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6687     $self->{ca}->{value} = '';
6688     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6689    
6690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6691     $self->{line_prev} = $self->{line};
6692     $self->{column_prev} = $self->{column};
6693     $self->{column}++;
6694     $self->{nc}
6695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6696     } else {
6697     $self->{set_nc}->($self);
6698     }
6699    
6700     redo A;
6701     } elsif ($self->{nc} == 0x0027) { # '
6702     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6703     $self->{ca}->{value} = '';
6704     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6705    
6706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6707     $self->{line_prev} = $self->{line};
6708     $self->{column_prev} = $self->{column};
6709     $self->{column}++;
6710     $self->{nc}
6711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6712     } else {
6713     $self->{set_nc}->($self);
6714     }
6715    
6716     redo A;
6717     } elsif ($self->{nc} == 0x003E) { # >
6718     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6719     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6720    
6721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6722     $self->{line_prev} = $self->{line};
6723     $self->{column_prev} = $self->{column};
6724     $self->{column}++;
6725     $self->{nc}
6726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6727     } else {
6728     $self->{set_nc}->($self);
6729     }
6730    
6731     return ($self->{ct}); # ATTLIST
6732     redo A;
6733     } elsif ($self->{nc} == -1) {
6734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6736    
6737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6738     $self->{line_prev} = $self->{line};
6739     $self->{column_prev} = $self->{column};
6740     $self->{column}++;
6741     $self->{nc}
6742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6743     } else {
6744     $self->{set_nc}->($self);
6745     }
6746    
6747     return ($self->{ct});
6748     redo A;
6749     } else {
6750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6751     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6752     ## Reconsume.
6753     redo A;
6754     }
6755     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
6756     if ($is_space->{$self->{nc}}) {
6757     ## Stay in the state.
6758    
6759     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760     $self->{line_prev} = $self->{line};
6761     $self->{column_prev} = $self->{column};
6762     $self->{column}++;
6763     $self->{nc}
6764     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765     } else {
6766     $self->{set_nc}->($self);
6767     }
6768    
6769     redo A;
6770     } elsif ($self->{nc} == 0x0023) { # #
6771     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6772    
6773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6774     $self->{line_prev} = $self->{line};
6775     $self->{column_prev} = $self->{column};
6776     $self->{column}++;
6777     $self->{nc}
6778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6779     } else {
6780     $self->{set_nc}->($self);
6781     }
6782    
6783     redo A;
6784     } elsif ($self->{nc} == 0x0022) { # "
6785     $self->{ca}->{value} = '';
6786     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6787    
6788     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6789     $self->{line_prev} = $self->{line};
6790     $self->{column_prev} = $self->{column};
6791     $self->{column}++;
6792     $self->{nc}
6793     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6794     } else {
6795     $self->{set_nc}->($self);
6796     }
6797    
6798     redo A;
6799     } elsif ($self->{nc} == 0x0027) { # '
6800     $self->{ca}->{value} = '';
6801     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6802    
6803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6804     $self->{line_prev} = $self->{line};
6805     $self->{column_prev} = $self->{column};
6806     $self->{column}++;
6807     $self->{nc}
6808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6809     } else {
6810     $self->{set_nc}->($self);
6811     }
6812    
6813     redo A;
6814     } elsif ($self->{nc} == 0x003E) { # >
6815     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6816     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6817    
6818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6819     $self->{line_prev} = $self->{line};
6820     $self->{column_prev} = $self->{column};
6821     $self->{column}++;
6822     $self->{nc}
6823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6824     } else {
6825     $self->{set_nc}->($self);
6826     }
6827    
6828     return ($self->{ct}); # ATTLIST
6829     redo A;
6830     } elsif ($self->{nc} == -1) {
6831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6833    
6834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6835     $self->{line_prev} = $self->{line};
6836     $self->{column_prev} = $self->{column};
6837     $self->{column}++;
6838     $self->{nc}
6839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6840     } else {
6841     $self->{set_nc}->($self);
6842     }
6843    
6844     return ($self->{ct});
6845     redo A;
6846     } else {
6847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6848     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6849     ## Reconsume.
6850     redo A;
6851     }
6852     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
6853     if ($is_space->{$self->{nc}}) {
6854     ## XML5: No parse error.
6855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
6856     $self->{state} = BOGUS_COMMENT_STATE;
6857     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6858     ## Reconsume.
6859     redo A;
6860     } elsif ($self->{nc} == 0x0022) { # "
6861     ## XML5: Same as "anything else".
6862     $self->{ca}->{value} = '';
6863     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6864    
6865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6866     $self->{line_prev} = $self->{line};
6867     $self->{column_prev} = $self->{column};
6868     $self->{column}++;
6869     $self->{nc}
6870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6871     } else {
6872     $self->{set_nc}->($self);
6873     }
6874    
6875     redo A;
6876     } elsif ($self->{nc} == 0x0027) { # '
6877     ## XML5: Same as "anything else".
6878     $self->{ca}->{value} = '';
6879     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6880    
6881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6882     $self->{line_prev} = $self->{line};
6883     $self->{column_prev} = $self->{column};
6884     $self->{column}++;
6885     $self->{nc}
6886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6887     } else {
6888     $self->{set_nc}->($self);
6889     }
6890    
6891     redo A;
6892     } elsif ($self->{nc} == 0x003E) { # >
6893     ## XML5: Same as "anything else".
6894     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6895     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6896    
6897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6898     $self->{line_prev} = $self->{line};
6899     $self->{column_prev} = $self->{column};
6900     $self->{column}++;
6901     $self->{nc}
6902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6903     } else {
6904     $self->{set_nc}->($self);
6905     }
6906    
6907     return ($self->{ct}); # ATTLIST
6908     redo A;
6909     } elsif ($self->{nc} == -1) {
6910     ## XML5: No parse error.
6911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6912     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6913    
6914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6915     $self->{line_prev} = $self->{line};
6916     $self->{column_prev} = $self->{column};
6917     $self->{column}++;
6918     $self->{nc}
6919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6920     } else {
6921     $self->{set_nc}->($self);
6922     }
6923    
6924     return ($self->{ct});
6925     redo A;
6926     } else {
6927     $self->{ca}->{default} = chr $self->{nc};
6928     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
6929    
6930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6931     $self->{line_prev} = $self->{line};
6932     $self->{column_prev} = $self->{column};
6933     $self->{column}++;
6934     $self->{nc}
6935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6936     } else {
6937     $self->{set_nc}->($self);
6938     }
6939    
6940     redo A;
6941     }
6942     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
6943     if ($is_space->{$self->{nc}}) {
6944     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
6945    
6946     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6947     $self->{line_prev} = $self->{line};
6948     $self->{column_prev} = $self->{column};
6949     $self->{column}++;
6950     $self->{nc}
6951     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6952     } else {
6953     $self->{set_nc}->($self);
6954     }
6955    
6956     redo A;
6957     } elsif ($self->{nc} == 0x0022) { # "
6958     ## XML5: Same as "anything else".
6959     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6960     $self->{ca}->{value} = '';
6961     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6962    
6963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6964     $self->{line_prev} = $self->{line};
6965     $self->{column_prev} = $self->{column};
6966     $self->{column}++;
6967     $self->{nc}
6968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6969     } else {
6970     $self->{set_nc}->($self);
6971     }
6972    
6973     redo A;
6974     } elsif ($self->{nc} == 0x0027) { # '
6975     ## XML5: Same as "anything else".
6976     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6977     $self->{ca}->{value} = '';
6978     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6979    
6980     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6981     $self->{line_prev} = $self->{line};
6982     $self->{column_prev} = $self->{column};
6983     $self->{column}++;
6984     $self->{nc}
6985     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6986     } else {
6987     $self->{set_nc}->($self);
6988     }
6989    
6990     redo A;
6991     } elsif ($self->{nc} == 0x003E) { # >
6992     ## XML5: Same as "anything else".
6993     push @{$self->{ct}->{attrdefs}}, $self->{ca};
6994     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6995    
6996     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6997     $self->{line_prev} = $self->{line};
6998     $self->{column_prev} = $self->{column};
6999     $self->{column}++;
7000     $self->{nc}
7001     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7002     } else {
7003     $self->{set_nc}->($self);
7004     }
7005    
7006     return ($self->{ct}); # ATTLIST
7007     redo A;
7008     } elsif ($self->{nc} == -1) {
7009     ## XML5: No parse error.
7010     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7011     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7012     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7013    
7014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7015     $self->{line_prev} = $self->{line};
7016     $self->{column_prev} = $self->{column};
7017     $self->{column}++;
7018     $self->{nc}
7019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7020     } else {
7021     $self->{set_nc}->($self);
7022     }
7023    
7024     return ($self->{ct});
7025     redo A;
7026     } else {
7027     $self->{ca}->{default} .= chr $self->{nc};
7028     ## Stay in the state.
7029    
7030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7031     $self->{line_prev} = $self->{line};
7032     $self->{column_prev} = $self->{column};
7033     $self->{column}++;
7034     $self->{nc}
7035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7036     } else {
7037     $self->{set_nc}->($self);
7038     }
7039    
7040     redo A;
7041     }
7042     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7043     if ($is_space->{$self->{nc}}) {
7044     ## Stay in the state.
7045    
7046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7047     $self->{line_prev} = $self->{line};
7048     $self->{column_prev} = $self->{column};
7049     $self->{column}++;
7050     $self->{nc}
7051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7052     } else {
7053     $self->{set_nc}->($self);
7054     }
7055    
7056     redo A;
7057     } elsif ($self->{nc} == 0x0022) { # "
7058     $self->{ca}->{value} = '';
7059     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7060    
7061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7062     $self->{line_prev} = $self->{line};
7063     $self->{column_prev} = $self->{column};
7064     $self->{column}++;
7065     $self->{nc}
7066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7067     } else {
7068     $self->{set_nc}->($self);
7069     }
7070    
7071     redo A;
7072     } elsif ($self->{nc} == 0x0027) { # '
7073     $self->{ca}->{value} = '';
7074     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7075    
7076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077     $self->{line_prev} = $self->{line};
7078     $self->{column_prev} = $self->{column};
7079     $self->{column}++;
7080     $self->{nc}
7081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082     } else {
7083     $self->{set_nc}->($self);
7084     }
7085    
7086     redo A;
7087     } elsif ($self->{nc} == 0x003E) { # >
7088     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7089     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7090    
7091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7092     $self->{line_prev} = $self->{line};
7093     $self->{column_prev} = $self->{column};
7094     $self->{column}++;
7095     $self->{nc}
7096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7097     } else {
7098     $self->{set_nc}->($self);
7099     }
7100    
7101     return ($self->{ct}); # ATTLIST
7102     redo A;
7103     } elsif ($self->{nc} == -1) {
7104     ## XML5: No parse error.
7105     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7106     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7107     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7108    
7109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7110     $self->{line_prev} = $self->{line};
7111     $self->{column_prev} = $self->{column};
7112     $self->{column}++;
7113     $self->{nc}
7114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7115     } else {
7116     $self->{set_nc}->($self);
7117     }
7118    
7119     return ($self->{ct});
7120     redo A;
7121     } else {
7122     ## XML5: Not defined yet.
7123     if ($self->{ca}->{default} eq 'FIXED') {
7124     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7125     } else {
7126     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7127     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7128     }
7129     ## Reconsume.
7130     redo A;
7131     }
7132     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7133     if ($is_space->{$self->{nc}} or
7134     $self->{nc} == -1 or
7135     $self->{nc} == 0x003E) { # >
7136     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7137     ## Reconsume.
7138     redo A;
7139     } else {
7140     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7141     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7142     ## Reconsume.
7143     redo A;
7144     }
7145 wakaba 1.1 } else {
7146     die "$0: $self->{state}: Unknown state";
7147     }
7148     } # A
7149    
7150     die "$0: _get_next_token: unexpected case";
7151     } # _get_next_token
7152    
7153     1;
7154 wakaba 1.15 ## $Date: 2008/10/17 07:14:29 $
7155    

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24