/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.30 - (hide annotations) (download)
Sun Aug 16 05:24:47 2009 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.29: +13 -6 lines
++ whatpm/t/ChangeLog	16 Aug 2009 05:21:53 -0000
	* tokenizer-test-1.test: "<" in attribute names are now parse
	errors (HTML5 revision 3354).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	16 Aug 2009 05:23:17 -0000
	* Tokenizer.pm.src: Any "<" character in an attribute name is now a
	parse error (HTML5 revision 3354).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;

## Module version, derived from the digits of the CVS |Revision| keyword
## ("major.minor" with a two-digit minor part).
our $VERSION = do {
  my @rev = (q$Revision: 1.29 $ =~ /\d+/g);
  sprintf "%d." . "%02d" x $#rev, @rev;
};

## Exporter setup.  Every token type constant can be imported by name,
## or all of them at once by means of the |:token| tag.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## The list of token type constants is declared only once and shared
  ## by |@EXPORT_OK| and the |:token| tag.
  my @token_types = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_types;

  our %EXPORT_TAGS = (
    token => [@token_types],
  );
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric codes stored in the |type| field of a token.  Each name is a
## compile-time constant taking no arguments.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose |$token->{tag_name}| is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
## Content model flags
##
## Each flag is a single bit, so a content model is the bitwise OR of
## the kinds of markup that are recognized in data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the flags above.  They
## are declared by a separate statement since the flags have to exist
## before they can be combined.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87    
## Tokenizer states
##
## Each name is a compile-time constant holding the number by which the
## state is identified in |$self->{state}|.  The numbers 1 and 12 are
## unused; the states they once identified are kept below as comments.

use constant {
  DATA_STATE => 0,
  #ENTITY_DATA_STATE => 1,
  TAG_OPEN_STATE => 2,
  CLOSE_TAG_OPEN_STATE => 3,
  TAG_NAME_STATE => 4,
  BEFORE_ATTRIBUTE_NAME_STATE => 5,
  ATTRIBUTE_NAME_STATE => 6,
  AFTER_ATTRIBUTE_NAME_STATE => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE => 12,
  MARKUP_DECLARATION_OPEN_STATE => 13,
  COMMENT_START_STATE => 14,
  COMMENT_START_DASH_STATE => 15,
  COMMENT_STATE => 16,
  COMMENT_END_STATE => 17,
  COMMENT_END_DASH_STATE => 18,
  BOGUS_COMMENT_STATE => 19,
  DOCTYPE_STATE => 20,
  BEFORE_DOCTYPE_NAME_STATE => 21,
  DOCTYPE_NAME_STATE => 22,
  AFTER_DOCTYPE_NAME_STATE => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 31,
  BOGUS_DOCTYPE_STATE => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => 33,
  SELF_CLOSING_START_TAG_STATE => 34,
  CDATA_SECTION_STATE => 35,
  MD_HYPHEN_STATE => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE => 41, # "CDATA section state" in the spec
  PUBLIC_STATE => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE => 43, # "after DOCTYPE name state" in the spec

  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE => 46,
  HEXREF_X_STATE => 47,
  HEXREF_HEX_STATE => 48,
  ENTITY_NAME_STATE => 49,

  PCDATA_STATE => 50, # "data state" in the spec

  ## XML-only states
  PI_STATE => 51,
  PI_TARGET_STATE => 52,
  PI_TARGET_AFTER_STATE => 53,
  PI_DATA_STATE => 54,
  PI_AFTER_STATE => 55,
  PI_DATA_AFTER_STATE => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE => 59,
  DOCTYPE_TAG_STATE => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE => 61,
  MD_ATTLIST_STATE => 62,
  MD_E_STATE => 63,
  MD_ELEMENT_STATE => 64,
  MD_ENTITY_STATE => 65,
  MD_NOTATION_STATE => 66,
  DOCTYPE_MD_STATE => 67,
  BEFORE_MD_NAME_STATE => 68,
  MD_NAME_STATE => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE => 75,
  BEFORE_ALLOWED_TOKEN_STATE => 76,
  ALLOWED_TOKEN_STATE => 77,
  AFTER_ALLOWED_TOKEN_STATE => 78,
  AFTER_ALLOWED_TOKENS_STATE => 79,
  BEFORE_ATTR_DEFAULT_STATE => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE => 84,
  BEFORE_NDATA_STATE => 85,
  NDATA_STATE => 86,
  AFTER_NDATA_STATE => 87,
  BEFORE_NOTATION_NAME_STATE => 88,
  NOTATION_NAME_STATE => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE => 91,
  ENTITY_VALUE_ENTITY_STATE => 92,
  AFTER_ELEMENT_NAME_STATE => 93,
  BEFORE_ELEMENT_CONTENT_STATE => 94,
  CONTENT_KEYWORD_STATE => 95,
  AFTER_CM_GROUP_OPEN_STATE => 96,
  CM_ELEMENT_NAME_STATE => 97,
  AFTER_CM_ELEMENT_NAME_STATE => 98,
  AFTER_CM_GROUP_CLOSE_STATE => 99,
  AFTER_MD_DEF_STATE => 100,
  BOGUS_MD_STATE => 101,
};
197 wakaba 1.8
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: Both constants have the same numeric value (bit 11).

use constant IN_FOREIGN_CONTENT_IM => 0b100000000000;
use constant FOREIGN_EL            => 0b1_00000000000;
203    
## Character reference mappings
##
## |$charref_map| is a reference to a hash whose keys are code points
## and whose values are the code points to be used in their place.
## NOTE(review): presumably consulted when a numeric character
## reference is resolved; the code using the map is outside this part
## of the file.

my $charref_map = do {
  ## U+000D is mapped to U+000A.
  my %map = (0x0D => 0x000A);

  ## Replacements for 0x80..0x9F, listed in code point order.
  @map{0x80 .. 0x9F} = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );

  ## Code points that are replaced by U+FFFD.
  my @to_replace = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF,
    0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
    ## The last two code points of each of the planes 0x00..0x10,
    ## i.e. 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF.
    (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00 .. 0x10),
  );
  @map{@to_replace} = (0xFFFD) x @to_replace;

  \%map;
}; # $charref_map
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Resets the tokenizer fields of |$self| to their initial values and
## asks the |set_nc| callback for the first input character.
##
## Argument:
##   $self - the object (a hash reference) holding the tokenizer fields
##
## Fields that have to be set by the |new| constructor beforehand:
##   $self->{level}
##   $self->{set_nc}
##   $self->{parse_error}
##   $self->{is_xml} (if XML)
##
## Fields that are not touched here since they are initialized when
## they are used:
##   $self->{kwd} (state-dependent keyword), $self->{entity__value},
##   $self->{entity__match}, $self->{prev_state}, $self->{next_nc},
##   $self->{escape}
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character

  ## Fetch the first input character.  The character buffer has just
  ## been emptied, so the "take the next character from
  ## |$self->{char_buffer}|" branch that accompanies every other
  ## |set_nc| call in this file could never be taken here; the callback
  ## is therefore invoked unconditionally.
  ## NOTE(review): |set_nc| presumably stores the next input character
  ## in |$self->{nc}|; it is defined outside this file.
  $self->{set_nc}->($self);

  ## Queue of tokens waiting to be returned by |_get_next_token|.
  $self->{token} = [];
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
312     ## while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## |$is_space| is a reference to a hash that has a true value for every
## code point regarded as a space character by the tokenizer.
## U+000B LINE TABULATION (VT) and U+000D CARRIAGE RETURN (CR) are
## intentionally not included.
my $is_space = {
  map { ($_ => 1) }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
};
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not explicitly
926     ## state that the next input character is to be reconsumed,
927     ## it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251 wakaba 1.30 0x003C => 1, # <
1252 wakaba 1.1 0x003D => 1, # =
1253     }->{$self->{nc}}) {
1254    
1255 wakaba 1.11 ## XML5: Not a parse error.
1256 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1257     } else {
1258    
1259 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1260 wakaba 1.1 }
1261     $self->{ca}
1262     = {name => chr ($self->{nc}),
1263     value => '',
1264     line => $self->{line}, column => $self->{column}};
1265     $self->{state} = ATTRIBUTE_NAME_STATE;
1266    
1267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1268     $self->{line_prev} = $self->{line};
1269     $self->{column_prev} = $self->{column};
1270     $self->{column}++;
1271     $self->{nc}
1272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1273     } else {
1274     $self->{set_nc}->($self);
1275     }
1276    
1277     redo A;
1278     }
1279     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1280 wakaba 1.11 ## XML5: "Tag attribute name state".
1281    
1282 wakaba 1.1 my $before_leave = sub {
1283     if (exists $self->{ct}->{attributes} # start tag or end tag
1284     ->{$self->{ca}->{name}}) { # MUST
1285    
1286     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1287     ## Discard $self->{ca} # MUST
1288     } else {
1289    
1290     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1291     = $self->{ca};
1292 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1293 wakaba 1.1 }
1294     }; # $before_leave
1295    
1296     if ($is_space->{$self->{nc}}) {
1297    
1298     $before_leave->();
1299     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1300    
1301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1302     $self->{line_prev} = $self->{line};
1303     $self->{column_prev} = $self->{column};
1304     $self->{column}++;
1305     $self->{nc}
1306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1307     } else {
1308     $self->{set_nc}->($self);
1309     }
1310    
1311     redo A;
1312     } elsif ($self->{nc} == 0x003D) { # =
1313    
1314     $before_leave->();
1315     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1316    
1317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1318     $self->{line_prev} = $self->{line};
1319     $self->{column_prev} = $self->{column};
1320     $self->{column}++;
1321     $self->{nc}
1322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1323     } else {
1324     $self->{set_nc}->($self);
1325     }
1326    
1327     redo A;
1328     } elsif ($self->{nc} == 0x003E) { # >
1329 wakaba 1.11 if ($self->{is_xml}) {
1330    
1331     ## XML5: Not a parse error.
1332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1333     } else {
1334    
1335     }
1336    
1337 wakaba 1.1 $before_leave->();
1338     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1339    
1340     $self->{last_stag_name} = $self->{ct}->{tag_name};
1341     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1342    
1343     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1344     if ($self->{ct}->{attributes}) {
1345     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1346     }
1347     } else {
1348     die "$0: $self->{ct}->{type}: Unknown token type";
1349     }
1350     $self->{state} = DATA_STATE;
1351 wakaba 1.5 $self->{s_kwd} = '';
1352 wakaba 1.1
1353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1354     $self->{line_prev} = $self->{line};
1355     $self->{column_prev} = $self->{column};
1356     $self->{column}++;
1357     $self->{nc}
1358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1359     } else {
1360     $self->{set_nc}->($self);
1361     }
1362    
1363    
1364     return ($self->{ct}); # start tag or end tag
1365    
1366     redo A;
1367     } elsif (0x0041 <= $self->{nc} and
1368     $self->{nc} <= 0x005A) { # A..Z
1369    
1370 wakaba 1.4 $self->{ca}->{name}
1371     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1372 wakaba 1.1 ## Stay in the state
1373    
1374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1375     $self->{line_prev} = $self->{line};
1376     $self->{column_prev} = $self->{column};
1377     $self->{column}++;
1378     $self->{nc}
1379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1380     } else {
1381     $self->{set_nc}->($self);
1382     }
1383    
1384     redo A;
1385     } elsif ($self->{nc} == 0x002F) { # /
1386 wakaba 1.11 if ($self->{is_xml}) {
1387    
1388     ## XML5: Not a parse error.
1389     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1390     } else {
1391    
1392     }
1393 wakaba 1.1
1394     $before_leave->();
1395     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1396    
1397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1398     $self->{line_prev} = $self->{line};
1399     $self->{column_prev} = $self->{column};
1400     $self->{column}++;
1401     $self->{nc}
1402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1403     } else {
1404     $self->{set_nc}->($self);
1405     }
1406    
1407     redo A;
1408     } elsif ($self->{nc} == -1) {
1409     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1410     $before_leave->();
1411     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1412    
1413     $self->{last_stag_name} = $self->{ct}->{tag_name};
1414     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1415     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1416     if ($self->{ct}->{attributes}) {
1417    
1418     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1419     } else {
1420     ## NOTE: This state should never be reached.
1421    
1422     }
1423     } else {
1424     die "$0: $self->{ct}->{type}: Unknown token type";
1425     }
1426     $self->{state} = DATA_STATE;
1427 wakaba 1.5 $self->{s_kwd} = '';
1428 wakaba 1.1 # reconsume
1429    
1430     return ($self->{ct}); # start tag or end tag
1431    
1432     redo A;
1433     } else {
1434 wakaba 1.30 if ({
1435     0x0022 => 1, # "
1436     0x0027 => 1, # '
1437     0x003C => 1, # <
1438     }->{$self->{nc}}) {
1439 wakaba 1.1
1440 wakaba 1.11 ## XML5: Not a parse error.
1441 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1442     } else {
1443    
1444     }
1445     $self->{ca}->{name} .= chr ($self->{nc});
1446     ## Stay in the state
1447    
1448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1449     $self->{line_prev} = $self->{line};
1450     $self->{column_prev} = $self->{column};
1451     $self->{column}++;
1452     $self->{nc}
1453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1454     } else {
1455     $self->{set_nc}->($self);
1456     }
1457    
1458     redo A;
1459     }
1460     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1461 wakaba 1.11 ## XML5: "Tag attribute name after state".
1462    
1463 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1464    
1465     ## Stay in the state
1466    
1467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1468     $self->{line_prev} = $self->{line};
1469     $self->{column_prev} = $self->{column};
1470     $self->{column}++;
1471     $self->{nc}
1472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1473     } else {
1474     $self->{set_nc}->($self);
1475     }
1476    
1477     redo A;
1478     } elsif ($self->{nc} == 0x003D) { # =
1479    
1480     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1481    
1482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1483     $self->{line_prev} = $self->{line};
1484     $self->{column_prev} = $self->{column};
1485     $self->{column}++;
1486     $self->{nc}
1487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1488     } else {
1489     $self->{set_nc}->($self);
1490     }
1491    
1492     redo A;
1493     } elsif ($self->{nc} == 0x003E) { # >
1494 wakaba 1.11 if ($self->{is_xml}) {
1495    
1496     ## XML5: Not a parse error.
1497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1498     } else {
1499    
1500     }
1501    
1502 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1503    
1504     $self->{last_stag_name} = $self->{ct}->{tag_name};
1505     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1506     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1507     if ($self->{ct}->{attributes}) {
1508    
1509     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1510     } else {
1511     ## NOTE: This state should never be reached.
1512    
1513     }
1514     } else {
1515     die "$0: $self->{ct}->{type}: Unknown token type";
1516     }
1517     $self->{state} = DATA_STATE;
1518 wakaba 1.5 $self->{s_kwd} = '';
1519 wakaba 1.1
1520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1521     $self->{line_prev} = $self->{line};
1522     $self->{column_prev} = $self->{column};
1523     $self->{column}++;
1524     $self->{nc}
1525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1526     } else {
1527     $self->{set_nc}->($self);
1528     }
1529    
1530    
1531     return ($self->{ct}); # start tag or end tag
1532    
1533     redo A;
1534     } elsif (0x0041 <= $self->{nc} and
1535     $self->{nc} <= 0x005A) { # A..Z
1536    
1537     $self->{ca}
1538 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1539 wakaba 1.1 value => '',
1540     line => $self->{line}, column => $self->{column}};
1541     $self->{state} = ATTRIBUTE_NAME_STATE;
1542    
1543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1544     $self->{line_prev} = $self->{line};
1545     $self->{column_prev} = $self->{column};
1546     $self->{column}++;
1547     $self->{nc}
1548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1549     } else {
1550     $self->{set_nc}->($self);
1551     }
1552    
1553     redo A;
1554     } elsif ($self->{nc} == 0x002F) { # /
1555 wakaba 1.11 if ($self->{is_xml}) {
1556    
1557     ## XML5: Not a parse error.
1558     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1559     } else {
1560    
1561     }
1562 wakaba 1.1
1563     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1564    
1565     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1566     $self->{line_prev} = $self->{line};
1567     $self->{column_prev} = $self->{column};
1568     $self->{column}++;
1569     $self->{nc}
1570     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1571     } else {
1572     $self->{set_nc}->($self);
1573     }
1574    
1575     redo A;
1576     } elsif ($self->{nc} == -1) {
1577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1578     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1579    
1580     $self->{last_stag_name} = $self->{ct}->{tag_name};
1581     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1582     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1583     if ($self->{ct}->{attributes}) {
1584    
1585     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1586     } else {
1587     ## NOTE: This state should never be reached.
1588    
1589     }
1590     } else {
1591     die "$0: $self->{ct}->{type}: Unknown token type";
1592     }
1593 wakaba 1.5 $self->{s_kwd} = '';
1594 wakaba 1.1 $self->{state} = DATA_STATE;
1595     # reconsume
1596    
1597     return ($self->{ct}); # start tag or end tag
1598    
1599     redo A;
1600     } else {
1601 wakaba 1.11 if ($self->{is_xml}) {
1602    
1603     ## XML5: Not a parse error.
1604     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1605     } else {
1606    
1607     }
1608    
1609 wakaba 1.30 if ({
1610     0x0022 => 1, # "
1611     0x0027 => 1, # '
1612     0x003C => 1, # <
1613     }->{$self->{nc}}) {
1614 wakaba 1.1
1615 wakaba 1.11 ## XML5: Not a parse error.
1616 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1617     } else {
1618    
1619     }
1620     $self->{ca}
1621     = {name => chr ($self->{nc}),
1622     value => '',
1623     line => $self->{line}, column => $self->{column}};
1624     $self->{state} = ATTRIBUTE_NAME_STATE;
1625    
1626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1627     $self->{line_prev} = $self->{line};
1628     $self->{column_prev} = $self->{column};
1629     $self->{column}++;
1630     $self->{nc}
1631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1632     } else {
1633     $self->{set_nc}->($self);
1634     }
1635    
1636     redo A;
1637     }
1638     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1639 wakaba 1.11 ## XML5: "Tag attribute value before state".
1640    
1641 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1642    
1643     ## Stay in the state
1644    
1645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1646     $self->{line_prev} = $self->{line};
1647     $self->{column_prev} = $self->{column};
1648     $self->{column}++;
1649     $self->{nc}
1650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1651     } else {
1652     $self->{set_nc}->($self);
1653     }
1654    
1655     redo A;
1656     } elsif ($self->{nc} == 0x0022) { # "
1657    
1658     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1659    
1660     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1661     $self->{line_prev} = $self->{line};
1662     $self->{column_prev} = $self->{column};
1663     $self->{column}++;
1664     $self->{nc}
1665     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1666     } else {
1667     $self->{set_nc}->($self);
1668     }
1669    
1670     redo A;
1671     } elsif ($self->{nc} == 0x0026) { # &
1672    
1673     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1674     ## reconsume
1675     redo A;
1676     } elsif ($self->{nc} == 0x0027) { # '
1677    
1678     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1679    
1680     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1681     $self->{line_prev} = $self->{line};
1682     $self->{column_prev} = $self->{column};
1683     $self->{column}++;
1684     $self->{nc}
1685     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1686     } else {
1687     $self->{set_nc}->($self);
1688     }
1689    
1690     redo A;
1691     } elsif ($self->{nc} == 0x003E) { # >
1692     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1693     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1694    
1695     $self->{last_stag_name} = $self->{ct}->{tag_name};
1696     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1697     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1698     if ($self->{ct}->{attributes}) {
1699    
1700     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1701     } else {
1702     ## NOTE: This state should never be reached.
1703    
1704     }
1705     } else {
1706     die "$0: $self->{ct}->{type}: Unknown token type";
1707     }
1708     $self->{state} = DATA_STATE;
1709 wakaba 1.5 $self->{s_kwd} = '';
1710 wakaba 1.1
1711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1712     $self->{line_prev} = $self->{line};
1713     $self->{column_prev} = $self->{column};
1714     $self->{column}++;
1715     $self->{nc}
1716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1717     } else {
1718     $self->{set_nc}->($self);
1719     }
1720    
1721    
1722     return ($self->{ct}); # start tag or end tag
1723    
1724     redo A;
1725     } elsif ($self->{nc} == -1) {
1726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1727     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1728    
1729     $self->{last_stag_name} = $self->{ct}->{tag_name};
1730     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1731     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1732     if ($self->{ct}->{attributes}) {
1733    
1734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1735     } else {
1736     ## NOTE: This state should never be reached.
1737    
1738     }
1739     } else {
1740     die "$0: $self->{ct}->{type}: Unknown token type";
1741     }
1742     $self->{state} = DATA_STATE;
1743 wakaba 1.5 $self->{s_kwd} = '';
1744 wakaba 1.1 ## reconsume
1745    
1746     return ($self->{ct}); # start tag or end tag
1747    
1748     redo A;
1749     } else {
1750 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1751 wakaba 1.1
1752 wakaba 1.11 ## XML5: Not a parse error.
1753 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1754 wakaba 1.11 } elsif ($self->{is_xml}) {
1755    
1756     ## XML5: No parse error.
1757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1758 wakaba 1.1 } else {
1759    
1760     }
1761     $self->{ca}->{value} .= chr ($self->{nc});
1762     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1763    
1764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1765     $self->{line_prev} = $self->{line};
1766     $self->{column_prev} = $self->{column};
1767     $self->{column}++;
1768     $self->{nc}
1769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1770     } else {
1771     $self->{set_nc}->($self);
1772     }
1773    
1774     redo A;
1775     }
1776     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1777 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1778     ## ATTLIST attribute value double quoted state".
1779 wakaba 1.11
1780 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1781 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1782    
1783     ## XML5: "DOCTYPE ATTLIST name after state".
1784     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1785     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1786     } else {
1787    
1788     ## XML5: "Tag attribute name before state".
1789     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1790     }
1791 wakaba 1.1
1792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1793     $self->{line_prev} = $self->{line};
1794     $self->{column_prev} = $self->{column};
1795     $self->{column}++;
1796     $self->{nc}
1797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1798     } else {
1799     $self->{set_nc}->($self);
1800     }
1801    
1802     redo A;
1803     } elsif ($self->{nc} == 0x0026) { # &
1804    
1805 wakaba 1.11 ## XML5: Not defined yet.
1806    
1807 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1808     ## "entity in attribute value state". In this implementation, the
1809     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1810     ## implementation of the "consume a character reference" algorithm.
1811     $self->{prev_state} = $self->{state};
1812     $self->{entity_add} = 0x0022; # "
1813     $self->{state} = ENTITY_STATE;
1814    
1815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1816     $self->{line_prev} = $self->{line};
1817     $self->{column_prev} = $self->{column};
1818     $self->{column}++;
1819     $self->{nc}
1820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1821     } else {
1822     $self->{set_nc}->($self);
1823     }
1824    
1825     redo A;
1826 wakaba 1.25 } elsif ($self->{is_xml} and
1827     $is_space->{$self->{nc}}) {
1828    
1829     $self->{ca}->{value} .= ' ';
1830     ## Stay in the state.
1831    
1832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1833     $self->{line_prev} = $self->{line};
1834     $self->{column_prev} = $self->{column};
1835     $self->{column}++;
1836     $self->{nc}
1837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1838     } else {
1839     $self->{set_nc}->($self);
1840     }
1841    
1842     redo A;
1843 wakaba 1.1 } elsif ($self->{nc} == -1) {
1844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1845     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1846    
1847     $self->{last_stag_name} = $self->{ct}->{tag_name};
1848 wakaba 1.15
1849     $self->{state} = DATA_STATE;
1850     $self->{s_kwd} = '';
1851     ## reconsume
1852     return ($self->{ct}); # start tag
1853     redo A;
1854 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1855     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1856     if ($self->{ct}->{attributes}) {
1857    
1858     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1859     } else {
1860     ## NOTE: This state should never be reached.
1861    
1862     }
1863 wakaba 1.15
1864     $self->{state} = DATA_STATE;
1865     $self->{s_kwd} = '';
1866     ## reconsume
1867     return ($self->{ct}); # end tag
1868     redo A;
1869     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1870     ## XML5: No parse error above; not defined yet.
1871     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1872     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1873     ## Reconsume.
1874     return ($self->{ct}); # ATTLIST
1875     redo A;
1876 wakaba 1.1 } else {
1877     die "$0: $self->{ct}->{type}: Unknown token type";
1878     }
1879     } else {
1880 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1881 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1882    
1883     ## XML5: Not a parse error.
1884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1885     } else {
1886    
1887     }
1888 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1889     $self->{read_until}->($self->{ca}->{value},
1890 wakaba 1.25 qq["&<\x09\x0C\x20],
1891 wakaba 1.1 length $self->{ca}->{value});
1892    
1893     ## Stay in the state
1894    
1895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1896     $self->{line_prev} = $self->{line};
1897     $self->{column_prev} = $self->{column};
1898     $self->{column}++;
1899     $self->{nc}
1900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1901     } else {
1902     $self->{set_nc}->($self);
1903     }
1904    
1905     redo A;
1906     }
1907     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1908 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1909     ## ATTLIST attribute value single quoted state".
1910 wakaba 1.11
1911 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1912 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1913    
1914     ## XML5: "DOCTYPE ATTLIST name after state".
1915     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1916     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1917     } else {
1918    
1919     ## XML5: "Before attribute name state" (sic).
1920     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1921     }
1922 wakaba 1.1
1923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1924     $self->{line_prev} = $self->{line};
1925     $self->{column_prev} = $self->{column};
1926     $self->{column}++;
1927     $self->{nc}
1928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1929     } else {
1930     $self->{set_nc}->($self);
1931     }
1932    
1933     redo A;
1934     } elsif ($self->{nc} == 0x0026) { # &
1935    
1936 wakaba 1.11 ## XML5: Not defined yet.
1937    
1938 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1939     ## "entity in attribute value state". In this implementation, the
1940     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1941     ## implementation of the "consume a character reference" algorithm.
1942     $self->{entity_add} = 0x0027; # '
1943     $self->{prev_state} = $self->{state};
1944     $self->{state} = ENTITY_STATE;
1945    
1946     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1947     $self->{line_prev} = $self->{line};
1948     $self->{column_prev} = $self->{column};
1949     $self->{column}++;
1950     $self->{nc}
1951     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1952     } else {
1953     $self->{set_nc}->($self);
1954     }
1955    
1956     redo A;
1957 wakaba 1.25 } elsif ($self->{is_xml} and
1958     $is_space->{$self->{nc}}) {
1959    
1960     $self->{ca}->{value} .= ' ';
1961     ## Stay in the state.
1962    
1963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964     $self->{line_prev} = $self->{line};
1965     $self->{column_prev} = $self->{column};
1966     $self->{column}++;
1967     $self->{nc}
1968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1969     } else {
1970     $self->{set_nc}->($self);
1971     }
1972    
1973     redo A;
1974 wakaba 1.1 } elsif ($self->{nc} == -1) {
1975     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1976     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1977    
1978     $self->{last_stag_name} = $self->{ct}->{tag_name};
1979 wakaba 1.15
1980     $self->{state} = DATA_STATE;
1981     $self->{s_kwd} = '';
1982     ## reconsume
1983     return ($self->{ct}); # start tag
1984     redo A;
1985 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1986     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1987     if ($self->{ct}->{attributes}) {
1988    
1989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1990     } else {
1991     ## NOTE: This state should never be reached.
1992    
1993     }
1994 wakaba 1.15
1995     $self->{state} = DATA_STATE;
1996     $self->{s_kwd} = '';
1997     ## reconsume
1998     return ($self->{ct}); # end tag
1999     redo A;
2000     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2001     ## XML5: No parse error above; not defined yet.
2002     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2003     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2004     ## Reconsume.
2005     return ($self->{ct}); # ATTLIST
2006     redo A;
2007 wakaba 1.1 } else {
2008     die "$0: $self->{ct}->{type}: Unknown token type";
2009     }
2010     } else {
2011 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2012 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2013    
2014     ## XML5: Not a parse error.
2015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2016     } else {
2017    
2018     }
2019 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2020     $self->{read_until}->($self->{ca}->{value},
2021 wakaba 1.25 qq['&<\x09\x0C\x20],
2022 wakaba 1.1 length $self->{ca}->{value});
2023    
2024     ## Stay in the state
2025    
2026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2027     $self->{line_prev} = $self->{line};
2028     $self->{column_prev} = $self->{column};
2029     $self->{column}++;
2030     $self->{nc}
2031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2032     } else {
2033     $self->{set_nc}->($self);
2034     }
2035    
2036     redo A;
2037     }
2038     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2039 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2040    
2041 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2042 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2043    
2044     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2045     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2046     } else {
2047    
2048     ## XML5: "Tag attribute name before state".
2049     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2050     }
2051 wakaba 1.1
2052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2053     $self->{line_prev} = $self->{line};
2054     $self->{column_prev} = $self->{column};
2055     $self->{column}++;
2056     $self->{nc}
2057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2058     } else {
2059     $self->{set_nc}->($self);
2060     }
2061    
2062     redo A;
2063     } elsif ($self->{nc} == 0x0026) { # &
2064    
2065 wakaba 1.11
2066     ## XML5: Not defined yet.
2067    
2068 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2069     ## "entity in attribute value state". In this implementation, the
2070     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2071     ## implementation of the "consume a character reference" algorithm.
2072     $self->{entity_add} = -1;
2073     $self->{prev_state} = $self->{state};
2074     $self->{state} = ENTITY_STATE;
2075    
2076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2077     $self->{line_prev} = $self->{line};
2078     $self->{column_prev} = $self->{column};
2079     $self->{column}++;
2080     $self->{nc}
2081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2082     } else {
2083     $self->{set_nc}->($self);
2084     }
2085    
2086     redo A;
2087     } elsif ($self->{nc} == 0x003E) { # >
2088     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2089    
2090     $self->{last_stag_name} = $self->{ct}->{tag_name};
2091 wakaba 1.15
2092     $self->{state} = DATA_STATE;
2093     $self->{s_kwd} = '';
2094    
2095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096     $self->{line_prev} = $self->{line};
2097     $self->{column_prev} = $self->{column};
2098     $self->{column}++;
2099     $self->{nc}
2100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101     } else {
2102     $self->{set_nc}->($self);
2103     }
2104    
2105     return ($self->{ct}); # start tag
2106     redo A;
2107 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2108     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2109     if ($self->{ct}->{attributes}) {
2110    
2111     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2112     } else {
2113     ## NOTE: This state should never be reached.
2114    
2115     }
2116 wakaba 1.15
2117     $self->{state} = DATA_STATE;
2118     $self->{s_kwd} = '';
2119    
2120     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2121     $self->{line_prev} = $self->{line};
2122     $self->{column_prev} = $self->{column};
2123     $self->{column}++;
2124     $self->{nc}
2125     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2126     } else {
2127     $self->{set_nc}->($self);
2128     }
2129    
2130     return ($self->{ct}); # end tag
2131     redo A;
2132     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2133     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2134     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2135    
2136 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2137     $self->{line_prev} = $self->{line};
2138     $self->{column_prev} = $self->{column};
2139     $self->{column}++;
2140     $self->{nc}
2141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2142     } else {
2143     $self->{set_nc}->($self);
2144     }
2145    
2146 wakaba 1.15 return ($self->{ct}); # ATTLIST
2147     redo A;
2148     } else {
2149     die "$0: $self->{ct}->{type}: Unknown token type";
2150     }
2151 wakaba 1.1 } elsif ($self->{nc} == -1) {
2152     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2153    
2154 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2155 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2156 wakaba 1.15
2157     $self->{state} = DATA_STATE;
2158     $self->{s_kwd} = '';
2159     ## reconsume
2160     return ($self->{ct}); # start tag
2161     redo A;
2162 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2163 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2164 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2165     if ($self->{ct}->{attributes}) {
2166    
2167     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2168     } else {
2169     ## NOTE: This state should never be reached.
2170    
2171     }
2172 wakaba 1.15
2173     $self->{state} = DATA_STATE;
2174     $self->{s_kwd} = '';
2175     ## reconsume
2176     return ($self->{ct}); # end tag
2177     redo A;
2178     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2179     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2180     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2181     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2182     ## Reconsume.
2183     return ($self->{ct}); # ATTLIST
2184     redo A;
2185 wakaba 1.1 } else {
2186     die "$0: $self->{ct}->{type}: Unknown token type";
2187     }
2188     } else {
2189     if ({
2190     0x0022 => 1, # "
2191     0x0027 => 1, # '
2192     0x003D => 1, # =
2193 wakaba 1.26 0x003C => 1, # <
2194 wakaba 1.1 }->{$self->{nc}}) {
2195    
2196 wakaba 1.11 ## XML5: Not a parse error.
2197 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2198     } else {
2199    
2200     }
2201     $self->{ca}->{value} .= chr ($self->{nc});
2202     $self->{read_until}->($self->{ca}->{value},
2203 wakaba 1.25 qq["'=& \x09\x0C>],
2204 wakaba 1.1 length $self->{ca}->{value});
2205    
2206     ## Stay in the state
2207    
2208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2209     $self->{line_prev} = $self->{line};
2210     $self->{column_prev} = $self->{column};
2211     $self->{column}++;
2212     $self->{nc}
2213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2214     } else {
2215     $self->{set_nc}->($self);
2216     }
2217    
2218     redo A;
2219     }
2220     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2221     if ($is_space->{$self->{nc}}) {
2222    
2223     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2224    
2225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2226     $self->{line_prev} = $self->{line};
2227     $self->{column_prev} = $self->{column};
2228     $self->{column}++;
2229     $self->{nc}
2230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2231     } else {
2232     $self->{set_nc}->($self);
2233     }
2234    
2235     redo A;
2236     } elsif ($self->{nc} == 0x003E) { # >
2237     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2238    
2239     $self->{last_stag_name} = $self->{ct}->{tag_name};
2240     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2241     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2242     if ($self->{ct}->{attributes}) {
2243    
2244     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2245     } else {
2246     ## NOTE: This state should never be reached.
2247    
2248     }
2249     } else {
2250     die "$0: $self->{ct}->{type}: Unknown token type";
2251     }
2252     $self->{state} = DATA_STATE;
2253 wakaba 1.5 $self->{s_kwd} = '';
2254 wakaba 1.1
2255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2256     $self->{line_prev} = $self->{line};
2257     $self->{column_prev} = $self->{column};
2258     $self->{column}++;
2259     $self->{nc}
2260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2261     } else {
2262     $self->{set_nc}->($self);
2263     }
2264    
2265    
2266     return ($self->{ct}); # start tag or end tag
2267    
2268     redo A;
2269     } elsif ($self->{nc} == 0x002F) { # /
2270    
2271     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2272    
2273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2274     $self->{line_prev} = $self->{line};
2275     $self->{column_prev} = $self->{column};
2276     $self->{column}++;
2277     $self->{nc}
2278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2279     } else {
2280     $self->{set_nc}->($self);
2281     }
2282    
2283     redo A;
2284     } elsif ($self->{nc} == -1) {
2285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2286     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2287    
2288     $self->{last_stag_name} = $self->{ct}->{tag_name};
2289     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2290     if ($self->{ct}->{attributes}) {
2291    
2292     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2293     } else {
2294     ## NOTE: This state should never be reached.
2295    
2296     }
2297     } else {
2298     die "$0: $self->{ct}->{type}: Unknown token type";
2299     }
2300     $self->{state} = DATA_STATE;
2301 wakaba 1.5 $self->{s_kwd} = '';
2302 wakaba 1.1 ## Reconsume.
2303     return ($self->{ct}); # start tag or end tag
2304     redo A;
2305     } else {
2306    
2307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2308     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2309     ## reconsume
2310     redo A;
2311     }
2312     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2313 wakaba 1.11 ## XML5: "Empty tag state".
2314    
2315 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2316     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2317    
2318     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2319     ## TODO: Different type than slash in start tag
2320     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2321     if ($self->{ct}->{attributes}) {
2322    
2323     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2324     } else {
2325    
2326     }
2327     ## TODO: Test |<title></title/>|
2328     } else {
2329    
2330     $self->{self_closing} = 1;
2331     }
2332    
2333     $self->{state} = DATA_STATE;
2334 wakaba 1.5 $self->{s_kwd} = '';
2335 wakaba 1.1
2336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2337     $self->{line_prev} = $self->{line};
2338     $self->{column_prev} = $self->{column};
2339     $self->{column}++;
2340     $self->{nc}
2341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2342     } else {
2343     $self->{set_nc}->($self);
2344     }
2345    
2346    
2347     return ($self->{ct}); # start tag or end tag
2348    
2349     redo A;
2350     } elsif ($self->{nc} == -1) {
2351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2352     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2353    
2354     $self->{last_stag_name} = $self->{ct}->{tag_name};
2355     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2356     if ($self->{ct}->{attributes}) {
2357    
2358     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2359     } else {
2360     ## NOTE: This state should never be reached.
2361    
2362     }
2363     } else {
2364     die "$0: $self->{ct}->{type}: Unknown token type";
2365     }
2366 wakaba 1.11 ## XML5: "Tag attribute name before state".
2367 wakaba 1.1 $self->{state} = DATA_STATE;
2368 wakaba 1.5 $self->{s_kwd} = '';
2369 wakaba 1.1 ## Reconsume.
2370     return ($self->{ct}); # start tag or end tag
2371     redo A;
2372     } else {
2373    
2374     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2375     ## TODO: This error type is wrong.
2376     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2377     ## Reconsume.
2378     redo A;
2379     }
2380     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2381 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2382    
2383 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2384     ## consumes characters on a one-by-one basis.
2385    
2386     if ($self->{nc} == 0x003E) { # >
2387 wakaba 1.13 if ($self->{in_subset}) {
2388    
2389     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2390     } else {
2391    
2392     $self->{state} = DATA_STATE;
2393     $self->{s_kwd} = '';
2394     }
2395 wakaba 1.1
2396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2397     $self->{line_prev} = $self->{line};
2398     $self->{column_prev} = $self->{column};
2399     $self->{column}++;
2400     $self->{nc}
2401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2402     } else {
2403     $self->{set_nc}->($self);
2404     }
2405    
2406    
2407     return ($self->{ct}); # comment
2408     redo A;
2409     } elsif ($self->{nc} == -1) {
2410 wakaba 1.13 if ($self->{in_subset}) {
2411    
2412     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2413     } else {
2414    
2415     $self->{state} = DATA_STATE;
2416     $self->{s_kwd} = '';
2417     }
2418 wakaba 1.1 ## reconsume
2419    
2420     return ($self->{ct}); # comment
2421     redo A;
2422     } else {
2423    
2424     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2425     $self->{read_until}->($self->{ct}->{data},
2426     q[>],
2427     length $self->{ct}->{data});
2428    
2429     ## Stay in the state.
2430    
2431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2432     $self->{line_prev} = $self->{line};
2433     $self->{column_prev} = $self->{column};
2434     $self->{column}++;
2435     $self->{nc}
2436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2437     } else {
2438     $self->{set_nc}->($self);
2439     }
2440    
2441     redo A;
2442     }
2443     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2444 wakaba 1.14 ## XML5: "Markup declaration state".
2445 wakaba 1.1
2446     if ($self->{nc} == 0x002D) { # -
2447    
2448     $self->{state} = MD_HYPHEN_STATE;
2449    
2450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2451     $self->{line_prev} = $self->{line};
2452     $self->{column_prev} = $self->{column};
2453     $self->{column}++;
2454     $self->{nc}
2455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2456     } else {
2457     $self->{set_nc}->($self);
2458     }
2459    
2460     redo A;
2461     } elsif ($self->{nc} == 0x0044 or # D
2462     $self->{nc} == 0x0064) { # d
2463     ## ASCII case-insensitive.
2464    
2465     $self->{state} = MD_DOCTYPE_STATE;
2466 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2467 wakaba 1.1
2468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2469     $self->{line_prev} = $self->{line};
2470     $self->{column_prev} = $self->{column};
2471     $self->{column}++;
2472     $self->{nc}
2473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2474     } else {
2475     $self->{set_nc}->($self);
2476     }
2477    
2478     redo A;
2479 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2480     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2481     $self->{is_xml}) and
2482 wakaba 1.1 $self->{nc} == 0x005B) { # [
2483    
2484     $self->{state} = MD_CDATA_STATE;
2485 wakaba 1.12 $self->{kwd} = '[';
2486 wakaba 1.1
2487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488     $self->{line_prev} = $self->{line};
2489     $self->{column_prev} = $self->{column};
2490     $self->{column}++;
2491     $self->{nc}
2492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2493     } else {
2494     $self->{set_nc}->($self);
2495     }
2496    
2497     redo A;
2498     } else {
2499    
2500     }
2501    
2502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2503     line => $self->{line_prev},
2504     column => $self->{column_prev} - 1);
2505     ## Reconsume.
2506     $self->{state} = BOGUS_COMMENT_STATE;
2507     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2508     line => $self->{line_prev},
2509     column => $self->{column_prev} - 1,
2510     };
2511     redo A;
2512     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2513     if ($self->{nc} == 0x002D) { # -
2514    
2515     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2516     line => $self->{line_prev},
2517     column => $self->{column_prev} - 2,
2518     };
2519 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2520 wakaba 1.1
2521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2522     $self->{line_prev} = $self->{line};
2523     $self->{column_prev} = $self->{column};
2524     $self->{column}++;
2525     $self->{nc}
2526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2527     } else {
2528     $self->{set_nc}->($self);
2529     }
2530    
2531     redo A;
2532     } else {
2533    
2534     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2535     line => $self->{line_prev},
2536     column => $self->{column_prev} - 2);
2537     $self->{state} = BOGUS_COMMENT_STATE;
2538     ## Reconsume.
2539     $self->{ct} = {type => COMMENT_TOKEN,
2540     data => '-',
2541     line => $self->{line_prev},
2542     column => $self->{column_prev} - 2,
2543     };
2544     redo A;
2545     }
2546     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2547     ## ASCII case-insensitive.
2548     if ($self->{nc} == [
2549     undef,
2550     0x004F, # O
2551     0x0043, # C
2552     0x0054, # T
2553     0x0059, # Y
2554     0x0050, # P
2555 wakaba 1.12 ]->[length $self->{kwd}] or
2556 wakaba 1.1 $self->{nc} == [
2557     undef,
2558     0x006F, # o
2559     0x0063, # c
2560     0x0074, # t
2561     0x0079, # y
2562     0x0070, # p
2563 wakaba 1.12 ]->[length $self->{kwd}]) {
2564 wakaba 1.1
2565     ## Stay in the state.
2566 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2567 wakaba 1.1
2568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2569     $self->{line_prev} = $self->{line};
2570     $self->{column_prev} = $self->{column};
2571     $self->{column}++;
2572     $self->{nc}
2573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2574     } else {
2575     $self->{set_nc}->($self);
2576     }
2577    
2578     redo A;
2579 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2580 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2581     $self->{nc} == 0x0065)) { # e
2582 wakaba 1.12 if ($self->{is_xml} and
2583     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2584 wakaba 1.10
2585     ## XML5: case-sensitive.
2586     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2587     text => 'DOCTYPE',
2588     line => $self->{line_prev},
2589     column => $self->{column_prev} - 5);
2590     } else {
2591    
2592     }
2593 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2594     $self->{ct} = {type => DOCTYPE_TOKEN,
2595     quirks => 1,
2596     line => $self->{line_prev},
2597     column => $self->{column_prev} - 7,
2598     };
2599    
2600     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2601     $self->{line_prev} = $self->{line};
2602     $self->{column_prev} = $self->{column};
2603     $self->{column}++;
2604     $self->{nc}
2605     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2606     } else {
2607     $self->{set_nc}->($self);
2608     }
2609    
2610     redo A;
2611     } else {
2612    
2613     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2614     line => $self->{line_prev},
2615 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2616 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2617     ## Reconsume.
2618     $self->{ct} = {type => COMMENT_TOKEN,
2619 wakaba 1.12 data => $self->{kwd},
2620 wakaba 1.1 line => $self->{line_prev},
2621 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2622 wakaba 1.1 };
2623     redo A;
2624     }
2625     } elsif ($self->{state} == MD_CDATA_STATE) {
2626     if ($self->{nc} == {
2627     '[' => 0x0043, # C
2628     '[C' => 0x0044, # D
2629     '[CD' => 0x0041, # A
2630     '[CDA' => 0x0054, # T
2631     '[CDAT' => 0x0041, # A
2632 wakaba 1.12 }->{$self->{kwd}}) {
2633 wakaba 1.1
2634     ## Stay in the state.
2635 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2636 wakaba 1.1
2637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2638     $self->{line_prev} = $self->{line};
2639     $self->{column_prev} = $self->{column};
2640     $self->{column}++;
2641     $self->{nc}
2642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2643     } else {
2644     $self->{set_nc}->($self);
2645     }
2646    
2647     redo A;
2648 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2649 wakaba 1.1 $self->{nc} == 0x005B) { # [
2650 wakaba 1.6 if ($self->{is_xml} and
2651     not $self->{tainted} and
2652     @{$self->{open_elements} or []} == 0) {
2653 wakaba 1.8
2654 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2655     line => $self->{line_prev},
2656     column => $self->{column_prev} - 7);
2657     $self->{tainted} = 1;
2658 wakaba 1.8 } else {
2659    
2660 wakaba 1.6 }
2661    
2662 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2663     data => '',
2664     line => $self->{line_prev},
2665     column => $self->{column_prev} - 7};
2666     $self->{state} = CDATA_SECTION_STATE;
2667    
2668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2669     $self->{line_prev} = $self->{line};
2670     $self->{column_prev} = $self->{column};
2671     $self->{column}++;
2672     $self->{nc}
2673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2674     } else {
2675     $self->{set_nc}->($self);
2676     }
2677    
2678     redo A;
2679     } else {
2680    
2681     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2682     line => $self->{line_prev},
2683 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2684 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2685     ## Reconsume.
2686     $self->{ct} = {type => COMMENT_TOKEN,
2687 wakaba 1.12 data => $self->{kwd},
2688 wakaba 1.1 line => $self->{line_prev},
2689 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2690 wakaba 1.1 };
2691     redo A;
2692     }
2693     } elsif ($self->{state} == COMMENT_START_STATE) {
2694     if ($self->{nc} == 0x002D) { # -
2695    
2696     $self->{state} = COMMENT_START_DASH_STATE;
2697    
2698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2699     $self->{line_prev} = $self->{line};
2700     $self->{column_prev} = $self->{column};
2701     $self->{column}++;
2702     $self->{nc}
2703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2704     } else {
2705     $self->{set_nc}->($self);
2706     }
2707    
2708     redo A;
2709     } elsif ($self->{nc} == 0x003E) { # >
2710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2711 wakaba 1.13 if ($self->{in_subset}) {
2712    
2713     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2714     } else {
2715    
2716     $self->{state} = DATA_STATE;
2717     $self->{s_kwd} = '';
2718     }
2719 wakaba 1.1
2720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2721     $self->{line_prev} = $self->{line};
2722     $self->{column_prev} = $self->{column};
2723     $self->{column}++;
2724     $self->{nc}
2725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2726     } else {
2727     $self->{set_nc}->($self);
2728     }
2729    
2730    
2731     return ($self->{ct}); # comment
2732    
2733     redo A;
2734     } elsif ($self->{nc} == -1) {
2735     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2736 wakaba 1.13 if ($self->{in_subset}) {
2737    
2738     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2739     } else {
2740    
2741     $self->{state} = DATA_STATE;
2742     $self->{s_kwd} = '';
2743     }
2744 wakaba 1.1 ## reconsume
2745    
2746     return ($self->{ct}); # comment
2747    
2748     redo A;
2749     } else {
2750    
2751     $self->{ct}->{data} # comment
2752     .= chr ($self->{nc});
2753     $self->{state} = COMMENT_STATE;
2754    
2755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2756     $self->{line_prev} = $self->{line};
2757     $self->{column_prev} = $self->{column};
2758     $self->{column}++;
2759     $self->{nc}
2760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2761     } else {
2762     $self->{set_nc}->($self);
2763     }
2764    
2765     redo A;
2766     }
2767     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2768     if ($self->{nc} == 0x002D) { # -
2769    
2770     $self->{state} = COMMENT_END_STATE;
2771    
2772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2773     $self->{line_prev} = $self->{line};
2774     $self->{column_prev} = $self->{column};
2775     $self->{column}++;
2776     $self->{nc}
2777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2778     } else {
2779     $self->{set_nc}->($self);
2780     }
2781    
2782     redo A;
2783     } elsif ($self->{nc} == 0x003E) { # >
2784     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2785 wakaba 1.13 if ($self->{in_subset}) {
2786    
2787     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2788     } else {
2789    
2790     $self->{state} = DATA_STATE;
2791     $self->{s_kwd} = '';
2792     }
2793 wakaba 1.1
2794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2795     $self->{line_prev} = $self->{line};
2796     $self->{column_prev} = $self->{column};
2797     $self->{column}++;
2798     $self->{nc}
2799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2800     } else {
2801     $self->{set_nc}->($self);
2802     }
2803    
2804    
2805     return ($self->{ct}); # comment
2806    
2807     redo A;
2808     } elsif ($self->{nc} == -1) {
2809     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2810 wakaba 1.13 if ($self->{in_subset}) {
2811    
2812     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2813     } else {
2814    
2815     $self->{state} = DATA_STATE;
2816     $self->{s_kwd} = '';
2817     }
2818 wakaba 1.1 ## reconsume
2819    
2820     return ($self->{ct}); # comment
2821    
2822     redo A;
2823     } else {
2824    
2825     $self->{ct}->{data} # comment
2826     .= '-' . chr ($self->{nc});
2827     $self->{state} = COMMENT_STATE;
2828    
2829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2830     $self->{line_prev} = $self->{line};
2831     $self->{column_prev} = $self->{column};
2832     $self->{column}++;
2833     $self->{nc}
2834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2835     } else {
2836     $self->{set_nc}->($self);
2837     }
2838    
2839     redo A;
2840     }
2841     } elsif ($self->{state} == COMMENT_STATE) {
2842 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2843    
2844 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2845    
2846     $self->{state} = COMMENT_END_DASH_STATE;
2847    
2848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2849     $self->{line_prev} = $self->{line};
2850     $self->{column_prev} = $self->{column};
2851     $self->{column}++;
2852     $self->{nc}
2853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2854     } else {
2855     $self->{set_nc}->($self);
2856     }
2857    
2858     redo A;
2859     } elsif ($self->{nc} == -1) {
2860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2861 wakaba 1.13 if ($self->{in_subset}) {
2862    
2863     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2864     } else {
2865    
2866     $self->{state} = DATA_STATE;
2867     $self->{s_kwd} = '';
2868     }
2869 wakaba 1.1 ## reconsume
2870    
2871     return ($self->{ct}); # comment
2872    
2873     redo A;
2874     } else {
2875    
2876     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2877     $self->{read_until}->($self->{ct}->{data},
2878     q[-],
2879     length $self->{ct}->{data});
2880    
2881     ## Stay in the state
2882    
2883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2884     $self->{line_prev} = $self->{line};
2885     $self->{column_prev} = $self->{column};
2886     $self->{column}++;
2887     $self->{nc}
2888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2889     } else {
2890     $self->{set_nc}->($self);
2891     }
2892    
2893     redo A;
2894     }
2895     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2896 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2897 wakaba 1.10
2898 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2899    
2900     $self->{state} = COMMENT_END_STATE;
2901    
2902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2903     $self->{line_prev} = $self->{line};
2904     $self->{column_prev} = $self->{column};
2905     $self->{column}++;
2906     $self->{nc}
2907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2908     } else {
2909     $self->{set_nc}->($self);
2910     }
2911    
2912     redo A;
2913     } elsif ($self->{nc} == -1) {
2914     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2915 wakaba 1.13 if ($self->{in_subset}) {
2916    
2917     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2918     } else {
2919    
2920     $self->{state} = DATA_STATE;
2921     $self->{s_kwd} = '';
2922     }
2923 wakaba 1.1 ## reconsume
2924    
2925     return ($self->{ct}); # comment
2926    
2927     redo A;
2928     } else {
2929    
2930     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2931     $self->{state} = COMMENT_STATE;
2932    
2933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2934     $self->{line_prev} = $self->{line};
2935     $self->{column_prev} = $self->{column};
2936     $self->{column}++;
2937     $self->{nc}
2938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2939     } else {
2940     $self->{set_nc}->($self);
2941     }
2942    
2943     redo A;
2944     }
2945     } elsif ($self->{state} == COMMENT_END_STATE) {
2946 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2947    
2948 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2949 wakaba 1.13 if ($self->{in_subset}) {
2950    
2951     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2952     } else {
2953    
2954     $self->{state} = DATA_STATE;
2955     $self->{s_kwd} = '';
2956     }
2957 wakaba 1.1
2958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959     $self->{line_prev} = $self->{line};
2960     $self->{column_prev} = $self->{column};
2961     $self->{column}++;
2962     $self->{nc}
2963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2964     } else {
2965     $self->{set_nc}->($self);
2966     }
2967    
2968    
2969     return ($self->{ct}); # comment
2970    
2971     redo A;
2972     } elsif ($self->{nc} == 0x002D) { # -
2973    
2974 wakaba 1.10 ## XML5: Not a parse error.
2975 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2976     line => $self->{line_prev},
2977     column => $self->{column_prev});
2978     $self->{ct}->{data} .= '-'; # comment
2979     ## Stay in the state
2980    
2981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2982     $self->{line_prev} = $self->{line};
2983     $self->{column_prev} = $self->{column};
2984     $self->{column}++;
2985     $self->{nc}
2986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2987     } else {
2988     $self->{set_nc}->($self);
2989     }
2990    
2991     redo A;
2992     } elsif ($self->{nc} == -1) {
2993     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2994 wakaba 1.13 if ($self->{in_subset}) {
2995    
2996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2997     } else {
2998    
2999     $self->{state} = DATA_STATE;
3000     $self->{s_kwd} = '';
3001     }
3002 wakaba 1.1 ## reconsume
3003    
3004     return ($self->{ct}); # comment
3005    
3006     redo A;
3007     } else {
3008    
3009     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3010     $self->{state} = COMMENT_STATE;
3011    
3012     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3013     $self->{line_prev} = $self->{line};
3014     $self->{column_prev} = $self->{column};
3015     $self->{column}++;
3016     $self->{nc}
3017     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3018     } else {
3019     $self->{set_nc}->($self);
3020     }
3021    
3022     redo A;
3023     }
3024     } elsif ($self->{state} == DOCTYPE_STATE) {
3025     if ($is_space->{$self->{nc}}) {
3026    
3027     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3028    
3029     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3030     $self->{line_prev} = $self->{line};
3031     $self->{column_prev} = $self->{column};
3032     $self->{column}++;
3033     $self->{nc}
3034     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3035     } else {
3036     $self->{set_nc}->($self);
3037     }
3038    
3039     redo A;
3040 wakaba 1.28 } elsif ($self->{nc} == -1) {
3041    
3042     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3043     $self->{ct}->{quirks} = 1;
3044    
3045     $self->{state} = DATA_STATE;
3046     ## Reconsume.
3047     return ($self->{ct}); # DOCTYPE (quirks)
3048    
3049     redo A;
3050 wakaba 1.1 } else {
3051    
3052 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3053 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3054     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3055     ## reconsume
3056     redo A;
3057     }
3058     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3059 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3060    
3061 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3062    
3063     ## Stay in the state
3064    
3065     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3066     $self->{line_prev} = $self->{line};
3067     $self->{column_prev} = $self->{column};
3068     $self->{column}++;
3069     $self->{nc}
3070     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3071     } else {
3072     $self->{set_nc}->($self);
3073     }
3074    
3075     redo A;
3076     } elsif ($self->{nc} == 0x003E) { # >
3077    
3078 wakaba 1.12 ## XML5: No parse error.
3079 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3080     $self->{state} = DATA_STATE;
3081 wakaba 1.5 $self->{s_kwd} = '';
3082 wakaba 1.1
3083     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3084     $self->{line_prev} = $self->{line};
3085     $self->{column_prev} = $self->{column};
3086     $self->{column}++;
3087     $self->{nc}
3088     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3089     } else {
3090     $self->{set_nc}->($self);
3091     }
3092    
3093    
3094     return ($self->{ct}); # DOCTYPE (quirks)
3095    
3096     redo A;
3097 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3098    
3099     $self->{ct}->{name} # DOCTYPE
3100     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3101     delete $self->{ct}->{quirks};
3102     $self->{state} = DOCTYPE_NAME_STATE;
3103    
3104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105     $self->{line_prev} = $self->{line};
3106     $self->{column_prev} = $self->{column};
3107     $self->{column}++;
3108     $self->{nc}
3109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110     } else {
3111     $self->{set_nc}->($self);
3112     }
3113    
3114     redo A;
3115 wakaba 1.1 } elsif ($self->{nc} == -1) {
3116    
3117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3118     $self->{state} = DATA_STATE;
3119 wakaba 1.5 $self->{s_kwd} = '';
3120 wakaba 1.1 ## reconsume
3121    
3122     return ($self->{ct}); # DOCTYPE (quirks)
3123    
3124     redo A;
3125 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3126    
3127     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3128     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3129 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3130     $self->{in_subset} = 1;
3131 wakaba 1.12
3132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3133     $self->{line_prev} = $self->{line};
3134     $self->{column_prev} = $self->{column};
3135     $self->{column}++;
3136     $self->{nc}
3137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3138     } else {
3139     $self->{set_nc}->($self);
3140     }
3141    
3142 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3143 wakaba 1.12 redo A;
3144 wakaba 1.1 } else {
3145    
3146     $self->{ct}->{name} = chr $self->{nc};
3147     delete $self->{ct}->{quirks};
3148     $self->{state} = DOCTYPE_NAME_STATE;
3149    
3150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3151     $self->{line_prev} = $self->{line};
3152     $self->{column_prev} = $self->{column};
3153     $self->{column}++;
3154     $self->{nc}
3155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3156     } else {
3157     $self->{set_nc}->($self);
3158     }
3159    
3160     redo A;
3161     }
3162     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3163 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3164    
3165     ## ISSUE: Redundant "First," in the spec.
3166    
3167 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3168    
3169     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3170    
3171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172     $self->{line_prev} = $self->{line};
3173     $self->{column_prev} = $self->{column};
3174     $self->{column}++;
3175     $self->{nc}
3176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177     } else {
3178     $self->{set_nc}->($self);
3179     }
3180    
3181     redo A;
3182     } elsif ($self->{nc} == 0x003E) { # >
3183    
3184     $self->{state} = DATA_STATE;
3185 wakaba 1.5 $self->{s_kwd} = '';
3186 wakaba 1.1
3187     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3188     $self->{line_prev} = $self->{line};
3189     $self->{column_prev} = $self->{column};
3190     $self->{column}++;
3191     $self->{nc}
3192     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3193     } else {
3194     $self->{set_nc}->($self);
3195     }
3196    
3197    
3198     return ($self->{ct}); # DOCTYPE
3199    
3200     redo A;
3201 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3202    
3203     $self->{ct}->{name} # DOCTYPE
3204     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3205     delete $self->{ct}->{quirks};
3206     ## Stay in the state.
3207    
3208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209     $self->{line_prev} = $self->{line};
3210     $self->{column_prev} = $self->{column};
3211     $self->{column}++;
3212     $self->{nc}
3213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214     } else {
3215     $self->{set_nc}->($self);
3216     }
3217    
3218     redo A;
3219 wakaba 1.1 } elsif ($self->{nc} == -1) {
3220    
3221     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3222     $self->{state} = DATA_STATE;
3223 wakaba 1.5 $self->{s_kwd} = '';
3224 wakaba 1.1 ## reconsume
3225    
3226     $self->{ct}->{quirks} = 1;
3227     return ($self->{ct}); # DOCTYPE
3228    
3229     redo A;
3230 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3231    
3232     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3233 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3234     $self->{in_subset} = 1;
3235 wakaba 1.12
3236     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3237     $self->{line_prev} = $self->{line};
3238     $self->{column_prev} = $self->{column};
3239     $self->{column}++;
3240     $self->{nc}
3241     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3242     } else {
3243     $self->{set_nc}->($self);
3244     }
3245    
3246 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3247 wakaba 1.12 redo A;
3248 wakaba 1.1 } else {
3249    
3250 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3251     ## Stay in the state.
3252 wakaba 1.1
3253     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3254     $self->{line_prev} = $self->{line};
3255     $self->{column_prev} = $self->{column};
3256     $self->{column}++;
3257     $self->{nc}
3258     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3259     } else {
3260     $self->{set_nc}->($self);
3261     }
3262    
3263     redo A;
3264     }
3265     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3266 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3267     ## state", but implemented differently.
3268    
3269 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3270    
3271     ## Stay in the state
3272    
3273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3274     $self->{line_prev} = $self->{line};
3275     $self->{column_prev} = $self->{column};
3276     $self->{column}++;
3277     $self->{nc}
3278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3279     } else {
3280     $self->{set_nc}->($self);
3281     }
3282    
3283     redo A;
3284     } elsif ($self->{nc} == 0x003E) { # >
3285 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3286    
3287     $self->{state} = DATA_STATE;
3288     $self->{s_kwd} = '';
3289     } else {
3290    
3291     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3292     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3293     }
3294 wakaba 1.1
3295    
3296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3297     $self->{line_prev} = $self->{line};
3298     $self->{column_prev} = $self->{column};
3299     $self->{column}++;
3300     $self->{nc}
3301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3302     } else {
3303     $self->{set_nc}->($self);
3304     }
3305    
3306 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3307 wakaba 1.1 redo A;
3308     } elsif ($self->{nc} == -1) {
3309 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3310    
3311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3312     $self->{state} = DATA_STATE;
3313     $self->{s_kwd} = '';
3314     $self->{ct}->{quirks} = 1;
3315     } else {
3316    
3317     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3318     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3319     }
3320 wakaba 1.1
3321 wakaba 1.16 ## Reconsume.
3322     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3323 wakaba 1.1 redo A;
3324     } elsif ($self->{nc} == 0x0050 or # P
3325     $self->{nc} == 0x0070) { # p
3326 wakaba 1.12
3327 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3328 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3329 wakaba 1.1
3330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3331     $self->{line_prev} = $self->{line};
3332     $self->{column_prev} = $self->{column};
3333     $self->{column}++;
3334     $self->{nc}
3335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3336     } else {
3337     $self->{set_nc}->($self);
3338     }
3339    
3340     redo A;
3341     } elsif ($self->{nc} == 0x0053 or # S
3342     $self->{nc} == 0x0073) { # s
3343 wakaba 1.12
3344 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3345 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3346    
3347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3348     $self->{line_prev} = $self->{line};
3349     $self->{column_prev} = $self->{column};
3350     $self->{column}++;
3351     $self->{nc}
3352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3353     } else {
3354     $self->{set_nc}->($self);
3355     }
3356    
3357     redo A;
3358 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3359     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3360     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3361    
3362     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3363     $self->{ct}->{value} = ''; # ENTITY
3364    
3365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3366     $self->{line_prev} = $self->{line};
3367     $self->{column_prev} = $self->{column};
3368     $self->{column}++;
3369     $self->{nc}
3370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3371     } else {
3372     $self->{set_nc}->($self);
3373     }
3374    
3375     redo A;
3376     } elsif ($self->{nc} == 0x0027 and # '
3377     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3378     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3379    
3380     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3381     $self->{ct}->{value} = ''; # ENTITY
3382    
3383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3384     $self->{line_prev} = $self->{line};
3385     $self->{column_prev} = $self->{column};
3386     $self->{column}++;
3387     $self->{nc}
3388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3389     } else {
3390     $self->{set_nc}->($self);
3391     }
3392    
3393     redo A;
3394 wakaba 1.16 } elsif ($self->{is_xml} and
3395     $self->{ct}->{type} == DOCTYPE_TOKEN and
3396     $self->{nc} == 0x005B) { # [
3397 wakaba 1.12
3398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3399     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3400 wakaba 1.13 $self->{in_subset} = 1;
3401 wakaba 1.1
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3413 wakaba 1.1 redo A;
3414     } else {
3415 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3416    
3417     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418    
3419     $self->{ct}->{quirks} = 1;
3420     $self->{state} = BOGUS_DOCTYPE_STATE;
3421     } else {
3422    
3423     $self->{state} = BOGUS_MD_STATE;
3424     }
3425 wakaba 1.1
3426    
3427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428     $self->{line_prev} = $self->{line};
3429     $self->{column_prev} = $self->{column};
3430     $self->{column}++;
3431     $self->{nc}
3432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3433     } else {
3434     $self->{set_nc}->($self);
3435     }
3436    
3437     redo A;
3438     }
3439     } elsif ($self->{state} == PUBLIC_STATE) {
3440     ## ASCII case-insensitive
3441     if ($self->{nc} == [
3442     undef,
3443     0x0055, # U
3444     0x0042, # B
3445     0x004C, # L
3446     0x0049, # I
3447 wakaba 1.12 ]->[length $self->{kwd}] or
3448 wakaba 1.1 $self->{nc} == [
3449     undef,
3450     0x0075, # u
3451     0x0062, # b
3452     0x006C, # l
3453     0x0069, # i
3454 wakaba 1.12 ]->[length $self->{kwd}]) {
3455 wakaba 1.1
3456     ## Stay in the state.
3457 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3458 wakaba 1.1
3459     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3460     $self->{line_prev} = $self->{line};
3461     $self->{column_prev} = $self->{column};
3462     $self->{column}++;
3463     $self->{nc}
3464     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3465     } else {
3466     $self->{set_nc}->($self);
3467     }
3468    
3469     redo A;
3470 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3471 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3472     $self->{nc} == 0x0063)) { # c
3473 wakaba 1.12 if ($self->{is_xml} and
3474     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3475    
3476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3477     text => 'PUBLIC',
3478     line => $self->{line_prev},
3479     column => $self->{column_prev} - 4);
3480     } else {
3481    
3482     }
3483 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3484    
3485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3486     $self->{line_prev} = $self->{line};
3487     $self->{column_prev} = $self->{column};
3488     $self->{column}++;
3489     $self->{nc}
3490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3491     } else {
3492     $self->{set_nc}->($self);
3493     }
3494    
3495     redo A;
3496     } else {
3497 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3498 wakaba 1.1 line => $self->{line_prev},
3499 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3500 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501    
3502     $self->{ct}->{quirks} = 1;
3503     $self->{state} = BOGUS_DOCTYPE_STATE;
3504     } else {
3505    
3506     $self->{state} = BOGUS_MD_STATE;
3507     }
3508 wakaba 1.1 ## Reconsume.
3509     redo A;
3510     }
3511     } elsif ($self->{state} == SYSTEM_STATE) {
3512     ## ASCII case-insensitive
3513     if ($self->{nc} == [
3514     undef,
3515     0x0059, # Y
3516     0x0053, # S
3517     0x0054, # T
3518     0x0045, # E
3519 wakaba 1.12 ]->[length $self->{kwd}] or
3520 wakaba 1.1 $self->{nc} == [
3521     undef,
3522     0x0079, # y
3523     0x0073, # s
3524     0x0074, # t
3525     0x0065, # e
3526 wakaba 1.12 ]->[length $self->{kwd}]) {
3527 wakaba 1.1
3528     ## Stay in the state.
3529 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3530 wakaba 1.1
3531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3532     $self->{line_prev} = $self->{line};
3533     $self->{column_prev} = $self->{column};
3534     $self->{column}++;
3535     $self->{nc}
3536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3537     } else {
3538     $self->{set_nc}->($self);
3539     }
3540    
3541     redo A;
3542 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3543 wakaba 1.1 ($self->{nc} == 0x004D or # M
3544     $self->{nc} == 0x006D)) { # m
3545 wakaba 1.12 if ($self->{is_xml} and
3546     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3547    
3548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3549     text => 'SYSTEM',
3550     line => $self->{line_prev},
3551     column => $self->{column_prev} - 4);
3552     } else {
3553    
3554     }
3555 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3556    
3557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3558     $self->{line_prev} = $self->{line};
3559     $self->{column_prev} = $self->{column};
3560     $self->{column}++;
3561     $self->{nc}
3562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3563     } else {
3564     $self->{set_nc}->($self);
3565     }
3566    
3567     redo A;
3568     } else {
3569 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3570 wakaba 1.1 line => $self->{line_prev},
3571 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3572 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3573    
3574     $self->{ct}->{quirks} = 1;
3575     $self->{state} = BOGUS_DOCTYPE_STATE;
3576     } else {
3577    
3578     $self->{state} = BOGUS_MD_STATE;
3579     }
3580 wakaba 1.1 ## Reconsume.
3581     redo A;
3582     }
3583     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3584     if ($is_space->{$self->{nc}}) {
3585    
3586     ## Stay in the state
3587    
3588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3589     $self->{line_prev} = $self->{line};
3590     $self->{column_prev} = $self->{column};
3591     $self->{column}++;
3592     $self->{nc}
3593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3594     } else {
3595     $self->{set_nc}->($self);
3596     }
3597    
3598     redo A;
3599     } elsif ($self->{nc} eq 0x0022) { # "
3600    
3601     $self->{ct}->{pubid} = ''; # DOCTYPE
3602     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3603    
3604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3605     $self->{line_prev} = $self->{line};
3606     $self->{column_prev} = $self->{column};
3607     $self->{column}++;
3608     $self->{nc}
3609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3610     } else {
3611     $self->{set_nc}->($self);
3612     }
3613    
3614     redo A;
3615     } elsif ($self->{nc} eq 0x0027) { # '
3616    
3617     $self->{ct}->{pubid} = ''; # DOCTYPE
3618     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3619    
3620     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3621     $self->{line_prev} = $self->{line};
3622     $self->{column_prev} = $self->{column};
3623     $self->{column}++;
3624     $self->{nc}
3625     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3626     } else {
3627     $self->{set_nc}->($self);
3628     }
3629    
3630     redo A;
3631     } elsif ($self->{nc} eq 0x003E) { # >
3632 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3633    
3634     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3635    
3636     $self->{state} = DATA_STATE;
3637     $self->{s_kwd} = '';
3638     $self->{ct}->{quirks} = 1;
3639     } else {
3640    
3641     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3642     }
3643 wakaba 1.1
3644    
3645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3646     $self->{line_prev} = $self->{line};
3647     $self->{column_prev} = $self->{column};
3648     $self->{column}++;
3649     $self->{nc}
3650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3651     } else {
3652     $self->{set_nc}->($self);
3653     }
3654    
3655 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3656 wakaba 1.1 redo A;
3657     } elsif ($self->{nc} == -1) {
3658 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3659    
3660     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3661     $self->{state} = DATA_STATE;
3662     $self->{s_kwd} = '';
3663     $self->{ct}->{quirks} = 1;
3664     } else {
3665    
3666     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3667     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3668     }
3669 wakaba 1.1
3670     ## reconsume
3671     return ($self->{ct}); # DOCTYPE
3672     redo A;
3673 wakaba 1.16 } elsif ($self->{is_xml} and
3674     $self->{ct}->{type} == DOCTYPE_TOKEN and
3675     $self->{nc} == 0x005B) { # [
3676 wakaba 1.12
3677     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3678     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3679     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3680 wakaba 1.13 $self->{in_subset} = 1;
3681 wakaba 1.12
3682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3683     $self->{line_prev} = $self->{line};
3684     $self->{column_prev} = $self->{column};
3685     $self->{column}++;
3686     $self->{nc}
3687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3688     } else {
3689     $self->{set_nc}->($self);
3690     }
3691    
3692 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3693 wakaba 1.12 redo A;
3694 wakaba 1.1 } else {
3695     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3696    
3697 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3698    
3699     $self->{ct}->{quirks} = 1;
3700     $self->{state} = BOGUS_DOCTYPE_STATE;
3701     } else {
3702    
3703     $self->{state} = BOGUS_MD_STATE;
3704     }
3705    
3706 wakaba 1.1
3707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3708     $self->{line_prev} = $self->{line};
3709     $self->{column_prev} = $self->{column};
3710     $self->{column}++;
3711     $self->{nc}
3712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3713     } else {
3714     $self->{set_nc}->($self);
3715     }
3716    
3717     redo A;
3718     }
3719     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3720     if ($self->{nc} == 0x0022) { # "
3721    
3722     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3723    
3724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3725     $self->{line_prev} = $self->{line};
3726     $self->{column_prev} = $self->{column};
3727     $self->{column}++;
3728     $self->{nc}
3729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3730     } else {
3731     $self->{set_nc}->($self);
3732     }
3733    
3734     redo A;
3735     } elsif ($self->{nc} == 0x003E) { # >
3736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3737    
3738 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3739    
3740     $self->{state} = DATA_STATE;
3741     $self->{s_kwd} = '';
3742     $self->{ct}->{quirks} = 1;
3743     } else {
3744    
3745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3746     }
3747    
3748 wakaba 1.1
3749     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3750     $self->{line_prev} = $self->{line};
3751     $self->{column_prev} = $self->{column};
3752     $self->{column}++;
3753     $self->{nc}
3754     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3755     } else {
3756     $self->{set_nc}->($self);
3757     }
3758    
3759 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3760 wakaba 1.1 redo A;
3761     } elsif ($self->{nc} == -1) {
3762     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3763    
3764 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3765    
3766     $self->{state} = DATA_STATE;
3767     $self->{s_kwd} = '';
3768     $self->{ct}->{quirks} = 1;
3769     } else {
3770    
3771     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3772     }
3773    
3774     ## Reconsume.
3775 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3776     redo A;
3777     } else {
3778    
3779 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3780 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3781     length $self->{ct}->{pubid});
3782    
3783     ## Stay in the state
3784    
3785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786     $self->{line_prev} = $self->{line};
3787     $self->{column_prev} = $self->{column};
3788     $self->{column}++;
3789     $self->{nc}
3790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791     } else {
3792     $self->{set_nc}->($self);
3793     }
3794    
3795     redo A;
3796     }
3797     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3798     if ($self->{nc} == 0x0027) { # '
3799    
3800     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3801    
3802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3803     $self->{line_prev} = $self->{line};
3804     $self->{column_prev} = $self->{column};
3805     $self->{column}++;
3806     $self->{nc}
3807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3808     } else {
3809     $self->{set_nc}->($self);
3810     }
3811    
3812     redo A;
3813     } elsif ($self->{nc} == 0x003E) { # >
3814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3815    
3816 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3817    
3818     $self->{state} = DATA_STATE;
3819     $self->{s_kwd} = '';
3820     $self->{ct}->{quirks} = 1;
3821     } else {
3822    
3823     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3824     }
3825    
3826 wakaba 1.1
3827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3828     $self->{line_prev} = $self->{line};
3829     $self->{column_prev} = $self->{column};
3830     $self->{column}++;
3831     $self->{nc}
3832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3833     } else {
3834     $self->{set_nc}->($self);
3835     }
3836    
3837 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3838 wakaba 1.1 redo A;
3839     } elsif ($self->{nc} == -1) {
3840     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3841    
3842 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3843    
3844     $self->{state} = DATA_STATE;
3845     $self->{s_kwd} = '';
3846     $self->{ct}->{quirks} = 1;
3847     } else {
3848    
3849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3850     }
3851    
3852 wakaba 1.1 ## reconsume
3853 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3854 wakaba 1.1 redo A;
3855     } else {
3856    
3857 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3858 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3859     length $self->{ct}->{pubid});
3860    
3861     ## Stay in the state
3862    
3863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3864     $self->{line_prev} = $self->{line};
3865     $self->{column_prev} = $self->{column};
3866     $self->{column}++;
3867     $self->{nc}
3868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3869     } else {
3870     $self->{set_nc}->($self);
3871     }
3872    
3873     redo A;
3874     }
3875     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3876     if ($is_space->{$self->{nc}}) {
3877    
3878     ## Stay in the state
3879    
3880     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3881     $self->{line_prev} = $self->{line};
3882     $self->{column_prev} = $self->{column};
3883     $self->{column}++;
3884     $self->{nc}
3885     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3886     } else {
3887     $self->{set_nc}->($self);
3888     }
3889    
3890     redo A;
3891     } elsif ($self->{nc} == 0x0022) { # "
3892    
3893 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3894 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3895    
3896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3897     $self->{line_prev} = $self->{line};
3898     $self->{column_prev} = $self->{column};
3899     $self->{column}++;
3900     $self->{nc}
3901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3902     } else {
3903     $self->{set_nc}->($self);
3904     }
3905    
3906     redo A;
3907     } elsif ($self->{nc} == 0x0027) { # '
3908    
3909 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3910 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3911    
3912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3913     $self->{line_prev} = $self->{line};
3914     $self->{column_prev} = $self->{column};
3915     $self->{column}++;
3916     $self->{nc}
3917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3918     } else {
3919     $self->{set_nc}->($self);
3920     }
3921    
3922     redo A;
3923     } elsif ($self->{nc} == 0x003E) { # >
3924 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3925     if ($self->{is_xml}) {
3926    
3927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3928     } else {
3929    
3930     }
3931     $self->{state} = DATA_STATE;
3932     $self->{s_kwd} = '';
3933 wakaba 1.12 } else {
3934 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3935    
3936     } else {
3937    
3938     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3939     }
3940     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3941 wakaba 1.12 }
3942 wakaba 1.16
3943 wakaba 1.1
3944     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3945     $self->{line_prev} = $self->{line};
3946     $self->{column_prev} = $self->{column};
3947     $self->{column}++;
3948     $self->{nc}
3949     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3950     } else {
3951     $self->{set_nc}->($self);
3952     }
3953    
3954 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3955 wakaba 1.1 redo A;
3956     } elsif ($self->{nc} == -1) {
3957 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3958    
3959     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3960    
3961     $self->{state} = DATA_STATE;
3962     $self->{s_kwd} = '';
3963     $self->{ct}->{quirks} = 1;
3964     } else {
3965     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3966     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3967     }
3968 wakaba 1.1
3969     ## reconsume
3970 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3971 wakaba 1.1 redo A;
3972 wakaba 1.16 } elsif ($self->{is_xml} and
3973     $self->{ct}->{type} == DOCTYPE_TOKEN and
3974     $self->{nc} == 0x005B) { # [
3975 wakaba 1.12
3976     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3977     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3978     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3979 wakaba 1.13 $self->{in_subset} = 1;
3980 wakaba 1.12
3981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3982     $self->{line_prev} = $self->{line};
3983     $self->{column_prev} = $self->{column};
3984     $self->{column}++;
3985     $self->{nc}
3986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3987     } else {
3988     $self->{set_nc}->($self);
3989     }
3990    
3991 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3992 wakaba 1.12 redo A;
3993 wakaba 1.1 } else {
3994     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3995    
3996 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3997    
3998     $self->{ct}->{quirks} = 1;
3999     $self->{state} = BOGUS_DOCTYPE_STATE;
4000     } else {
4001    
4002     $self->{state} = BOGUS_MD_STATE;
4003     }
4004    
4005 wakaba 1.1
4006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4007     $self->{line_prev} = $self->{line};
4008     $self->{column_prev} = $self->{column};
4009     $self->{column}++;
4010     $self->{nc}
4011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4012     } else {
4013     $self->{set_nc}->($self);
4014     }
4015    
4016     redo A;
4017     }
4018     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4019     if ($is_space->{$self->{nc}}) {
4020    
4021     ## Stay in the state
4022    
4023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4024     $self->{line_prev} = $self->{line};
4025     $self->{column_prev} = $self->{column};
4026     $self->{column}++;
4027     $self->{nc}
4028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4029     } else {
4030     $self->{set_nc}->($self);
4031     }
4032    
4033     redo A;
4034     } elsif ($self->{nc} == 0x0022) { # "
4035    
4036     $self->{ct}->{sysid} = ''; # DOCTYPE
4037     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4038    
4039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4040     $self->{line_prev} = $self->{line};
4041     $self->{column_prev} = $self->{column};
4042     $self->{column}++;
4043     $self->{nc}
4044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4045     } else {
4046     $self->{set_nc}->($self);
4047     }
4048    
4049     redo A;
4050     } elsif ($self->{nc} == 0x0027) { # '
4051    
4052     $self->{ct}->{sysid} = ''; # DOCTYPE
4053     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4054    
4055     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4056     $self->{line_prev} = $self->{line};
4057     $self->{column_prev} = $self->{column};
4058     $self->{column}++;
4059     $self->{nc}
4060     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4061     } else {
4062     $self->{set_nc}->($self);
4063     }
4064    
4065     redo A;
4066     } elsif ($self->{nc} == 0x003E) { # >
4067     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4068    
4069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4070     $self->{line_prev} = $self->{line};
4071     $self->{column_prev} = $self->{column};
4072     $self->{column}++;
4073     $self->{nc}
4074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4075     } else {
4076     $self->{set_nc}->($self);
4077     }
4078    
4079    
4080 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4081    
4082     $self->{state} = DATA_STATE;
4083     $self->{s_kwd} = '';
4084     $self->{ct}->{quirks} = 1;
4085     } else {
4086    
4087     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4088     }
4089 wakaba 1.1
4090 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4091 wakaba 1.1 redo A;
4092     } elsif ($self->{nc} == -1) {
4093 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4094    
4095     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4096     $self->{state} = DATA_STATE;
4097     $self->{s_kwd} = '';
4098     $self->{ct}->{quirks} = 1;
4099     } else {
4100    
4101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4102     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4103     }
4104 wakaba 1.1
4105     ## reconsume
4106 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4107 wakaba 1.1 redo A;
4108 wakaba 1.16 } elsif ($self->{is_xml} and
4109     $self->{ct}->{type} == DOCTYPE_TOKEN and
4110     $self->{nc} == 0x005B) { # [
4111 wakaba 1.12
4112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4113    
4114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4115     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4116 wakaba 1.13 $self->{in_subset} = 1;
4117 wakaba 1.12
4118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119     $self->{line_prev} = $self->{line};
4120     $self->{column_prev} = $self->{column};
4121     $self->{column}++;
4122     $self->{nc}
4123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124     } else {
4125     $self->{set_nc}->($self);
4126     }
4127    
4128 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4129 wakaba 1.12 redo A;
4130 wakaba 1.1 } else {
4131     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4132    
4133 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4134    
4135     $self->{ct}->{quirks} = 1;
4136     $self->{state} = BOGUS_DOCTYPE_STATE;
4137     } else {
4138    
4139     $self->{state} = BOGUS_MD_STATE;
4140     }
4141    
4142 wakaba 1.1
4143     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4144     $self->{line_prev} = $self->{line};
4145     $self->{column_prev} = $self->{column};
4146     $self->{column}++;
4147     $self->{nc}
4148     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4149     } else {
4150     $self->{set_nc}->($self);
4151     }
4152    
4153     redo A;
4154     }
4155     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4156     if ($self->{nc} == 0x0022) { # "
4157    
4158     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4159    
4160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4161     $self->{line_prev} = $self->{line};
4162     $self->{column_prev} = $self->{column};
4163     $self->{column}++;
4164     $self->{nc}
4165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4166     } else {
4167     $self->{set_nc}->($self);
4168     }
4169    
4170     redo A;
4171 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4172 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4173    
4174 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4175    
4176     $self->{state} = DATA_STATE;
4177     $self->{s_kwd} = '';
4178     $self->{ct}->{quirks} = 1;
4179     } else {
4180    
4181     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4182     }
4183    
4184 wakaba 1.1
4185     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4186     $self->{line_prev} = $self->{line};
4187     $self->{column_prev} = $self->{column};
4188     $self->{column}++;
4189     $self->{nc}
4190     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4191     } else {
4192     $self->{set_nc}->($self);
4193     }
4194    
4195 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4196 wakaba 1.1 redo A;
4197     } elsif ($self->{nc} == -1) {
4198     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4199    
4200 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4201    
4202     $self->{state} = DATA_STATE;
4203     $self->{s_kwd} = '';
4204     $self->{ct}->{quirks} = 1;
4205     } else {
4206    
4207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208     }
4209    
4210 wakaba 1.1 ## reconsume
4211 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4212 wakaba 1.1 redo A;
4213     } else {
4214    
4215 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4216 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4217     length $self->{ct}->{sysid});
4218    
4219     ## Stay in the state
4220    
4221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4222     $self->{line_prev} = $self->{line};
4223     $self->{column_prev} = $self->{column};
4224     $self->{column}++;
4225     $self->{nc}
4226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4227     } else {
4228     $self->{set_nc}->($self);
4229     }
4230    
4231     redo A;
4232     }
4233     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4234     if ($self->{nc} == 0x0027) { # '
4235    
4236     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4237    
4238     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4239     $self->{line_prev} = $self->{line};
4240     $self->{column_prev} = $self->{column};
4241     $self->{column}++;
4242     $self->{nc}
4243     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4244     } else {
4245     $self->{set_nc}->($self);
4246     }
4247    
4248     redo A;
4249 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4250 wakaba 1.1
4251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4252    
4253     $self->{state} = DATA_STATE;
4254 wakaba 1.5 $self->{s_kwd} = '';
4255 wakaba 1.1
4256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257     $self->{line_prev} = $self->{line};
4258     $self->{column_prev} = $self->{column};
4259     $self->{column}++;
4260     $self->{nc}
4261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262     } else {
4263     $self->{set_nc}->($self);
4264     }
4265    
4266    
4267     $self->{ct}->{quirks} = 1;
4268     return ($self->{ct}); # DOCTYPE
4269    
4270     redo A;
4271     } elsif ($self->{nc} == -1) {
4272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4273    
4274 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4275    
4276     $self->{state} = DATA_STATE;
4277     $self->{s_kwd} = '';
4278     $self->{ct}->{quirks} = 1;
4279     } else {
4280    
4281     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4282     }
4283    
4284 wakaba 1.1 ## reconsume
4285 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4286 wakaba 1.1 redo A;
4287     } else {
4288    
4289 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4290 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4291     length $self->{ct}->{sysid});
4292    
4293     ## Stay in the state
4294    
4295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4296     $self->{line_prev} = $self->{line};
4297     $self->{column_prev} = $self->{column};
4298     $self->{column}++;
4299     $self->{nc}
4300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4301     } else {
4302     $self->{set_nc}->($self);
4303     }
4304    
4305     redo A;
4306     }
4307     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4308     if ($is_space->{$self->{nc}}) {
4309 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4310    
4311     $self->{state} = BEFORE_NDATA_STATE;
4312     } else {
4313    
4314     ## Stay in the state
4315     }
4316 wakaba 1.1
4317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4318     $self->{line_prev} = $self->{line};
4319     $self->{column_prev} = $self->{column};
4320     $self->{column}++;
4321     $self->{nc}
4322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4323     } else {
4324     $self->{set_nc}->($self);
4325     }
4326    
4327     redo A;
4328     } elsif ($self->{nc} == 0x003E) { # >
4329 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4330    
4331     $self->{state} = DATA_STATE;
4332     $self->{s_kwd} = '';
4333     } else {
4334    
4335     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4336     }
4337    
4338 wakaba 1.1
4339     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4340     $self->{line_prev} = $self->{line};
4341     $self->{column_prev} = $self->{column};
4342     $self->{column}++;
4343     $self->{nc}
4344     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4345     } else {
4346     $self->{set_nc}->($self);
4347     }
4348    
4349 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4350 wakaba 1.1 redo A;
4351 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4352     ($self->{nc} == 0x004E or # N
4353     $self->{nc} == 0x006E)) { # n
4354    
4355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4356     $self->{state} = NDATA_STATE;
4357     $self->{kwd} = chr $self->{nc};
4358    
4359     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4360     $self->{line_prev} = $self->{line};
4361     $self->{column_prev} = $self->{column};
4362     $self->{column}++;
4363     $self->{nc}
4364     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4365     } else {
4366     $self->{set_nc}->($self);
4367     }
4368    
4369     redo A;
4370 wakaba 1.1 } elsif ($self->{nc} == -1) {
4371 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4372    
4373     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4374     $self->{state} = DATA_STATE;
4375     $self->{s_kwd} = '';
4376     $self->{ct}->{quirks} = 1;
4377     } else {
4378    
4379     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4380     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4381     }
4382    
4383 wakaba 1.1 ## reconsume
4384 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4385 wakaba 1.1 redo A;
4386 wakaba 1.16 } elsif ($self->{is_xml} and
4387     $self->{ct}->{type} == DOCTYPE_TOKEN and
4388     $self->{nc} == 0x005B) { # [
4389 wakaba 1.12
4390     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4392 wakaba 1.13 $self->{in_subset} = 1;
4393 wakaba 1.12
4394     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4395     $self->{line_prev} = $self->{line};
4396     $self->{column_prev} = $self->{column};
4397     $self->{column}++;
4398     $self->{nc}
4399     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4400     } else {
4401     $self->{set_nc}->($self);
4402     }
4403    
4404 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4405 wakaba 1.12 redo A;
4406 wakaba 1.1 } else {
4407     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4408    
4409 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4410    
4411     #$self->{ct}->{quirks} = 1;
4412     $self->{state} = BOGUS_DOCTYPE_STATE;
4413     } else {
4414    
4415     $self->{state} = BOGUS_MD_STATE;
4416     }
4417    
4418 wakaba 1.1
4419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4420     $self->{line_prev} = $self->{line};
4421     $self->{column_prev} = $self->{column};
4422     $self->{column}++;
4423     $self->{nc}
4424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4425     } else {
4426     $self->{set_nc}->($self);
4427     }
4428    
4429     redo A;
4430     }
4431 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4432     if ($is_space->{$self->{nc}}) {
4433    
4434     ## Stay in the state.
4435    
4436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4437     $self->{line_prev} = $self->{line};
4438     $self->{column_prev} = $self->{column};
4439     $self->{column}++;
4440     $self->{nc}
4441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4442     } else {
4443     $self->{set_nc}->($self);
4444     }
4445    
4446     redo A;
4447     } elsif ($self->{nc} == 0x003E) { # >
4448    
4449     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4450    
4451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452     $self->{line_prev} = $self->{line};
4453     $self->{column_prev} = $self->{column};
4454     $self->{column}++;
4455     $self->{nc}
4456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457     } else {
4458     $self->{set_nc}->($self);
4459     }
4460    
4461     return ($self->{ct}); # ENTITY
4462     redo A;
4463     } elsif ($self->{nc} == 0x004E or # N
4464     $self->{nc} == 0x006E) { # n
4465    
4466     $self->{state} = NDATA_STATE;
4467     $self->{kwd} = chr $self->{nc};
4468    
4469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4470     $self->{line_prev} = $self->{line};
4471     $self->{column_prev} = $self->{column};
4472     $self->{column}++;
4473     $self->{nc}
4474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4475     } else {
4476     $self->{set_nc}->($self);
4477     }
4478    
4479     redo A;
4480     } elsif ($self->{nc} == -1) {
4481    
4482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4483     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4484     ## reconsume
4485     return ($self->{ct}); # ENTITY
4486     redo A;
4487     } else {
4488    
4489     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4490     $self->{state} = BOGUS_MD_STATE;
4491    
4492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493     $self->{line_prev} = $self->{line};
4494     $self->{column_prev} = $self->{column};
4495     $self->{column}++;
4496     $self->{nc}
4497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498     } else {
4499     $self->{set_nc}->($self);
4500     }
4501    
4502     redo A;
4503     }
4504 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4505     if ($self->{nc} == 0x003E) { # >
4506    
4507     $self->{state} = DATA_STATE;
4508 wakaba 1.5 $self->{s_kwd} = '';
4509 wakaba 1.1
4510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4511     $self->{line_prev} = $self->{line};
4512     $self->{column_prev} = $self->{column};
4513     $self->{column}++;
4514     $self->{nc}
4515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4516     } else {
4517     $self->{set_nc}->($self);
4518     }
4519    
4520    
4521     return ($self->{ct}); # DOCTYPE
4522    
4523     redo A;
4524 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4525 wakaba 1.13
4526     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4527     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4528     $self->{in_subset} = 1;
4529    
4530 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4531     $self->{line_prev} = $self->{line};
4532     $self->{column_prev} = $self->{column};
4533     $self->{column}++;
4534     $self->{nc}
4535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4536     } else {
4537     $self->{set_nc}->($self);
4538     }
4539    
4540 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4541     redo A;
4542 wakaba 1.1 } elsif ($self->{nc} == -1) {
4543    
4544     $self->{state} = DATA_STATE;
4545 wakaba 1.5 $self->{s_kwd} = '';
4546 wakaba 1.1 ## reconsume
4547    
4548     return ($self->{ct}); # DOCTYPE
4549    
4550     redo A;
4551     } else {
4552    
4553     my $s = '';
4554 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4555 wakaba 1.1
4556     ## Stay in the state
4557    
4558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559     $self->{line_prev} = $self->{line};
4560     $self->{column_prev} = $self->{column};
4561     $self->{column}++;
4562     $self->{nc}
4563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564     } else {
4565     $self->{set_nc}->($self);
4566     }
4567    
4568     redo A;
4569     }
4570     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4571     ## NOTE: "CDATA section state" in the spec is jointly implemented
4572     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4573     ## and |CDATA_SECTION_MSE2_STATE|.
4574 wakaba 1.10
4575     ## XML5: "CDATA state".
4576 wakaba 1.1
4577     if ($self->{nc} == 0x005D) { # ]
4578    
4579     $self->{state} = CDATA_SECTION_MSE1_STATE;
4580    
4581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4582     $self->{line_prev} = $self->{line};
4583     $self->{column_prev} = $self->{column};
4584     $self->{column}++;
4585     $self->{nc}
4586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4587     } else {
4588     $self->{set_nc}->($self);
4589     }
4590    
4591     redo A;
4592     } elsif ($self->{nc} == -1) {
4593 wakaba 1.6 if ($self->{is_xml}) {
4594 wakaba 1.8
4595 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4596 wakaba 1.8 } else {
4597    
4598 wakaba 1.6 }
4599    
4600 wakaba 1.1 $self->{state} = DATA_STATE;
4601 wakaba 1.5 $self->{s_kwd} = '';
4602 wakaba 1.10 ## Reconsume.
4603 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4604    
4605     return ($self->{ct}); # character
4606     } else {
4607    
4608     ## No token to emit. $self->{ct} is discarded.
4609     }
4610     redo A;
4611     } else {
4612    
4613     $self->{ct}->{data} .= chr $self->{nc};
4614     $self->{read_until}->($self->{ct}->{data},
4615     q<]>,
4616     length $self->{ct}->{data});
4617    
4618     ## Stay in the state.
4619    
4620     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4621     $self->{line_prev} = $self->{line};
4622     $self->{column_prev} = $self->{column};
4623     $self->{column}++;
4624     $self->{nc}
4625     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4626     } else {
4627     $self->{set_nc}->($self);
4628     }
4629    
4630     redo A;
4631     }
4632    
4633     ## ISSUE: "text tokens" in spec.
4634     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4635 wakaba 1.10 ## XML5: "CDATA bracket state".
4636    
4637 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4638    
4639     $self->{state} = CDATA_SECTION_MSE2_STATE;
4640    
4641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642     $self->{line_prev} = $self->{line};
4643     $self->{column_prev} = $self->{column};
4644     $self->{column}++;
4645     $self->{nc}
4646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647     } else {
4648     $self->{set_nc}->($self);
4649     }
4650    
4651     redo A;
4652     } else {
4653    
4654 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4655 wakaba 1.1 $self->{ct}->{data} .= ']';
4656 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4657 wakaba 1.1 ## Reconsume.
4658     redo A;
4659     }
4660     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4661 wakaba 1.10 ## XML5: "CDATA end state".
4662    
4663 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4664     $self->{state} = DATA_STATE;
4665 wakaba 1.5 $self->{s_kwd} = '';
4666 wakaba 1.1
4667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4668     $self->{line_prev} = $self->{line};
4669     $self->{column_prev} = $self->{column};
4670     $self->{column}++;
4671     $self->{nc}
4672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4673     } else {
4674     $self->{set_nc}->($self);
4675     }
4676    
4677     if (length $self->{ct}->{data}) { # character
4678    
4679     return ($self->{ct}); # character
4680     } else {
4681    
4682     ## No token to emit. $self->{ct} is discarded.
4683     }
4684     redo A;
4685     } elsif ($self->{nc} == 0x005D) { # ]
4686     # character
4687     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4688     ## Stay in the state.
4689    
4690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4691     $self->{line_prev} = $self->{line};
4692     $self->{column_prev} = $self->{column};
4693     $self->{column}++;
4694     $self->{nc}
4695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4696     } else {
4697     $self->{set_nc}->($self);
4698     }
4699    
4700     redo A;
4701     } else {
4702    
4703     $self->{ct}->{data} .= ']]'; # character
4704     $self->{state} = CDATA_SECTION_STATE;
4705 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4706 wakaba 1.1 redo A;
4707     }
4708     } elsif ($self->{state} == ENTITY_STATE) {
4709     if ($is_space->{$self->{nc}} or
4710     {
4711     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4712     $self->{entity_add} => 1,
4713     }->{$self->{nc}}) {
4714 wakaba 1.22 if ($self->{is_xml}) {
4715    
4716     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4717     line => $self->{line_prev},
4718     column => $self->{column_prev}
4719     + ($self->{nc} == -1 ? 1 : 0));
4720     } else {
4721    
4722     ## No error
4723     }
4724 wakaba 1.1 ## Don't consume
4725     ## Return nothing.
4726     #
4727     } elsif ($self->{nc} == 0x0023) { # #
4728    
4729     $self->{state} = ENTITY_HASH_STATE;
4730 wakaba 1.12 $self->{kwd} = '#';
4731 wakaba 1.1
4732     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4733     $self->{line_prev} = $self->{line};
4734     $self->{column_prev} = $self->{column};
4735     $self->{column}++;
4736     $self->{nc}
4737     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4738     } else {
4739     $self->{set_nc}->($self);
4740     }
4741    
4742     redo A;
4743 wakaba 1.22 } elsif ($self->{is_xml} or
4744     (0x0041 <= $self->{nc} and
4745 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4746     (0x0061 <= $self->{nc} and
4747     $self->{nc} <= 0x007A)) { # a..z
4748    
4749     require Whatpm::_NamedEntityList;
4750     $self->{state} = ENTITY_NAME_STATE;
4751 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4752     $self->{entity__value} = $self->{kwd};
4753 wakaba 1.1 $self->{entity__match} = 0;
4754    
4755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4756     $self->{line_prev} = $self->{line};
4757     $self->{column_prev} = $self->{column};
4758     $self->{column}++;
4759     $self->{nc}
4760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4761     } else {
4762     $self->{set_nc}->($self);
4763     }
4764    
4765     redo A;
4766     } else {
4767    
4768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4769     ## Return nothing.
4770     #
4771     }
4772    
4773     ## NOTE: No character is consumed by the "consume a character
4774     ## reference" algorithm. In other words, there is an "&" character
4775     ## that does not introduce a character reference, which would be
4776     ## appended to the parent element or the attribute value in later
4777     ## processing by the tokenizer.
4778    
4779     if ($self->{prev_state} == DATA_STATE) {
4780    
4781     $self->{state} = $self->{prev_state};
4782 wakaba 1.5 $self->{s_kwd} = '';
4783 wakaba 1.1 ## Reconsume.
4784     return ({type => CHARACTER_TOKEN, data => '&',
4785     line => $self->{line_prev},
4786     column => $self->{column_prev},
4787     });
4788     redo A;
4789     } else {
4790    
4791     $self->{ca}->{value} .= '&';
4792     $self->{state} = $self->{prev_state};
4793 wakaba 1.5 $self->{s_kwd} = '';
4794 wakaba 1.1 ## Reconsume.
4795     redo A;
4796     }
4797     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4798 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4799 wakaba 1.1
4800     $self->{state} = HEXREF_X_STATE;
4801 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4802 wakaba 1.1
4803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4804     $self->{line_prev} = $self->{line};
4805     $self->{column_prev} = $self->{column};
4806     $self->{column}++;
4807     $self->{nc}
4808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4809     } else {
4810     $self->{set_nc}->($self);
4811     }
4812    
4813     redo A;
4814 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4815    
4816     if ($self->{is_xml}) {
4817     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4818     }
4819     $self->{state} = HEXREF_X_STATE;
4820     $self->{kwd} .= chr $self->{nc};
4821    
4822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4823     $self->{line_prev} = $self->{line};
4824     $self->{column_prev} = $self->{column};
4825     $self->{column}++;
4826     $self->{nc}
4827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4828     } else {
4829     $self->{set_nc}->($self);
4830     }
4831    
4832     redo A;
4833 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4834     $self->{nc} <= 0x0039) { # 0..9
4835    
4836     $self->{state} = NCR_NUM_STATE;
4837 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4838 wakaba 1.1
4839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4840     $self->{line_prev} = $self->{line};
4841     $self->{column_prev} = $self->{column};
4842     $self->{column}++;
4843     $self->{nc}
4844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4845     } else {
4846     $self->{set_nc}->($self);
4847     }
4848    
4849     redo A;
4850     } else {
4851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4852     line => $self->{line_prev},
4853     column => $self->{column_prev} - 1);
4854    
4855     ## NOTE: According to the spec algorithm, nothing is returned,
4856     ## and then "&#" is appended to the parent element or the attribute
4857     ## value in the later processing.
4858    
4859     if ($self->{prev_state} == DATA_STATE) {
4860    
4861     $self->{state} = $self->{prev_state};
4862 wakaba 1.5 $self->{s_kwd} = '';
4863 wakaba 1.1 ## Reconsume.
4864     return ({type => CHARACTER_TOKEN,
4865     data => '&#',
4866     line => $self->{line_prev},
4867     column => $self->{column_prev} - 1,
4868     });
4869     redo A;
4870     } else {
4871    
4872     $self->{ca}->{value} .= '&#';
4873     $self->{state} = $self->{prev_state};
4874 wakaba 1.5 $self->{s_kwd} = '';
4875 wakaba 1.1 ## Reconsume.
4876     redo A;
4877     }
4878     }
4879     } elsif ($self->{state} == NCR_NUM_STATE) {
4880     if (0x0030 <= $self->{nc} and
4881     $self->{nc} <= 0x0039) { # 0..9
4882    
4883 wakaba 1.12 $self->{kwd} *= 10;
4884     $self->{kwd} += $self->{nc} - 0x0030;
4885 wakaba 1.1
4886     ## Stay in the state.
4887    
4888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4889     $self->{line_prev} = $self->{line};
4890     $self->{column_prev} = $self->{column};
4891     $self->{column}++;
4892     $self->{nc}
4893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4894     } else {
4895     $self->{set_nc}->($self);
4896     }
4897    
4898     redo A;
4899     } elsif ($self->{nc} == 0x003B) { # ;
4900    
4901    
4902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4903     $self->{line_prev} = $self->{line};
4904     $self->{column_prev} = $self->{column};
4905     $self->{column}++;
4906     $self->{nc}
4907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4908     } else {
4909     $self->{set_nc}->($self);
4910     }
4911    
4912     #
4913     } else {
4914    
4915     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4916     ## Reconsume.
4917     #
4918     }
4919    
4920 wakaba 1.12 my $code = $self->{kwd};
4921 wakaba 1.1 my $l = $self->{line_prev};
4922     my $c = $self->{column_prev};
4923 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4924     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4925     ($self->{is_xml} and $code == 0x0000)) {
4926 wakaba 1.1
4927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4928     text => (sprintf 'U+%04X', $code),
4929     line => $l, column => $c);
4930     $code = $charref_map->{$code};
4931     } elsif ($code > 0x10FFFF) {
4932    
4933     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4934     text => (sprintf 'U-%08X', $code),
4935     line => $l, column => $c);
4936     $code = 0xFFFD;
4937     }
4938    
4939     if ($self->{prev_state} == DATA_STATE) {
4940    
4941     $self->{state} = $self->{prev_state};
4942 wakaba 1.5 $self->{s_kwd} = '';
4943 wakaba 1.1 ## Reconsume.
4944     return ({type => CHARACTER_TOKEN, data => chr $code,
4945 wakaba 1.7 has_reference => 1,
4946 wakaba 1.1 line => $l, column => $c,
4947     });
4948     redo A;
4949     } else {
4950    
4951     $self->{ca}->{value} .= chr $code;
4952     $self->{ca}->{has_reference} = 1;
4953     $self->{state} = $self->{prev_state};
4954 wakaba 1.5 $self->{s_kwd} = '';
4955 wakaba 1.1 ## Reconsume.
4956     redo A;
4957     }
4958     } elsif ($self->{state} == HEXREF_X_STATE) {
4959     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4960     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4961     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4962     # 0..9, A..F, a..f
4963    
4964     $self->{state} = HEXREF_HEX_STATE;
4965 wakaba 1.12 $self->{kwd} = 0;
4966 wakaba 1.1 ## Reconsume.
4967     redo A;
4968     } else {
4969     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4970     line => $self->{line_prev},
4971     column => $self->{column_prev} - 2);
4972    
4973     ## NOTE: According to the spec algorithm, nothing is returned,
4974     ## and then "&#" followed by "X" or "x" is appended to the parent
4975     ## element or the attribute value in the later processing.
4976    
4977     if ($self->{prev_state} == DATA_STATE) {
4978    
4979     $self->{state} = $self->{prev_state};
4980 wakaba 1.5 $self->{s_kwd} = '';
4981 wakaba 1.1 ## Reconsume.
4982     return ({type => CHARACTER_TOKEN,
4983 wakaba 1.12 data => '&' . $self->{kwd},
4984 wakaba 1.1 line => $self->{line_prev},
4985 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4986 wakaba 1.1 });
4987     redo A;
4988     } else {
4989    
4990 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4991 wakaba 1.1 $self->{state} = $self->{prev_state};
4992 wakaba 1.5 $self->{s_kwd} = '';
4993 wakaba 1.1 ## Reconsume.
4994     redo A;
4995     }
4996     }
4997     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4998     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4999     # 0..9
5000    
5001 wakaba 1.12 $self->{kwd} *= 0x10;
5002     $self->{kwd} += $self->{nc} - 0x0030;
5003 wakaba 1.1 ## Stay in the state.
5004    
5005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5006     $self->{line_prev} = $self->{line};
5007     $self->{column_prev} = $self->{column};
5008     $self->{column}++;
5009     $self->{nc}
5010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5011     } else {
5012     $self->{set_nc}->($self);
5013     }
5014    
5015     redo A;
5016     } elsif (0x0061 <= $self->{nc} and
5017     $self->{nc} <= 0x0066) { # a..f
5018    
5019 wakaba 1.12 $self->{kwd} *= 0x10;
5020     $self->{kwd} += $self->{nc} - 0x0060 + 9;
5021 wakaba 1.1 ## Stay in the state.
5022    
5023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5024     $self->{line_prev} = $self->{line};
5025     $self->{column_prev} = $self->{column};
5026     $self->{column}++;
5027     $self->{nc}
5028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5029     } else {
5030     $self->{set_nc}->($self);
5031     }
5032    
5033     redo A;
5034     } elsif (0x0041 <= $self->{nc} and
5035     $self->{nc} <= 0x0046) { # A..F
5036    
5037 wakaba 1.12 $self->{kwd} *= 0x10;
5038     $self->{kwd} += $self->{nc} - 0x0040 + 9;
5039 wakaba 1.1 ## Stay in the state.
5040    
5041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5042     $self->{line_prev} = $self->{line};
5043     $self->{column_prev} = $self->{column};
5044     $self->{column}++;
5045     $self->{nc}
5046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5047     } else {
5048     $self->{set_nc}->($self);
5049     }
5050    
5051     redo A;
5052     } elsif ($self->{nc} == 0x003B) { # ;
5053    
5054    
5055     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5056     $self->{line_prev} = $self->{line};
5057     $self->{column_prev} = $self->{column};
5058     $self->{column}++;
5059     $self->{nc}
5060     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5061     } else {
5062     $self->{set_nc}->($self);
5063     }
5064    
5065     #
5066     } else {
5067    
5068     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5069     line => $self->{line},
5070     column => $self->{column});
5071     ## Reconsume.
5072     #
5073     }
5074    
5075 wakaba 1.12 my $code = $self->{kwd};
5076 wakaba 1.1 my $l = $self->{line_prev};
5077     my $c = $self->{column_prev};
5078 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5079     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5080     ($self->{is_xml} and $code == 0x0000)) {
5081 wakaba 1.1
5082     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5083     text => (sprintf 'U+%04X', $code),
5084     line => $l, column => $c);
5085     $code = $charref_map->{$code};
5086     } elsif ($code > 0x10FFFF) {
5087    
5088     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5089     text => (sprintf 'U-%08X', $code),
5090     line => $l, column => $c);
5091     $code = 0xFFFD;
5092     }
5093    
5094     if ($self->{prev_state} == DATA_STATE) {
5095    
5096     $self->{state} = $self->{prev_state};
5097 wakaba 1.5 $self->{s_kwd} = '';
5098 wakaba 1.1 ## Reconsume.
5099     return ({type => CHARACTER_TOKEN, data => chr $code,
5100 wakaba 1.7 has_reference => 1,
5101 wakaba 1.1 line => $l, column => $c,
5102     });
5103     redo A;
5104     } else {
5105    
5106     $self->{ca}->{value} .= chr $code;
5107     $self->{ca}->{has_reference} = 1;
5108     $self->{state} = $self->{prev_state};
5109 wakaba 1.5 $self->{s_kwd} = '';
5110 wakaba 1.1 ## Reconsume.
5111     redo A;
5112     }
5113     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5114 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
5115     $self->{nc} <= 0x005A) or # x
5116     (0x0061 <= $self->{nc} and # a
5117     $self->{nc} <= 0x007A) or # z
5118     (0x0030 <= $self->{nc} and # 0
5119     $self->{nc} <= 0x0039) or # 9
5120 wakaba 1.22 $self->{nc} == 0x003B or # ;
5121     ($self->{is_xml} and
5122     not ($is_space->{$self->{nc}} or
5123     {
5124     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5125     $self->{entity_add} => 1,
5126     }->{$self->{nc}}))) {
5127 wakaba 1.1 our $EntityChar;
5128 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5129 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5130     $self->{ge}->{$self->{kwd}}) {
5131 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5132 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5133     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5134    
5135     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5136     } else {
5137     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5138    
5139     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5140     value => $self->{kwd});
5141     } else {
5142    
5143     }
5144     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5145     }
5146     } else {
5147     if ($self->{is_xml}) {
5148    
5149     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5150     value => $self->{kwd},
5151     level => {
5152     'amp;' => $self->{level}->{warn},
5153     'quot;' => $self->{level}->{warn},
5154     'lt;' => $self->{level}->{warn},
5155     'gt;' => $self->{level}->{warn},
5156     'apos;' => $self->{level}->{warn},
5157     }->{$self->{kwd}} ||
5158     $self->{level}->{must});
5159     } else {
5160    
5161     }
5162     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5163     }
5164 wakaba 1.1 $self->{entity__match} = 1;
5165    
5166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5167     $self->{line_prev} = $self->{line};
5168     $self->{column_prev} = $self->{column};
5169     $self->{column}++;
5170     $self->{nc}
5171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5172     } else {
5173     $self->{set_nc}->($self);
5174     }
5175    
5176     #
5177     } else {
5178    
5179 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5180 wakaba 1.1 $self->{entity__match} = -1;
5181     ## Stay in the state.
5182    
5183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5184     $self->{line_prev} = $self->{line};
5185     $self->{column_prev} = $self->{column};
5186     $self->{column}++;
5187     $self->{nc}
5188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5189     } else {
5190     $self->{set_nc}->($self);
5191     }
5192    
5193     redo A;
5194     }
5195     } else {
5196    
5197     $self->{entity__value} .= chr $self->{nc};
5198     $self->{entity__match} *= 2;
5199     ## Stay in the state.
5200    
5201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5202     $self->{line_prev} = $self->{line};
5203     $self->{column_prev} = $self->{column};
5204     $self->{column}++;
5205     $self->{nc}
5206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5207     } else {
5208     $self->{set_nc}->($self);
5209     }
5210    
5211     redo A;
5212     }
5213     }
5214    
5215     my $data;
5216     my $has_ref;
5217     if ($self->{entity__match} > 0) {
5218    
5219     $data = $self->{entity__value};
5220     $has_ref = 1;
5221     #
5222     } elsif ($self->{entity__match} < 0) {
5223     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5224     if ($self->{prev_state} != DATA_STATE and # in attribute
5225     $self->{entity__match} < -1) {
5226    
5227 wakaba 1.12 $data = '&' . $self->{kwd};
5228 wakaba 1.1 #
5229     } else {
5230    
5231     $data = $self->{entity__value};
5232     $has_ref = 1;
5233     #
5234     }
5235     } else {
5236    
5237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5238     line => $self->{line_prev},
5239 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5240     $data = '&' . $self->{kwd};
5241 wakaba 1.1 #
5242     }
5243    
5244     ## NOTE: In these cases, when a character reference is found,
5245     ## it is consumed and a character token is returned, or, otherwise,
5246     ## nothing is consumed and returned, according to the spec algorithm.
5247     ## In this implementation, anything that has been examined by the
5248     ## tokenizer is appended to the parent element or the attribute value
5249     ## as string, either literal string when no character reference or
5250     ## entity-replaced string otherwise, in this stage, since any characters
5251     ## that would not be consumed are appended in the data state or in an
5252     ## appropriate attribute value state anyway.
5253    
5254     if ($self->{prev_state} == DATA_STATE) {
5255    
5256     $self->{state} = $self->{prev_state};
5257 wakaba 1.5 $self->{s_kwd} = '';
5258 wakaba 1.1 ## Reconsume.
5259     return ({type => CHARACTER_TOKEN,
5260     data => $data,
5261 wakaba 1.7 has_reference => $has_ref,
5262 wakaba 1.1 line => $self->{line_prev},
5263 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5264 wakaba 1.1 });
5265     redo A;
5266     } else {
5267    
5268     $self->{ca}->{value} .= $data;
5269     $self->{ca}->{has_reference} = 1 if $has_ref;
5270     $self->{state} = $self->{prev_state};
5271 wakaba 1.5 $self->{s_kwd} = '';
5272 wakaba 1.1 ## Reconsume.
5273     redo A;
5274     }
5275 wakaba 1.8
5276     ## XML-only states
5277    
5278     } elsif ($self->{state} == PI_STATE) {
5279 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5280    
5281 wakaba 1.8 if ($is_space->{$self->{nc}} or
5282 wakaba 1.14 $self->{nc} == 0x003F or # ?
5283 wakaba 1.8 $self->{nc} == -1) {
5284 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5285     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5286     ## "DOCTYPE pi state": Parse error, switch to the "data
5287     ## state".
5288 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5289     line => $self->{line_prev},
5290     column => $self->{column_prev}
5291     - 1 * ($self->{nc} != -1));
5292     $self->{state} = BOGUS_COMMENT_STATE;
5293     ## Reconsume.
5294     $self->{ct} = {type => COMMENT_TOKEN,
5295     data => '?',
5296     line => $self->{line_prev},
5297     column => $self->{column_prev}
5298     - 1 * ($self->{nc} != -1),
5299     };
5300     redo A;
5301     } else {
5302 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5303 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5304     target => chr $self->{nc},
5305     data => '',
5306     line => $self->{line_prev},
5307     column => $self->{column_prev} - 1,
5308     };
5309     $self->{state} = PI_TARGET_STATE;
5310    
5311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5312     $self->{line_prev} = $self->{line};
5313     $self->{column_prev} = $self->{column};
5314     $self->{column}++;
5315     $self->{nc}
5316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5317     } else {
5318     $self->{set_nc}->($self);
5319     }
5320    
5321     redo A;
5322     }
5323     } elsif ($self->{state} == PI_TARGET_STATE) {
5324     if ($is_space->{$self->{nc}}) {
5325     $self->{state} = PI_TARGET_AFTER_STATE;
5326    
5327     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5328     $self->{line_prev} = $self->{line};
5329     $self->{column_prev} = $self->{column};
5330     $self->{column}++;
5331     $self->{nc}
5332     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5333     } else {
5334     $self->{set_nc}->($self);
5335     }
5336    
5337     redo A;
5338     } elsif ($self->{nc} == -1) {
5339     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5340 wakaba 1.13 if ($self->{in_subset}) {
5341     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5342     } else {
5343     $self->{state} = DATA_STATE;
5344     $self->{s_kwd} = '';
5345     }
5346 wakaba 1.8 ## Reconsume.
5347     return ($self->{ct}); # pi
5348     redo A;
5349     } elsif ($self->{nc} == 0x003F) { # ?
5350     $self->{state} = PI_AFTER_STATE;
5351    
5352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5353     $self->{line_prev} = $self->{line};
5354     $self->{column_prev} = $self->{column};
5355     $self->{column}++;
5356     $self->{nc}
5357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5358     } else {
5359     $self->{set_nc}->($self);
5360     }
5361    
5362     redo A;
5363     } else {
5364     ## XML5: typo ("tag name" -> "target")
5365     $self->{ct}->{target} .= chr $self->{nc}; # pi
5366    
5367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5368     $self->{line_prev} = $self->{line};
5369     $self->{column_prev} = $self->{column};
5370     $self->{column}++;
5371     $self->{nc}
5372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5373     } else {
5374     $self->{set_nc}->($self);
5375     }
5376    
5377     redo A;
5378     }
5379     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5380     if ($is_space->{$self->{nc}}) {
5381     ## Stay in the state.
5382    
5383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384     $self->{line_prev} = $self->{line};
5385     $self->{column_prev} = $self->{column};
5386     $self->{column}++;
5387     $self->{nc}
5388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389     } else {
5390     $self->{set_nc}->($self);
5391     }
5392    
5393     redo A;
5394     } else {
5395     $self->{state} = PI_DATA_STATE;
5396     ## Reprocess.
5397     redo A;
5398     }
5399     } elsif ($self->{state} == PI_DATA_STATE) {
5400     if ($self->{nc} == 0x003F) { # ?
5401     $self->{state} = PI_DATA_AFTER_STATE;
5402    
5403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5404     $self->{line_prev} = $self->{line};
5405     $self->{column_prev} = $self->{column};
5406     $self->{column}++;
5407     $self->{nc}
5408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5409     } else {
5410     $self->{set_nc}->($self);
5411     }
5412    
5413     redo A;
5414     } elsif ($self->{nc} == -1) {
5415     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5416 wakaba 1.13 if ($self->{in_subset}) {
5417 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5418 wakaba 1.13 } else {
5419     $self->{state} = DATA_STATE;
5420     $self->{s_kwd} = '';
5421     }
5422 wakaba 1.8 ## Reprocess.
5423     return ($self->{ct}); # pi
5424     redo A;
5425     } else {
5426     $self->{ct}->{data} .= chr $self->{nc}; # pi
5427     $self->{read_until}->($self->{ct}->{data}, q[?],
5428     length $self->{ct}->{data});
5429     ## Stay in the state.
5430    
5431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5432     $self->{line_prev} = $self->{line};
5433     $self->{column_prev} = $self->{column};
5434     $self->{column}++;
5435     $self->{nc}
5436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5437     } else {
5438     $self->{set_nc}->($self);
5439     }
5440    
5441     ## Reprocess.
5442     redo A;
5443     }
5444     } elsif ($self->{state} == PI_AFTER_STATE) {
5445 wakaba 1.14 ## XML5: Part of "Pi after state".
5446    
5447 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5448 wakaba 1.13 if ($self->{in_subset}) {
5449     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5450     } else {
5451     $self->{state} = DATA_STATE;
5452     $self->{s_kwd} = '';
5453     }
5454 wakaba 1.8
5455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5456     $self->{line_prev} = $self->{line};
5457     $self->{column_prev} = $self->{column};
5458     $self->{column}++;
5459     $self->{nc}
5460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5461     } else {
5462     $self->{set_nc}->($self);
5463     }
5464    
5465     return ($self->{ct}); # pi
5466     redo A;
5467     } elsif ($self->{nc} == 0x003F) { # ?
5468     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5469     line => $self->{line_prev},
5470     column => $self->{column_prev}); ## XML5: no error
5471     $self->{ct}->{data} .= '?';
5472     $self->{state} = PI_DATA_AFTER_STATE;
5473    
5474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475     $self->{line_prev} = $self->{line};
5476     $self->{column_prev} = $self->{column};
5477     $self->{column}++;
5478     $self->{nc}
5479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480     } else {
5481     $self->{set_nc}->($self);
5482     }
5483    
5484     redo A;
5485     } else {
5486     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5487     line => $self->{line_prev},
5488     column => $self->{column_prev}
5489     + 1 * ($self->{nc} == -1)); ## XML5: no error
5490     $self->{ct}->{data} .= '?'; ## XML5: not appended
5491     $self->{state} = PI_DATA_STATE;
5492     ## Reprocess.
5493     redo A;
5494     }
5495     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5496 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5497    
5498 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5499 wakaba 1.13 if ($self->{in_subset}) {
5500     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5501     } else {
5502     $self->{state} = DATA_STATE;
5503     $self->{s_kwd} = '';
5504     }
5505 wakaba 1.8
5506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5507     $self->{line_prev} = $self->{line};
5508     $self->{column_prev} = $self->{column};
5509     $self->{column}++;
5510     $self->{nc}
5511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5512     } else {
5513     $self->{set_nc}->($self);
5514     }
5515    
5516     return ($self->{ct}); # pi
5517     redo A;
5518     } elsif ($self->{nc} == 0x003F) { # ?
5519     $self->{ct}->{data} .= '?';
5520     ## Stay in the state.
5521    
5522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5523     $self->{line_prev} = $self->{line};
5524     $self->{column_prev} = $self->{column};
5525     $self->{column}++;
5526     $self->{nc}
5527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5528     } else {
5529     $self->{set_nc}->($self);
5530     }
5531    
5532     redo A;
5533     } else {
5534     $self->{ct}->{data} .= '?'; ## XML5: not appended
5535     $self->{state} = PI_DATA_STATE;
5536     ## Reprocess.
5537     redo A;
5538     }
5539 wakaba 1.12
5540     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5541     if ($self->{nc} == 0x003C) { # <
5542 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5543 wakaba 1.12
5544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5545     $self->{line_prev} = $self->{line};
5546     $self->{column_prev} = $self->{column};
5547     $self->{column}++;
5548     $self->{nc}
5549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5550     } else {
5551     $self->{set_nc}->($self);
5552     }
5553    
5554     redo A;
5555     } elsif ($self->{nc} == 0x0025) { # %
5556     ## XML5: Not defined yet.
5557    
5558     ## TODO:
5559 wakaba 1.24
5560     if (not $self->{stop_processing} and
5561     not $self->{document}->xml_standalone) {
5562     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5563     level => $self->{level}->{info});
5564     $self->{stop_processing} = 1;
5565     }
5566    
5567 wakaba 1.12
5568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5569     $self->{line_prev} = $self->{line};
5570     $self->{column_prev} = $self->{column};
5571     $self->{column}++;
5572     $self->{nc}
5573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5574     } else {
5575     $self->{set_nc}->($self);
5576     }
5577    
5578     redo A;
5579     } elsif ($self->{nc} == 0x005D) { # ]
5580 wakaba 1.13 delete $self->{in_subset};
5581 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5582    
5583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5584     $self->{line_prev} = $self->{line};
5585     $self->{column_prev} = $self->{column};
5586     $self->{column}++;
5587     $self->{nc}
5588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5589     } else {
5590     $self->{set_nc}->($self);
5591     }
5592    
5593     redo A;
5594     } elsif ($is_space->{$self->{nc}}) {
5595     ## Stay in the state.
5596    
5597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5598     $self->{line_prev} = $self->{line};
5599     $self->{column_prev} = $self->{column};
5600     $self->{column}++;
5601     $self->{nc}
5602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5603     } else {
5604     $self->{set_nc}->($self);
5605     }
5606    
5607     redo A;
5608     } elsif ($self->{nc} == -1) {
5609     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5610 wakaba 1.13 delete $self->{in_subset};
5611 wakaba 1.12 $self->{state} = DATA_STATE;
5612     $self->{s_kwd} = '';
5613     ## Reconsume.
5614 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5615 wakaba 1.12 redo A;
5616     } else {
5617     unless ($self->{internal_subset_tainted}) {
5618     ## XML5: No parse error.
5619     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5620     $self->{internal_subset_tainted} = 1;
5621     }
5622     ## Stay in the state.
5623    
5624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625     $self->{line_prev} = $self->{line};
5626     $self->{column_prev} = $self->{column};
5627     $self->{column}++;
5628     $self->{nc}
5629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630     } else {
5631     $self->{set_nc}->($self);
5632     }
5633    
5634     redo A;
5635     }
5636     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5637     if ($self->{nc} == 0x003E) { # >
5638     $self->{state} = DATA_STATE;
5639     $self->{s_kwd} = '';
5640    
5641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5642     $self->{line_prev} = $self->{line};
5643     $self->{column_prev} = $self->{column};
5644     $self->{column}++;
5645     $self->{nc}
5646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5647     } else {
5648     $self->{set_nc}->($self);
5649     }
5650    
5651 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5652 wakaba 1.12 redo A;
5653     } elsif ($self->{nc} == -1) {
5654     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5655     $self->{state} = DATA_STATE;
5656     $self->{s_kwd} = '';
5657     ## Reconsume.
5658 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5659 wakaba 1.12 redo A;
5660     } else {
5661     ## XML5: No parse error and stay in the state.
5662     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5663    
5664 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5665    
5666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667     $self->{line_prev} = $self->{line};
5668     $self->{column_prev} = $self->{column};
5669     $self->{column}++;
5670     $self->{nc}
5671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672     } else {
5673     $self->{set_nc}->($self);
5674     }
5675    
5676     redo A;
5677     }
5678     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5679     if ($self->{nc} == 0x003E) { # >
5680     $self->{state} = DATA_STATE;
5681     $self->{s_kwd} = '';
5682    
5683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5684     $self->{line_prev} = $self->{line};
5685     $self->{column_prev} = $self->{column};
5686     $self->{column}++;
5687     $self->{nc}
5688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5689     } else {
5690     $self->{set_nc}->($self);
5691     }
5692    
5693     return ({type => END_OF_DOCTYPE_TOKEN});
5694     redo A;
5695     } elsif ($self->{nc} == -1) {
5696     $self->{state} = DATA_STATE;
5697     $self->{s_kwd} = '';
5698     ## Reconsume.
5699     return ({type => END_OF_DOCTYPE_TOKEN});
5700     redo A;
5701     } else {
5702     ## Stay in the state.
5703    
5704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5705     $self->{line_prev} = $self->{line};
5706     $self->{column_prev} = $self->{column};
5707     $self->{column}++;
5708     $self->{nc}
5709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5710     } else {
5711     $self->{set_nc}->($self);
5712     }
5713    
5714     redo A;
5715     }
5716     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5717     if ($self->{nc} == 0x0021) { # !
5718 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5719 wakaba 1.13
5720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5721     $self->{line_prev} = $self->{line};
5722     $self->{column_prev} = $self->{column};
5723     $self->{column}++;
5724     $self->{nc}
5725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5726     } else {
5727     $self->{set_nc}->($self);
5728     }
5729    
5730     redo A;
5731     } elsif ($self->{nc} == 0x003F) { # ?
5732     $self->{state} = PI_STATE;
5733    
5734     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5735     $self->{line_prev} = $self->{line};
5736     $self->{column_prev} = $self->{column};
5737     $self->{column}++;
5738     $self->{nc}
5739     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5740     } else {
5741     $self->{set_nc}->($self);
5742     }
5743    
5744     redo A;
5745     } elsif ($self->{nc} == -1) {
5746     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5747     $self->{state} = DATA_STATE;
5748     $self->{s_kwd} = '';
5749     ## Reconsume.
5750     redo A;
5751     } else {
5752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5753     line => $self->{line_prev},
5754     column => $self->{column_prev});
5755     $self->{state} = BOGUS_COMMENT_STATE;
5756     $self->{ct} = {type => COMMENT_TOKEN,
5757     data => '',
5758     }; ## NOTE: Will be discarded.
5759 wakaba 1.12
5760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5761     $self->{line_prev} = $self->{line};
5762     $self->{column_prev} = $self->{column};
5763     $self->{column}++;
5764     $self->{nc}
5765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5766     } else {
5767     $self->{set_nc}->($self);
5768     }
5769    
5770     redo A;
5771     }
5772 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5773     ## XML5: "DOCTYPE markup declaration state".
5774    
5775     if ($self->{nc} == 0x002D) { # -
5776     $self->{state} = MD_HYPHEN_STATE;
5777    
5778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5779     $self->{line_prev} = $self->{line};
5780     $self->{column_prev} = $self->{column};
5781     $self->{column}++;
5782     $self->{nc}
5783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5784     } else {
5785     $self->{set_nc}->($self);
5786     }
5787    
5788     redo A;
5789 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5790     $self->{nc} == 0x0065) { # e
5791 wakaba 1.14 $self->{state} = MD_E_STATE;
5792     $self->{kwd} = chr $self->{nc};
5793    
5794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5795     $self->{line_prev} = $self->{line};
5796     $self->{column_prev} = $self->{column};
5797     $self->{column}++;
5798     $self->{nc}
5799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5800     } else {
5801     $self->{set_nc}->($self);
5802     }
5803    
5804     redo A;
5805 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5806     $self->{nc} == 0x0061) { # a
5807 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5808     $self->{kwd} = chr $self->{nc};
5809    
5810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5811     $self->{line_prev} = $self->{line};
5812     $self->{column_prev} = $self->{column};
5813     $self->{column}++;
5814     $self->{nc}
5815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5816     } else {
5817     $self->{set_nc}->($self);
5818     }
5819    
5820     redo A;
5821 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5822     $self->{nc} == 0x006E) { # n
5823 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5824     $self->{kwd} = chr $self->{nc};
5825    
5826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5827     $self->{line_prev} = $self->{line};
5828     $self->{column_prev} = $self->{column};
5829     $self->{column}++;
5830     $self->{nc}
5831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5832     } else {
5833     $self->{set_nc}->($self);
5834     }
5835    
5836     redo A;
5837     } else {
5838     #
5839     }
5840    
5841     ## XML5: No parse error.
5842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5843     line => $self->{line_prev},
5844     column => $self->{column_prev} - 1);
5845     ## Reconsume.
5846     $self->{state} = BOGUS_COMMENT_STATE;
5847     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5848     redo A;
5849     } elsif ($self->{state} == MD_E_STATE) {
5850 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5851     $self->{nc} == 0x006E) { # n
5852 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5853     $self->{kwd} .= chr $self->{nc};
5854    
5855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5856     $self->{line_prev} = $self->{line};
5857     $self->{column_prev} = $self->{column};
5858     $self->{column}++;
5859     $self->{nc}
5860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5861     } else {
5862     $self->{set_nc}->($self);
5863     }
5864    
5865     redo A;
5866 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5867     $self->{nc} == 0x006C) { # l
5868 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5869     $self->{state} = MD_ELEMENT_STATE;
5870     $self->{kwd} .= chr $self->{nc};
5871    
5872     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5873     $self->{line_prev} = $self->{line};
5874     $self->{column_prev} = $self->{column};
5875     $self->{column}++;
5876     $self->{nc}
5877     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5878     } else {
5879     $self->{set_nc}->($self);
5880     }
5881    
5882     redo A;
5883     } else {
5884     ## XML5: No parse error.
5885     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5886     line => $self->{line_prev},
5887     column => $self->{column_prev} - 2
5888     + 1 * ($self->{nc} == -1));
5889     ## Reconsume.
5890     $self->{state} = BOGUS_COMMENT_STATE;
5891     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5892     redo A;
5893     }
5894     } elsif ($self->{state} == MD_ENTITY_STATE) {
5895 wakaba 1.17 if ($self->{nc} == [
5896     undef,
5897     undef,
5898     0x0054, # T
5899     0x0049, # I
5900     0x0054, # T
5901     ]->[length $self->{kwd}] or
5902     $self->{nc} == [
5903     undef,
5904     undef,
5905     0x0074, # t
5906     0x0069, # i
5907     0x0074, # t
5908     ]->[length $self->{kwd}]) {
5909 wakaba 1.14 ## Stay in the state.
5910     $self->{kwd} .= chr $self->{nc};
5911    
5912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5913     $self->{line_prev} = $self->{line};
5914     $self->{column_prev} = $self->{column};
5915     $self->{column}++;
5916     $self->{nc}
5917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5918     } else {
5919     $self->{set_nc}->($self);
5920     }
5921    
5922     redo A;
5923 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5924     ($self->{nc} == 0x0059 or # Y
5925     $self->{nc} == 0x0079)) { # y
5926     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5928     text => 'ENTITY',
5929     line => $self->{line_prev},
5930     column => $self->{column_prev} - 4);
5931     }
5932     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5933 wakaba 1.14 line => $self->{line_prev},
5934     column => $self->{column_prev} - 6};
5935     $self->{state} = DOCTYPE_MD_STATE;
5936    
5937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938     $self->{line_prev} = $self->{line};
5939     $self->{column_prev} = $self->{column};
5940     $self->{column}++;
5941     $self->{nc}
5942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943     } else {
5944     $self->{set_nc}->($self);
5945     }
5946    
5947     redo A;
5948     } else {
5949     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5950     line => $self->{line_prev},
5951     column => $self->{column_prev} - 1
5952     - (length $self->{kwd})
5953     + 1 * ($self->{nc} == -1));
5954     $self->{state} = BOGUS_COMMENT_STATE;
5955     ## Reconsume.
5956     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5957     redo A;
5958     }
5959     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5960 wakaba 1.17 if ($self->{nc} == [
5961     undef,
5962     undef,
5963     0x0045, # E
5964     0x004D, # M
5965     0x0045, # E
5966     0x004E, # N
5967     ]->[length $self->{kwd}] or
5968     $self->{nc} == [
5969     undef,
5970     undef,
5971     0x0065, # e
5972     0x006D, # m
5973     0x0065, # e
5974     0x006E, # n
5975     ]->[length $self->{kwd}]) {
5976 wakaba 1.14 ## Stay in the state.
5977     $self->{kwd} .= chr $self->{nc};
5978    
5979     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5980     $self->{line_prev} = $self->{line};
5981     $self->{column_prev} = $self->{column};
5982     $self->{column}++;
5983     $self->{nc}
5984     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5985     } else {
5986     $self->{set_nc}->($self);
5987     }
5988    
5989     redo A;
5990 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5991     ($self->{nc} == 0x0054 or # T
5992     $self->{nc} == 0x0074)) { # t
5993     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5994     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5995     text => 'ELEMENT',
5996     line => $self->{line_prev},
5997     column => $self->{column_prev} - 5);
5998     }
5999 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6000     line => $self->{line_prev},
6001 wakaba 1.23 column => $self->{column_prev} - 7};
6002 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6003    
6004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6005     $self->{line_prev} = $self->{line};
6006     $self->{column_prev} = $self->{column};
6007     $self->{column}++;
6008     $self->{nc}
6009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6010     } else {
6011     $self->{set_nc}->($self);
6012     }
6013    
6014     redo A;
6015     } else {
6016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6017     line => $self->{line_prev},
6018     column => $self->{column_prev} - 1
6019     - (length $self->{kwd})
6020     + 1 * ($self->{nc} == -1));
6021     $self->{state} = BOGUS_COMMENT_STATE;
6022     ## Reconsume.
6023     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6024     redo A;
6025     }
6026     } elsif ($self->{state} == MD_ATTLIST_STATE) {
6027 wakaba 1.17 if ($self->{nc} == [
6028     undef,
6029     0x0054, # T
6030     0x0054, # T
6031     0x004C, # L
6032     0x0049, # I
6033     0x0053, # S
6034     ]->[length $self->{kwd}] or
6035     $self->{nc} == [
6036     undef,
6037     0x0074, # t
6038     0x0074, # t
6039     0x006C, # l
6040     0x0069, # i
6041     0x0073, # s
6042     ]->[length $self->{kwd}]) {
6043 wakaba 1.14 ## Stay in the state.
6044     $self->{kwd} .= chr $self->{nc};
6045    
6046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047     $self->{line_prev} = $self->{line};
6048     $self->{column_prev} = $self->{column};
6049     $self->{column}++;
6050     $self->{nc}
6051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052     } else {
6053     $self->{set_nc}->($self);
6054     }
6055    
6056     redo A;
6057 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6058     ($self->{nc} == 0x0054 or # T
6059     $self->{nc} == 0x0074)) { # t
6060     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6062     text => 'ATTLIST',
6063     line => $self->{line_prev},
6064     column => $self->{column_prev} - 5);
6065     }
6066 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6067 wakaba 1.15 attrdefs => [],
6068 wakaba 1.14 line => $self->{line_prev},
6069 wakaba 1.23 column => $self->{column_prev} - 7};
6070 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6071    
6072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6073     $self->{line_prev} = $self->{line};
6074     $self->{column_prev} = $self->{column};
6075     $self->{column}++;
6076     $self->{nc}
6077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6078     } else {
6079     $self->{set_nc}->($self);
6080     }
6081    
6082     redo A;
6083     } else {
6084     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6085     line => $self->{line_prev},
6086     column => $self->{column_prev} - 1
6087     - (length $self->{kwd})
6088     + 1 * ($self->{nc} == -1));
6089     $self->{state} = BOGUS_COMMENT_STATE;
6090     ## Reconsume.
6091     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6092     redo A;
6093     }
6094     } elsif ($self->{state} == MD_NOTATION_STATE) {
6095 wakaba 1.17 if ($self->{nc} == [
6096     undef,
6097     0x004F, # O
6098     0x0054, # T
6099     0x0041, # A
6100     0x0054, # T
6101     0x0049, # I
6102     0x004F, # O
6103     ]->[length $self->{kwd}] or
6104     $self->{nc} == [
6105     undef,
6106     0x006F, # o
6107     0x0074, # t
6108     0x0061, # a
6109     0x0074, # t
6110     0x0069, # i
6111     0x006F, # o
6112     ]->[length $self->{kwd}]) {
6113 wakaba 1.14 ## Stay in the state.
6114     $self->{kwd} .= chr $self->{nc};
6115    
6116     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6117     $self->{line_prev} = $self->{line};
6118     $self->{column_prev} = $self->{column};
6119     $self->{column}++;
6120     $self->{nc}
6121     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6122     } else {
6123     $self->{set_nc}->($self);
6124     }
6125    
6126     redo A;
6127 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6128     ($self->{nc} == 0x004E or # N
6129     $self->{nc} == 0x006E)) { # n
6130     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6131     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6132     text => 'NOTATION',
6133     line => $self->{line_prev},
6134     column => $self->{column_prev} - 6);
6135     }
6136 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6137     line => $self->{line_prev},
6138 wakaba 1.23 column => $self->{column_prev} - 8};
6139 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6140    
6141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6142     $self->{line_prev} = $self->{line};
6143     $self->{column_prev} = $self->{column};
6144     $self->{column}++;
6145     $self->{nc}
6146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6147     } else {
6148     $self->{set_nc}->($self);
6149     }
6150    
6151     redo A;
6152     } else {
6153     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6154     line => $self->{line_prev},
6155     column => $self->{column_prev} - 1
6156     - (length $self->{kwd})
6157     + 1 * ($self->{nc} == -1));
6158     $self->{state} = BOGUS_COMMENT_STATE;
6159     ## Reconsume.
6160     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6161     redo A;
6162     }
6163     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6164     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6165     ## "DOCTYPE NOTATION state".
6166    
6167     if ($is_space->{$self->{nc}}) {
6168     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6169     $self->{state} = BEFORE_MD_NAME_STATE;
6170    
6171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6172     $self->{line_prev} = $self->{line};
6173     $self->{column_prev} = $self->{column};
6174     $self->{column}++;
6175     $self->{nc}
6176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6177     } else {
6178     $self->{set_nc}->($self);
6179     }
6180    
6181     redo A;
6182     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6183     $self->{nc} == 0x0025) { # %
6184     ## XML5: Switch to the "DOCTYPE bogus comment state".
6185     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6186     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6187    
6188     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6189     $self->{line_prev} = $self->{line};
6190     $self->{column_prev} = $self->{column};
6191     $self->{column}++;
6192     $self->{nc}
6193     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6194     } else {
6195     $self->{set_nc}->($self);
6196     }
6197    
6198     redo A;
6199     } elsif ($self->{nc} == -1) {
6200     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6201     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6202     ## Reconsume.
6203     redo A;
6204     } elsif ($self->{nc} == 0x003E) { # >
6205     ## XML5: Switch to the "DOCTYPE bogus comment state".
6206     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6208    
6209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6210     $self->{line_prev} = $self->{line};
6211     $self->{column_prev} = $self->{column};
6212     $self->{column}++;
6213     $self->{nc}
6214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6215     } else {
6216     $self->{set_nc}->($self);
6217     }
6218    
6219     redo A;
6220     } else {
6221     ## XML5: Switch to the "DOCTYPE bogus comment state".
6222     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6223     $self->{state} = BEFORE_MD_NAME_STATE;
6224     redo A;
6225     }
6226     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6227     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6228     ## before state", "DOCTYPE ATTLIST name before state".
6229    
6230     if ($is_space->{$self->{nc}}) {
6231     ## Stay in the state.
6232    
6233     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6234     $self->{line_prev} = $self->{line};
6235     $self->{column_prev} = $self->{column};
6236     $self->{column}++;
6237     $self->{nc}
6238     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6239     } else {
6240     $self->{set_nc}->($self);
6241     }
6242    
6243     redo A;
6244     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6245     $self->{nc} == 0x0025) { # %
6246     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6247    
6248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6249     $self->{line_prev} = $self->{line};
6250     $self->{column_prev} = $self->{column};
6251     $self->{column}++;
6252     $self->{nc}
6253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6254     } else {
6255     $self->{set_nc}->($self);
6256     }
6257    
6258     redo A;
6259     } elsif ($self->{nc} == 0x003E) { # >
6260     ## XML5: Same as "Anything else".
6261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6262     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263    
6264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265     $self->{line_prev} = $self->{line};
6266     $self->{column_prev} = $self->{column};
6267     $self->{column}++;
6268     $self->{nc}
6269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270     } else {
6271     $self->{set_nc}->($self);
6272     }
6273    
6274     redo A;
6275     } elsif ($self->{nc} == -1) {
6276     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6277     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6278     ## Reconsume.
6279     redo A;
6280     } else {
6281     ## XML5: [ATTLIST] Not defined yet.
6282     $self->{ct}->{name} .= chr $self->{nc};
6283     $self->{state} = MD_NAME_STATE;
6284    
6285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6286     $self->{line_prev} = $self->{line};
6287     $self->{column_prev} = $self->{column};
6288     $self->{column}++;
6289     $self->{nc}
6290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6291     } else {
6292     $self->{set_nc}->($self);
6293     }
6294    
6295     redo A;
6296     }
6297     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6298     if ($is_space->{$self->{nc}}) {
6299     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6300     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6301     $self->{state} = BEFORE_MD_NAME_STATE;
6302 wakaba 1.8
6303 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304     $self->{line_prev} = $self->{line};
6305     $self->{column_prev} = $self->{column};
6306     $self->{column}++;
6307     $self->{nc}
6308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309     } else {
6310     $self->{set_nc}->($self);
6311     }
6312    
6313     redo A;
6314     } elsif ($self->{nc} == 0x003E) { # >
6315     ## XML5: Same as "Anything else".
6316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6317     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6318    
6319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320     $self->{line_prev} = $self->{line};
6321     $self->{column_prev} = $self->{column};
6322     $self->{column}++;
6323     $self->{nc}
6324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325     } else {
6326     $self->{set_nc}->($self);
6327     }
6328    
6329     redo A;
6330     } elsif ($self->{nc} == -1) {
6331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6332     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6333     ## Reconsume.
6334     redo A;
6335     } else {
6336     ## XML5: No parse error.
6337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6338     $self->{state} = BOGUS_COMMENT_STATE;
6339     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6340     ## Reconsume.
6341     redo A;
6342     }
6343     } elsif ($self->{state} == MD_NAME_STATE) {
6344     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6345    
6346     if ($is_space->{$self->{nc}}) {
6347 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6348     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6349     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6350 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6351 wakaba 1.16 } else { # ENTITY/NOTATION
6352     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6353     }
6354 wakaba 1.14
6355     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6356     $self->{line_prev} = $self->{line};
6357     $self->{column_prev} = $self->{column};
6358     $self->{column}++;
6359     $self->{nc}
6360     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6361     } else {
6362     $self->{set_nc}->($self);
6363     }
6364    
6365     redo A;
6366     } elsif ($self->{nc} == 0x003E) { # >
6367     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6368     #
6369     } else {
6370 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6371 wakaba 1.14 }
6372     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6373    
6374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6375     $self->{line_prev} = $self->{line};
6376     $self->{column_prev} = $self->{column};
6377     $self->{column}++;
6378     $self->{nc}
6379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6380     } else {
6381     $self->{set_nc}->($self);
6382     }
6383    
6384     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6385     redo A;
6386     } elsif ($self->{nc} == -1) {
6387     ## XML5: [ATTLIST] No parse error.
6388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6389     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6390     ## Reconsume.
6391     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6392     redo A;
6393     } else {
6394     ## XML5: [ATTLIST] Not defined yet.
6395     $self->{ct}->{name} .= chr $self->{nc};
6396     ## Stay in the state.
6397    
6398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399     $self->{line_prev} = $self->{line};
6400     $self->{column_prev} = $self->{column};
6401     $self->{column}++;
6402     $self->{nc}
6403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404     } else {
6405     $self->{set_nc}->($self);
6406     }
6407    
6408     redo A;
6409     }
6410     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6411     if ($is_space->{$self->{nc}}) {
6412     ## Stay in the state.
6413    
6414     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6415     $self->{line_prev} = $self->{line};
6416     $self->{column_prev} = $self->{column};
6417     $self->{column}++;
6418     $self->{nc}
6419     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6420     } else {
6421     $self->{set_nc}->($self);
6422     }
6423    
6424     redo A;
6425     } elsif ($self->{nc} == 0x003E) { # >
6426     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6427    
6428     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6429     $self->{line_prev} = $self->{line};
6430     $self->{column_prev} = $self->{column};
6431     $self->{column}++;
6432     $self->{nc}
6433     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6434     } else {
6435     $self->{set_nc}->($self);
6436     }
6437    
6438     return ($self->{ct}); # ATTLIST
6439     redo A;
6440     } elsif ($self->{nc} == -1) {
6441     ## XML5: No parse error.
6442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6443     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6444 wakaba 1.15 return ($self->{ct});
6445 wakaba 1.14 redo A;
6446     } else {
6447     ## XML5: Not defined yet.
6448 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6449     tokens => [],
6450     line => $self->{line}, column => $self->{column}};
6451     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6452    
6453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454     $self->{line_prev} = $self->{line};
6455     $self->{column_prev} = $self->{column};
6456     $self->{column}++;
6457     $self->{nc}
6458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459     } else {
6460     $self->{set_nc}->($self);
6461     }
6462    
6463     redo A;
6464     }
6465     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6466     if ($is_space->{$self->{nc}}) {
6467     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6468    
6469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470     $self->{line_prev} = $self->{line};
6471     $self->{column_prev} = $self->{column};
6472     $self->{column}++;
6473     $self->{nc}
6474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475     } else {
6476     $self->{set_nc}->($self);
6477     }
6478    
6479     redo A;
6480     } elsif ($self->{nc} == 0x003E) { # >
6481     ## XML5: Same as "anything else".
6482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6483     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6484    
6485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6486     $self->{line_prev} = $self->{line};
6487     $self->{column_prev} = $self->{column};
6488     $self->{column}++;
6489     $self->{nc}
6490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6491     } else {
6492     $self->{set_nc}->($self);
6493     }
6494    
6495     return ($self->{ct}); # ATTLIST
6496     redo A;
6497     } elsif ($self->{nc} == 0x0028) { # (
6498     ## XML5: Same as "anything else".
6499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6500     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6501    
6502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503     $self->{line_prev} = $self->{line};
6504     $self->{column_prev} = $self->{column};
6505     $self->{column}++;
6506     $self->{nc}
6507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508     } else {
6509     $self->{set_nc}->($self);
6510     }
6511    
6512     redo A;
6513     } elsif ($self->{nc} == -1) {
6514     ## XML5: No parse error.
6515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6516     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6517    
6518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6519     $self->{line_prev} = $self->{line};
6520     $self->{column_prev} = $self->{column};
6521     $self->{column}++;
6522     $self->{nc}
6523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6524     } else {
6525     $self->{set_nc}->($self);
6526     }
6527    
6528     return ($self->{ct}); # ATTLIST
6529     redo A;
6530     } else {
6531     ## XML5: Not defined yet.
6532     $self->{ca}->{name} .= chr $self->{nc};
6533     ## Stay in the state.
6534    
6535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536     $self->{line_prev} = $self->{line};
6537     $self->{column_prev} = $self->{column};
6538     $self->{column}++;
6539     $self->{nc}
6540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541     } else {
6542     $self->{set_nc}->($self);
6543     }
6544    
6545 wakaba 1.14 redo A;
6546     }
6547 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6548     if ($is_space->{$self->{nc}}) {
6549     ## Stay in the state.
6550    
6551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6552     $self->{line_prev} = $self->{line};
6553     $self->{column_prev} = $self->{column};
6554     $self->{column}++;
6555     $self->{nc}
6556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6557     } else {
6558     $self->{set_nc}->($self);
6559     }
6560    
6561     redo A;
6562     } elsif ($self->{nc} == 0x003E) { # >
6563     ## XML5: Same as "anything else".
6564     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6565     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6566    
6567     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6568     $self->{line_prev} = $self->{line};
6569     $self->{column_prev} = $self->{column};
6570     $self->{column}++;
6571     $self->{nc}
6572     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6573     } else {
6574     $self->{set_nc}->($self);
6575     }
6576    
6577     return ($self->{ct}); # ATTLIST
6578     redo A;
6579     } elsif ($self->{nc} == 0x0028) { # (
6580     ## XML5: Same as "anything else".
6581     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6582    
6583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6584     $self->{line_prev} = $self->{line};
6585     $self->{column_prev} = $self->{column};
6586     $self->{column}++;
6587     $self->{nc}
6588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6589     } else {
6590     $self->{set_nc}->($self);
6591     }
6592    
6593     redo A;
6594     } elsif ($self->{nc} == -1) {
6595     ## XML5: No parse error.
6596     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6597     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6598    
6599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6600     $self->{line_prev} = $self->{line};
6601     $self->{column_prev} = $self->{column};
6602     $self->{column}++;
6603     $self->{nc}
6604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6605     } else {
6606     $self->{set_nc}->($self);
6607     }
6608    
6609     return ($self->{ct});
6610     redo A;
6611     } else {
6612     ## XML5: Not defined yet.
6613     $self->{ca}->{type} = chr $self->{nc};
6614     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6615    
6616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617     $self->{line_prev} = $self->{line};
6618     $self->{column_prev} = $self->{column};
6619     $self->{column}++;
6620     $self->{nc}
6621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622     } else {
6623     $self->{set_nc}->($self);
6624     }
6625    
6626     redo A;
6627     }
6628     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6629     if ($is_space->{$self->{nc}}) {
6630     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6631    
6632     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6633     $self->{line_prev} = $self->{line};
6634     $self->{column_prev} = $self->{column};
6635     $self->{column}++;
6636     $self->{nc}
6637     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6638     } else {
6639     $self->{set_nc}->($self);
6640     }
6641    
6642     redo A;
6643     } elsif ($self->{nc} == 0x0023) { # #
6644     ## XML5: Same as "anything else".
6645     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6646     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6647    
6648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6649     $self->{line_prev} = $self->{line};
6650     $self->{column_prev} = $self->{column};
6651     $self->{column}++;
6652     $self->{nc}
6653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6654     } else {
6655     $self->{set_nc}->($self);
6656     }
6657    
6658     redo A;
6659     } elsif ($self->{nc} == 0x0022) { # "
6660     ## XML5: Same as "anything else".
6661     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6662     $self->{ca}->{value} = '';
6663     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6664    
6665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6666     $self->{line_prev} = $self->{line};
6667     $self->{column_prev} = $self->{column};
6668     $self->{column}++;
6669     $self->{nc}
6670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6671     } else {
6672     $self->{set_nc}->($self);
6673     }
6674    
6675     redo A;
6676     } elsif ($self->{nc} == 0x0027) { # '
6677     ## XML5: Same as "anything else".
6678     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6679     $self->{ca}->{value} = '';
6680     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6681    
6682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6683     $self->{line_prev} = $self->{line};
6684     $self->{column_prev} = $self->{column};
6685     $self->{column}++;
6686     $self->{nc}
6687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6688     } else {
6689     $self->{set_nc}->($self);
6690     }
6691    
6692     redo A;
6693     } elsif ($self->{nc} == 0x003E) { # >
6694     ## XML5: Same as "anything else".
6695     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6696     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6697    
6698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699     $self->{line_prev} = $self->{line};
6700     $self->{column_prev} = $self->{column};
6701     $self->{column}++;
6702     $self->{nc}
6703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704     } else {
6705     $self->{set_nc}->($self);
6706     }
6707    
6708     return ($self->{ct}); # ATTLIST
6709     redo A;
6710     } elsif ($self->{nc} == 0x0028) { # (
6711     ## XML5: Same as "anything else".
6712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6713     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6714    
6715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716     $self->{line_prev} = $self->{line};
6717     $self->{column_prev} = $self->{column};
6718     $self->{column}++;
6719     $self->{nc}
6720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721     } else {
6722     $self->{set_nc}->($self);
6723     }
6724    
6725     redo A;
6726     } elsif ($self->{nc} == -1) {
6727     ## XML5: No parse error.
6728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6730    
6731     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732     $self->{line_prev} = $self->{line};
6733     $self->{column_prev} = $self->{column};
6734     $self->{column}++;
6735     $self->{nc}
6736     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737     } else {
6738     $self->{set_nc}->($self);
6739     }
6740    
6741     return ($self->{ct});
6742     redo A;
6743     } else {
6744     ## XML5: Not defined yet.
6745     $self->{ca}->{type} .= chr $self->{nc};
6746     ## Stay in the state.
6747    
6748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6749     $self->{line_prev} = $self->{line};
6750     $self->{column_prev} = $self->{column};
6751     $self->{column}++;
6752     $self->{nc}
6753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6754     } else {
6755     $self->{set_nc}->($self);
6756     }
6757    
6758     redo A;
6759     }
6760     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6761     if ($is_space->{$self->{nc}}) {
6762     ## Stay in the state.
6763    
6764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765     $self->{line_prev} = $self->{line};
6766     $self->{column_prev} = $self->{column};
6767     $self->{column}++;
6768     $self->{nc}
6769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770     } else {
6771     $self->{set_nc}->($self);
6772     }
6773    
6774     redo A;
6775     } elsif ($self->{nc} == 0x0028) { # (
6776     ## XML5: Same as "anything else".
6777     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6778    
6779     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6780     $self->{line_prev} = $self->{line};
6781     $self->{column_prev} = $self->{column};
6782     $self->{column}++;
6783     $self->{nc}
6784     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6785     } else {
6786     $self->{set_nc}->($self);
6787     }
6788    
6789     redo A;
6790     } elsif ($self->{nc} == 0x0023) { # #
6791     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6792    
6793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794     $self->{line_prev} = $self->{line};
6795     $self->{column_prev} = $self->{column};
6796     $self->{column}++;
6797     $self->{nc}
6798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799     } else {
6800     $self->{set_nc}->($self);
6801     }
6802    
6803     redo A;
6804     } elsif ($self->{nc} == 0x0022) { # "
6805     ## XML5: Same as "anything else".
6806     $self->{ca}->{value} = '';
6807     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6808    
6809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6810     $self->{line_prev} = $self->{line};
6811     $self->{column_prev} = $self->{column};
6812     $self->{column}++;
6813     $self->{nc}
6814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6815     } else {
6816     $self->{set_nc}->($self);
6817     }
6818    
6819     redo A;
6820     } elsif ($self->{nc} == 0x0027) { # '
6821     ## XML5: Same as "anything else".
6822     $self->{ca}->{value} = '';
6823     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6824    
6825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826     $self->{line_prev} = $self->{line};
6827     $self->{column_prev} = $self->{column};
6828     $self->{column}++;
6829     $self->{nc}
6830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831     } else {
6832     $self->{set_nc}->($self);
6833     }
6834    
6835     redo A;
6836     } elsif ($self->{nc} == 0x003E) { # >
6837     ## XML5: Same as "anything else".
6838     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6839     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6840    
6841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842     $self->{line_prev} = $self->{line};
6843     $self->{column_prev} = $self->{column};
6844     $self->{column}++;
6845     $self->{nc}
6846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847     } else {
6848     $self->{set_nc}->($self);
6849     }
6850    
6851     return ($self->{ct}); # ATTLIST
6852     redo A;
6853     } elsif ($self->{nc} == -1) {
6854     ## XML5: No parse error.
6855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6857    
6858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6859     $self->{line_prev} = $self->{line};
6860     $self->{column_prev} = $self->{column};
6861     $self->{column}++;
6862     $self->{nc}
6863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6864     } else {
6865     $self->{set_nc}->($self);
6866     }
6867    
6868     return ($self->{ct});
6869     redo A;
6870     } else {
6871     ## XML5: Switch to the "DOCTYPE bogus comment state".
6872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6873     $self->{ca}->{value} = '';
6874     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6875     ## Reconsume.
6876     redo A;
6877     }
6878     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6879     if ($is_space->{$self->{nc}}) {
6880     ## Stay in the state.
6881    
6882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6883     $self->{line_prev} = $self->{line};
6884     $self->{column_prev} = $self->{column};
6885     $self->{column}++;
6886     $self->{nc}
6887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6888     } else {
6889     $self->{set_nc}->($self);
6890     }
6891    
6892     redo A;
6893     } elsif ($self->{nc} == 0x007C) { # |
6894     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6895     ## Stay in the state.
6896    
6897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6898     $self->{line_prev} = $self->{line};
6899     $self->{column_prev} = $self->{column};
6900     $self->{column}++;
6901     $self->{nc}
6902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6903     } else {
6904     $self->{set_nc}->($self);
6905     }
6906    
6907     redo A;
6908     } elsif ($self->{nc} == 0x0029) { # )
6909     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6910     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6911    
6912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6913     $self->{line_prev} = $self->{line};
6914     $self->{column_prev} = $self->{column};
6915     $self->{column}++;
6916     $self->{nc}
6917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6918     } else {
6919     $self->{set_nc}->($self);
6920     }
6921    
6922     redo A;
6923     } elsif ($self->{nc} == 0x003E) { # >
6924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6925     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6926    
6927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928     $self->{line_prev} = $self->{line};
6929     $self->{column_prev} = $self->{column};
6930     $self->{column}++;
6931     $self->{nc}
6932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933     } else {
6934     $self->{set_nc}->($self);
6935     }
6936    
6937     return ($self->{ct}); # ATTLIST
6938     redo A;
6939     } elsif ($self->{nc} == -1) {
6940     ## XML5: No parse error.
6941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6942     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6943    
6944     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6945     $self->{line_prev} = $self->{line};
6946     $self->{column_prev} = $self->{column};
6947     $self->{column}++;
6948     $self->{nc}
6949     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6950     } else {
6951     $self->{set_nc}->($self);
6952     }
6953    
6954     return ($self->{ct});
6955     redo A;
6956     } else {
6957     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6958     $self->{state} = ALLOWED_TOKEN_STATE;
6959    
6960     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6961     $self->{line_prev} = $self->{line};
6962     $self->{column_prev} = $self->{column};
6963     $self->{column}++;
6964     $self->{nc}
6965     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6966     } else {
6967     $self->{set_nc}->($self);
6968     }
6969    
6970     redo A;
6971     }
6972     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6973     if ($is_space->{$self->{nc}}) {
6974     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6975    
6976     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6977     $self->{line_prev} = $self->{line};
6978     $self->{column_prev} = $self->{column};
6979     $self->{column}++;
6980     $self->{nc}
6981     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6982     } else {
6983     $self->{set_nc}->($self);
6984     }
6985    
6986     redo A;
6987     } elsif ($self->{nc} == 0x007C) { # |
6988     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6989    
6990     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6991     $self->{line_prev} = $self->{line};
6992     $self->{column_prev} = $self->{column};
6993     $self->{column}++;
6994     $self->{nc}
6995     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6996     } else {
6997     $self->{set_nc}->($self);
6998     }
6999    
7000     redo A;
7001     } elsif ($self->{nc} == 0x0029) { # )
7002     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7003    
7004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005     $self->{line_prev} = $self->{line};
7006     $self->{column_prev} = $self->{column};
7007     $self->{column}++;
7008     $self->{nc}
7009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010     } else {
7011     $self->{set_nc}->($self);
7012     }
7013    
7014     redo A;
7015     } elsif ($self->{nc} == 0x003E) { # >
7016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7017     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7018    
7019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020     $self->{line_prev} = $self->{line};
7021     $self->{column_prev} = $self->{column};
7022     $self->{column}++;
7023     $self->{nc}
7024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025     } else {
7026     $self->{set_nc}->($self);
7027     }
7028    
7029     return ($self->{ct}); # ATTLIST
7030     redo A;
7031     } elsif ($self->{nc} == -1) {
7032     ## XML5: No parse error.
7033     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7034     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7035    
7036     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7037     $self->{line_prev} = $self->{line};
7038     $self->{column_prev} = $self->{column};
7039     $self->{column}++;
7040     $self->{nc}
7041     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7042     } else {
7043     $self->{set_nc}->($self);
7044     }
7045    
7046     return ($self->{ct});
7047     redo A;
7048     } else {
7049     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7050     ## Stay in the state.
7051    
7052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7053     $self->{line_prev} = $self->{line};
7054     $self->{column_prev} = $self->{column};
7055     $self->{column}++;
7056     $self->{nc}
7057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7058     } else {
7059     $self->{set_nc}->($self);
7060     }
7061    
7062     redo A;
7063     }
7064     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7065     if ($is_space->{$self->{nc}}) {
7066     ## Stay in the state.
7067    
7068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069     $self->{line_prev} = $self->{line};
7070     $self->{column_prev} = $self->{column};
7071     $self->{column}++;
7072     $self->{nc}
7073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074     } else {
7075     $self->{set_nc}->($self);
7076     }
7077    
7078     redo A;
7079     } elsif ($self->{nc} == 0x007C) { # |
7080     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7081    
7082     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7083     $self->{line_prev} = $self->{line};
7084     $self->{column_prev} = $self->{column};
7085     $self->{column}++;
7086     $self->{nc}
7087     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7088     } else {
7089     $self->{set_nc}->($self);
7090     }
7091    
7092     redo A;
7093     } elsif ($self->{nc} == 0x0029) { # )
7094     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7095    
7096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7097     $self->{line_prev} = $self->{line};
7098     $self->{column_prev} = $self->{column};
7099     $self->{column}++;
7100     $self->{nc}
7101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7102     } else {
7103     $self->{set_nc}->($self);
7104     }
7105    
7106     redo A;
7107     } elsif ($self->{nc} == 0x003E) { # >
7108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7110    
7111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7112     $self->{line_prev} = $self->{line};
7113     $self->{column_prev} = $self->{column};
7114     $self->{column}++;
7115     $self->{nc}
7116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7117     } else {
7118     $self->{set_nc}->($self);
7119     }
7120    
7121     return ($self->{ct}); # ATTLIST
7122     redo A;
7123     } elsif ($self->{nc} == -1) {
7124     ## XML5: No parse error.
7125     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7126     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7127    
7128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7129     $self->{line_prev} = $self->{line};
7130     $self->{column_prev} = $self->{column};
7131     $self->{column}++;
7132     $self->{nc}
7133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7134     } else {
7135     $self->{set_nc}->($self);
7136     }
7137    
7138     return ($self->{ct});
7139     redo A;
7140     } else {
7141     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7142     line => $self->{line_prev},
7143     column => $self->{column_prev});
7144     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7145     $self->{state} = ALLOWED_TOKEN_STATE;
7146    
7147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7148     $self->{line_prev} = $self->{line};
7149     $self->{column_prev} = $self->{column};
7150     $self->{column}++;
7151     $self->{nc}
7152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7153     } else {
7154     $self->{set_nc}->($self);
7155     }
7156    
7157     redo A;
7158     }
7159     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7160     if ($is_space->{$self->{nc}}) {
7161     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7162    
7163     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164     $self->{line_prev} = $self->{line};
7165     $self->{column_prev} = $self->{column};
7166     $self->{column}++;
7167     $self->{nc}
7168     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169     } else {
7170     $self->{set_nc}->($self);
7171     }
7172    
7173     redo A;
7174     } elsif ($self->{nc} == 0x0023) { # #
7175     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7176     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7177    
7178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7179     $self->{line_prev} = $self->{line};
7180     $self->{column_prev} = $self->{column};
7181     $self->{column}++;
7182     $self->{nc}
7183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7184     } else {
7185     $self->{set_nc}->($self);
7186     }
7187    
7188     redo A;
7189     } elsif ($self->{nc} == 0x0022) { # "
7190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7191     $self->{ca}->{value} = '';
7192     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7193    
7194     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7195     $self->{line_prev} = $self->{line};
7196     $self->{column_prev} = $self->{column};
7197     $self->{column}++;
7198     $self->{nc}
7199     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7200     } else {
7201     $self->{set_nc}->($self);
7202     }
7203    
7204     redo A;
7205     } elsif ($self->{nc} == 0x0027) { # '
7206     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7207     $self->{ca}->{value} = '';
7208     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7209    
7210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7211     $self->{line_prev} = $self->{line};
7212     $self->{column_prev} = $self->{column};
7213     $self->{column}++;
7214     $self->{nc}
7215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7216     } else {
7217     $self->{set_nc}->($self);
7218     }
7219    
7220     redo A;
7221     } elsif ($self->{nc} == 0x003E) { # >
7222     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7224    
7225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7226     $self->{line_prev} = $self->{line};
7227     $self->{column_prev} = $self->{column};
7228     $self->{column}++;
7229     $self->{nc}
7230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7231     } else {
7232     $self->{set_nc}->($self);
7233     }
7234    
7235     return ($self->{ct}); # ATTLIST
7236     redo A;
7237     } elsif ($self->{nc} == -1) {
7238     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7240    
7241     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7242     $self->{line_prev} = $self->{line};
7243     $self->{column_prev} = $self->{column};
7244     $self->{column}++;
7245     $self->{nc}
7246     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7247     } else {
7248     $self->{set_nc}->($self);
7249     }
7250    
7251     return ($self->{ct});
7252     redo A;
7253     } else {
7254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7255     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7256     ## Reconsume.
7257     redo A;
7258     }
7259     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7260     if ($is_space->{$self->{nc}}) {
7261     ## Stay in the state.
7262    
7263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7264     $self->{line_prev} = $self->{line};
7265     $self->{column_prev} = $self->{column};
7266     $self->{column}++;
7267     $self->{nc}
7268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7269     } else {
7270     $self->{set_nc}->($self);
7271     }
7272    
7273     redo A;
7274     } elsif ($self->{nc} == 0x0023) { # #
7275     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7276    
7277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7278     $self->{line_prev} = $self->{line};
7279     $self->{column_prev} = $self->{column};
7280     $self->{column}++;
7281     $self->{nc}
7282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7283     } else {
7284     $self->{set_nc}->($self);
7285     }
7286    
7287     redo A;
7288     } elsif ($self->{nc} == 0x0022) { # "
7289     $self->{ca}->{value} = '';
7290     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7291    
7292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293     $self->{line_prev} = $self->{line};
7294     $self->{column_prev} = $self->{column};
7295     $self->{column}++;
7296     $self->{nc}
7297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298     } else {
7299     $self->{set_nc}->($self);
7300     }
7301    
7302     redo A;
7303     } elsif ($self->{nc} == 0x0027) { # '
7304     $self->{ca}->{value} = '';
7305     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7306    
7307     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308     $self->{line_prev} = $self->{line};
7309     $self->{column_prev} = $self->{column};
7310     $self->{column}++;
7311     $self->{nc}
7312     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313     } else {
7314     $self->{set_nc}->($self);
7315     }
7316    
7317     redo A;
7318     } elsif ($self->{nc} == 0x003E) { # >
7319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7320     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7321    
7322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323     $self->{line_prev} = $self->{line};
7324     $self->{column_prev} = $self->{column};
7325     $self->{column}++;
7326     $self->{nc}
7327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328     } else {
7329     $self->{set_nc}->($self);
7330     }
7331    
7332     return ($self->{ct}); # ATTLIST
7333     redo A;
7334     } elsif ($self->{nc} == -1) {
7335     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7336     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7337    
7338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339     $self->{line_prev} = $self->{line};
7340     $self->{column_prev} = $self->{column};
7341     $self->{column}++;
7342     $self->{nc}
7343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344     } else {
7345     $self->{set_nc}->($self);
7346     }
7347    
7348     return ($self->{ct});
7349     redo A;
7350     } else {
7351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7352     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7353     ## Reconsume.
7354     redo A;
7355     }
7356     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7357     if ($is_space->{$self->{nc}}) {
7358     ## XML5: No parse error.
7359     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7360 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7361 wakaba 1.15 ## Reconsume.
7362     redo A;
7363     } elsif ($self->{nc} == 0x0022) { # "
7364     ## XML5: Same as "anything else".
7365     $self->{ca}->{value} = '';
7366     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7367    
7368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369     $self->{line_prev} = $self->{line};
7370     $self->{column_prev} = $self->{column};
7371     $self->{column}++;
7372     $self->{nc}
7373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374     } else {
7375     $self->{set_nc}->($self);
7376     }
7377    
7378     redo A;
7379     } elsif ($self->{nc} == 0x0027) { # '
7380     ## XML5: Same as "anything else".
7381     $self->{ca}->{value} = '';
7382     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7383    
7384     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385     $self->{line_prev} = $self->{line};
7386     $self->{column_prev} = $self->{column};
7387     $self->{column}++;
7388     $self->{nc}
7389     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390     } else {
7391     $self->{set_nc}->($self);
7392     }
7393    
7394     redo A;
7395     } elsif ($self->{nc} == 0x003E) { # >
7396     ## XML5: Same as "anything else".
7397     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7399    
7400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401     $self->{line_prev} = $self->{line};
7402     $self->{column_prev} = $self->{column};
7403     $self->{column}++;
7404     $self->{nc}
7405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406     } else {
7407     $self->{set_nc}->($self);
7408     }
7409    
7410     return ($self->{ct}); # ATTLIST
7411     redo A;
7412     } elsif ($self->{nc} == -1) {
7413     ## XML5: No parse error.
7414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7415     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7416    
7417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7418     $self->{line_prev} = $self->{line};
7419     $self->{column_prev} = $self->{column};
7420     $self->{column}++;
7421     $self->{nc}
7422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7423     } else {
7424     $self->{set_nc}->($self);
7425     }
7426    
7427     return ($self->{ct});
7428     redo A;
7429     } else {
7430     $self->{ca}->{default} = chr $self->{nc};
7431     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7432    
7433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7434     $self->{line_prev} = $self->{line};
7435     $self->{column_prev} = $self->{column};
7436     $self->{column}++;
7437     $self->{nc}
7438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7439     } else {
7440     $self->{set_nc}->($self);
7441     }
7442    
7443     redo A;
7444     }
7445     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7446     if ($is_space->{$self->{nc}}) {
7447     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7448    
7449     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7450     $self->{line_prev} = $self->{line};
7451     $self->{column_prev} = $self->{column};
7452     $self->{column}++;
7453     $self->{nc}
7454     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7455     } else {
7456     $self->{set_nc}->($self);
7457     }
7458    
7459     redo A;
7460     } elsif ($self->{nc} == 0x0022) { # "
7461     ## XML5: Same as "anything else".
7462     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7463     $self->{ca}->{value} = '';
7464     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7465    
7466     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7467     $self->{line_prev} = $self->{line};
7468     $self->{column_prev} = $self->{column};
7469     $self->{column}++;
7470     $self->{nc}
7471     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7472     } else {
7473     $self->{set_nc}->($self);
7474     }
7475    
7476     redo A;
7477     } elsif ($self->{nc} == 0x0027) { # '
7478     ## XML5: Same as "anything else".
7479     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7480     $self->{ca}->{value} = '';
7481     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7482    
7483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7484     $self->{line_prev} = $self->{line};
7485     $self->{column_prev} = $self->{column};
7486     $self->{column}++;
7487     $self->{nc}
7488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7489     } else {
7490     $self->{set_nc}->($self);
7491     }
7492    
7493     redo A;
7494     } elsif ($self->{nc} == 0x003E) { # >
7495     ## XML5: Same as "anything else".
7496     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7497     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7498    
7499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7500     $self->{line_prev} = $self->{line};
7501     $self->{column_prev} = $self->{column};
7502     $self->{column}++;
7503     $self->{nc}
7504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7505     } else {
7506     $self->{set_nc}->($self);
7507     }
7508    
7509     return ($self->{ct}); # ATTLIST
7510     redo A;
7511     } elsif ($self->{nc} == -1) {
7512     ## XML5: No parse error.
7513     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7514     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7515     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7516    
7517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7518     $self->{line_prev} = $self->{line};
7519     $self->{column_prev} = $self->{column};
7520     $self->{column}++;
7521     $self->{nc}
7522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7523     } else {
7524     $self->{set_nc}->($self);
7525     }
7526    
7527     return ($self->{ct});
7528     redo A;
7529     } else {
7530     $self->{ca}->{default} .= chr $self->{nc};
7531     ## Stay in the state.
7532    
7533     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7534     $self->{line_prev} = $self->{line};
7535     $self->{column_prev} = $self->{column};
7536     $self->{column}++;
7537     $self->{nc}
7538     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7539     } else {
7540     $self->{set_nc}->($self);
7541     }
7542    
7543     redo A;
7544     }
7545     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7546     if ($is_space->{$self->{nc}}) {
7547     ## Stay in the state.
7548    
7549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7550     $self->{line_prev} = $self->{line};
7551     $self->{column_prev} = $self->{column};
7552     $self->{column}++;
7553     $self->{nc}
7554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7555     } else {
7556     $self->{set_nc}->($self);
7557     }
7558    
7559     redo A;
7560     } elsif ($self->{nc} == 0x0022) { # "
7561     $self->{ca}->{value} = '';
7562     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7563    
7564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7565     $self->{line_prev} = $self->{line};
7566     $self->{column_prev} = $self->{column};
7567     $self->{column}++;
7568     $self->{nc}
7569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7570     } else {
7571     $self->{set_nc}->($self);
7572     }
7573    
7574     redo A;
7575     } elsif ($self->{nc} == 0x0027) { # '
7576     $self->{ca}->{value} = '';
7577     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7578    
7579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7580     $self->{line_prev} = $self->{line};
7581     $self->{column_prev} = $self->{column};
7582     $self->{column}++;
7583     $self->{nc}
7584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7585     } else {
7586     $self->{set_nc}->($self);
7587     }
7588    
7589     redo A;
7590     } elsif ($self->{nc} == 0x003E) { # >
7591     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7592     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7593    
7594     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7595     $self->{line_prev} = $self->{line};
7596     $self->{column_prev} = $self->{column};
7597     $self->{column}++;
7598     $self->{nc}
7599     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7600     } else {
7601     $self->{set_nc}->($self);
7602     }
7603    
7604     return ($self->{ct}); # ATTLIST
7605     redo A;
7606     } elsif ($self->{nc} == -1) {
7607     ## XML5: No parse error.
7608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7609     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7610     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7611    
7612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7613     $self->{line_prev} = $self->{line};
7614     $self->{column_prev} = $self->{column};
7615     $self->{column}++;
7616     $self->{nc}
7617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7618     } else {
7619     $self->{set_nc}->($self);
7620     }
7621    
7622     return ($self->{ct});
7623     redo A;
7624     } else {
7625     ## XML5: Not defined yet.
7626     if ($self->{ca}->{default} eq 'FIXED') {
7627     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7628     } else {
7629     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7630     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7631     }
7632     ## Reconsume.
7633     redo A;
7634     }
7635     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7636     if ($is_space->{$self->{nc}} or
7637     $self->{nc} == -1 or
7638     $self->{nc} == 0x003E) { # >
7639     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7640     ## Reconsume.
7641     redo A;
7642     } else {
7643     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7644     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7645     ## Reconsume.
7646     redo A;
7647 wakaba 1.16 }
7648 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7649     ## ASCII case-insensitive
7650     if ($self->{nc} == [
7651     undef,
7652     0x0044, # D
7653     0x0041, # A
7654     0x0054, # T
7655     ]->[length $self->{kwd}] or
7656     $self->{nc} == [
7657     undef,
7658     0x0064, # d
7659     0x0061, # a
7660     0x0074, # t
7661     ]->[length $self->{kwd}]) {
7662    
7663     ## Stay in the state.
7664     $self->{kwd} .= chr $self->{nc};
7665    
7666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7667     $self->{line_prev} = $self->{line};
7668     $self->{column_prev} = $self->{column};
7669     $self->{column}++;
7670     $self->{nc}
7671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7672     } else {
7673     $self->{set_nc}->($self);
7674     }
7675    
7676     redo A;
7677     } elsif ((length $self->{kwd}) == 4 and
7678     ($self->{nc} == 0x0041 or # A
7679     $self->{nc} == 0x0061)) { # a
7680     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7681    
7682     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7683     text => 'NDATA',
7684     line => $self->{line_prev},
7685     column => $self->{column_prev} - 4);
7686     } else {
7687    
7688     }
7689     $self->{state} = AFTER_NDATA_STATE;
7690    
7691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7692     $self->{line_prev} = $self->{line};
7693     $self->{column_prev} = $self->{column};
7694     $self->{column}++;
7695     $self->{nc}
7696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7697     } else {
7698     $self->{set_nc}->($self);
7699     }
7700    
7701     redo A;
7702     } else {
7703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7704     line => $self->{line_prev},
7705     column => $self->{column_prev} + 1
7706     - length $self->{kwd});
7707    
7708     $self->{state} = BOGUS_MD_STATE;
7709     ## Reconsume.
7710     redo A;
7711     }
7712     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7713     if ($is_space->{$self->{nc}}) {
7714     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7715    
7716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7717     $self->{line_prev} = $self->{line};
7718     $self->{column_prev} = $self->{column};
7719     $self->{column}++;
7720     $self->{nc}
7721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7722     } else {
7723     $self->{set_nc}->($self);
7724     }
7725    
7726     redo A;
7727     } elsif ($self->{nc} == 0x003E) { # >
7728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7730    
7731     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7732     $self->{line_prev} = $self->{line};
7733     $self->{column_prev} = $self->{column};
7734     $self->{column}++;
7735     $self->{nc}
7736     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7737     } else {
7738     $self->{set_nc}->($self);
7739     }
7740    
7741     return ($self->{ct}); # ENTITY
7742     redo A;
7743     } elsif ($self->{nc} == -1) {
7744     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7746    
7747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7748     $self->{line_prev} = $self->{line};
7749     $self->{column_prev} = $self->{column};
7750     $self->{column}++;
7751     $self->{nc}
7752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7753     } else {
7754     $self->{set_nc}->($self);
7755     }
7756    
7757     return ($self->{ct}); # ENTITY
7758     redo A;
7759     } else {
7760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7761     line => $self->{line_prev},
7762     column => $self->{column_prev} + 1
7763     - length $self->{kwd});
7764     $self->{state} = BOGUS_MD_STATE;
7765     ## Reconsume.
7766     redo A;
7767     }
7768     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7769     if ($is_space->{$self->{nc}}) {
7770     ## Stay in the state.
7771    
7772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7773     $self->{line_prev} = $self->{line};
7774     $self->{column_prev} = $self->{column};
7775     $self->{column}++;
7776     $self->{nc}
7777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7778     } else {
7779     $self->{set_nc}->($self);
7780     }
7781    
7782     redo A;
7783     } elsif ($self->{nc} == 0x003E) { # >
7784     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7785     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7786    
7787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788     $self->{line_prev} = $self->{line};
7789     $self->{column_prev} = $self->{column};
7790     $self->{column}++;
7791     $self->{nc}
7792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793     } else {
7794     $self->{set_nc}->($self);
7795     }
7796    
7797     return ($self->{ct}); # ENTITY
7798     redo A;
7799     } elsif ($self->{nc} == -1) {
7800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7801     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7802    
7803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804     $self->{line_prev} = $self->{line};
7805     $self->{column_prev} = $self->{column};
7806     $self->{column}++;
7807     $self->{nc}
7808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809     } else {
7810     $self->{set_nc}->($self);
7811     }
7812    
7813     return ($self->{ct}); # ENTITY
7814     redo A;
7815     } else {
7816     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7817     $self->{state} = NOTATION_NAME_STATE;
7818    
7819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7820     $self->{line_prev} = $self->{line};
7821     $self->{column_prev} = $self->{column};
7822     $self->{column}++;
7823     $self->{nc}
7824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7825     } else {
7826     $self->{set_nc}->($self);
7827     }
7828    
7829     redo A;
7830     }
7831     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7832     if ($is_space->{$self->{nc}}) {
7833 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7834 wakaba 1.18
7835     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7836     $self->{line_prev} = $self->{line};
7837     $self->{column_prev} = $self->{column};
7838     $self->{column}++;
7839     $self->{nc}
7840     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7841     } else {
7842     $self->{set_nc}->($self);
7843     }
7844    
7845     redo A;
7846     } elsif ($self->{nc} == 0x003E) { # >
7847     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7848    
7849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7850     $self->{line_prev} = $self->{line};
7851     $self->{column_prev} = $self->{column};
7852     $self->{column}++;
7853     $self->{nc}
7854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7855     } else {
7856     $self->{set_nc}->($self);
7857     }
7858    
7859     return ($self->{ct}); # ENTITY
7860     redo A;
7861     } elsif ($self->{nc} == -1) {
7862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7863     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7864    
7865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7866     $self->{line_prev} = $self->{line};
7867     $self->{column_prev} = $self->{column};
7868     $self->{column}++;
7869     $self->{nc}
7870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7871     } else {
7872     $self->{set_nc}->($self);
7873     }
7874    
7875     return ($self->{ct}); # ENTITY
7876     redo A;
7877     } else {
7878     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7879     ## Stay in the state.
7880    
7881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7882     $self->{line_prev} = $self->{line};
7883     $self->{column_prev} = $self->{column};
7884     $self->{column}++;
7885     $self->{nc}
7886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7887     } else {
7888     $self->{set_nc}->($self);
7889     }
7890    
7891     redo A;
7892     }
7893 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7894     if ($self->{nc} == 0x0022) { # "
7895 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7896 wakaba 1.19
7897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7898     $self->{line_prev} = $self->{line};
7899     $self->{column_prev} = $self->{column};
7900     $self->{column}++;
7901     $self->{nc}
7902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7903     } else {
7904     $self->{set_nc}->($self);
7905     }
7906    
7907     redo A;
7908     } elsif ($self->{nc} == 0x0026) { # &
7909     $self->{prev_state} = $self->{state};
7910     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7911     $self->{entity_add} = 0x0022; # "
7912    
7913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7914     $self->{line_prev} = $self->{line};
7915     $self->{column_prev} = $self->{column};
7916     $self->{column}++;
7917     $self->{nc}
7918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7919     } else {
7920     $self->{set_nc}->($self);
7921     }
7922    
7923     redo A;
7924     ## TODO: %
7925     } elsif ($self->{nc} == -1) {
7926     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7927     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7928     ## Reconsume.
7929     return ($self->{ct}); # ENTITY
7930     redo A;
7931     } else {
7932     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7933    
7934     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7935     $self->{line_prev} = $self->{line};
7936     $self->{column_prev} = $self->{column};
7937     $self->{column}++;
7938     $self->{nc}
7939     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7940     } else {
7941     $self->{set_nc}->($self);
7942     }
7943    
7944     redo A;
7945     }
7946     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7947     if ($self->{nc} == 0x0027) { # '
7948 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7949 wakaba 1.19
7950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7951     $self->{line_prev} = $self->{line};
7952     $self->{column_prev} = $self->{column};
7953     $self->{column}++;
7954     $self->{nc}
7955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7956     } else {
7957     $self->{set_nc}->($self);
7958     }
7959    
7960     redo A;
7961     } elsif ($self->{nc} == 0x0026) { # &
7962     $self->{prev_state} = $self->{state};
7963     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7964     $self->{entity_add} = 0x0027; # '
7965    
7966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7967     $self->{line_prev} = $self->{line};
7968     $self->{column_prev} = $self->{column};
7969     $self->{column}++;
7970     $self->{nc}
7971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7972     } else {
7973     $self->{set_nc}->($self);
7974     }
7975    
7976     redo A;
7977     ## TODO: %
7978     } elsif ($self->{nc} == -1) {
7979     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7981     ## Reconsume.
7982     return ($self->{ct}); # ENTITY
7983     redo A;
7984     } else {
7985     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7986    
7987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7988     $self->{line_prev} = $self->{line};
7989     $self->{column_prev} = $self->{column};
7990     $self->{column}++;
7991     $self->{nc}
7992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7993     } else {
7994     $self->{set_nc}->($self);
7995     }
7996    
7997     redo A;
7998     }
7999     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8000     if ($is_space->{$self->{nc}} or
8001     {
8002     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8003     $self->{entity_add} => 1,
8004     }->{$self->{nc}}) {
8005 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8006     line => $self->{line_prev},
8007     column => $self->{column_prev}
8008     + ($self->{nc} == -1 ? 1 : 0));
8009 wakaba 1.19 ## Don't consume
8010     ## Return nothing.
8011     #
8012     } elsif ($self->{nc} == 0x0023) { # #
8013     $self->{ca} = $self->{ct};
8014     $self->{state} = ENTITY_HASH_STATE;
8015     $self->{kwd} = '#';
8016    
8017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8018     $self->{line_prev} = $self->{line};
8019     $self->{column_prev} = $self->{column};
8020     $self->{column}++;
8021     $self->{nc}
8022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8023     } else {
8024     $self->{set_nc}->($self);
8025     }
8026    
8027     redo A;
8028     } else {
8029     #
8030     }
8031    
8032     $self->{ct}->{value} .= '&';
8033     $self->{state} = $self->{prev_state};
8034     ## Reconsume.
8035     redo A;
8036 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8037     if ($is_space->{$self->{nc}}) {
8038     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8039    
8040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8041     $self->{line_prev} = $self->{line};
8042     $self->{column_prev} = $self->{column};
8043     $self->{column}++;
8044     $self->{nc}
8045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8046     } else {
8047     $self->{set_nc}->($self);
8048     }
8049    
8050     redo A;
8051     } elsif ($self->{nc} == 0x0028) { # (
8052     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8053     $self->{ct}->{content} = ['('];
8054     $self->{group_depth} = 1;
8055    
8056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8057     $self->{line_prev} = $self->{line};
8058     $self->{column_prev} = $self->{column};
8059     $self->{column}++;
8060     $self->{nc}
8061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8062     } else {
8063     $self->{set_nc}->($self);
8064     }
8065    
8066     redo A;
8067     } elsif ($self->{nc} == 0x003E) { # >
8068     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8070    
8071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8072     $self->{line_prev} = $self->{line};
8073     $self->{column_prev} = $self->{column};
8074     $self->{column}++;
8075     $self->{nc}
8076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8077     } else {
8078     $self->{set_nc}->($self);
8079     }
8080    
8081     return ($self->{ct}); # ELEMENT
8082     redo A;
8083     } elsif ($self->{nc} == -1) {
8084     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8085     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8086    
8087     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8088     $self->{line_prev} = $self->{line};
8089     $self->{column_prev} = $self->{column};
8090     $self->{column}++;
8091     $self->{nc}
8092     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8093     } else {
8094     $self->{set_nc}->($self);
8095     }
8096    
8097     return ($self->{ct}); # ELEMENT
8098     redo A;
8099     } else {
8100     $self->{ct}->{content} = [chr $self->{nc}];
8101     $self->{state} = CONTENT_KEYWORD_STATE;
8102    
8103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8104     $self->{line_prev} = $self->{line};
8105     $self->{column_prev} = $self->{column};
8106     $self->{column}++;
8107     $self->{nc}
8108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8109     } else {
8110     $self->{set_nc}->($self);
8111     }
8112    
8113     redo A;
8114     }
8115     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8116     if ($is_space->{$self->{nc}}) {
8117     $self->{state} = AFTER_MD_DEF_STATE;
8118    
8119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8120     $self->{line_prev} = $self->{line};
8121     $self->{column_prev} = $self->{column};
8122     $self->{column}++;
8123     $self->{nc}
8124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8125     } else {
8126     $self->{set_nc}->($self);
8127     }
8128    
8129     redo A;
8130     } elsif ($self->{nc} == 0x003E) { # >
8131     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8132    
8133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8134     $self->{line_prev} = $self->{line};
8135     $self->{column_prev} = $self->{column};
8136     $self->{column}++;
8137     $self->{nc}
8138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8139     } else {
8140     $self->{set_nc}->($self);
8141     }
8142    
8143     return ($self->{ct}); # ELEMENT
8144     redo A;
8145     } elsif ($self->{nc} == -1) {
8146     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8147     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8148    
8149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8150     $self->{line_prev} = $self->{line};
8151     $self->{column_prev} = $self->{column};
8152     $self->{column}++;
8153     $self->{nc}
8154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8155     } else {
8156     $self->{set_nc}->($self);
8157     }
8158    
8159     return ($self->{ct}); # ELEMENT
8160     redo A;
8161     } else {
8162     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8163     ## Stay in the state.
8164    
8165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8166     $self->{line_prev} = $self->{line};
8167     $self->{column_prev} = $self->{column};
8168     $self->{column}++;
8169     $self->{nc}
8170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8171     } else {
8172     $self->{set_nc}->($self);
8173     }
8174    
8175     redo A;
8176     }
8177     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8178     if ($is_space->{$self->{nc}}) {
8179     ## Stay in the state.
8180    
8181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8182     $self->{line_prev} = $self->{line};
8183     $self->{column_prev} = $self->{column};
8184     $self->{column}++;
8185     $self->{nc}
8186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8187     } else {
8188     $self->{set_nc}->($self);
8189     }
8190    
8191     redo A;
8192     } elsif ($self->{nc} == 0x0028) { # (
8193     $self->{group_depth}++;
8194     push @{$self->{ct}->{content}}, chr $self->{nc};
8195     ## Stay in the state.
8196    
8197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8198     $self->{line_prev} = $self->{line};
8199     $self->{column_prev} = $self->{column};
8200     $self->{column}++;
8201     $self->{nc}
8202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8203     } else {
8204     $self->{set_nc}->($self);
8205     }
8206    
8207     redo A;
8208     } elsif ($self->{nc} == 0x007C or # |
8209     $self->{nc} == 0x002C) { # ,
8210     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8211     ## Stay in the state.
8212    
8213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8214     $self->{line_prev} = $self->{line};
8215     $self->{column_prev} = $self->{column};
8216     $self->{column}++;
8217     $self->{nc}
8218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8219     } else {
8220     $self->{set_nc}->($self);
8221     }
8222    
8223     redo A;
8224     } elsif ($self->{nc} == 0x0029) { # )
8225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8226     push @{$self->{ct}->{content}}, chr $self->{nc};
8227     $self->{group_depth}--;
8228     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8229    
8230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231     $self->{line_prev} = $self->{line};
8232     $self->{column_prev} = $self->{column};
8233     $self->{column}++;
8234     $self->{nc}
8235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236     } else {
8237     $self->{set_nc}->($self);
8238     }
8239    
8240     redo A;
8241     } elsif ($self->{nc} == 0x003E) { # >
8242     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8243     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8244     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8245    
8246     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247     $self->{line_prev} = $self->{line};
8248     $self->{column_prev} = $self->{column};
8249     $self->{column}++;
8250     $self->{nc}
8251     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252     } else {
8253     $self->{set_nc}->($self);
8254     }
8255    
8256     return ($self->{ct}); # ELEMENT
8257     redo A;
8258     } elsif ($self->{nc} == -1) {
8259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8260     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8261     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8262    
8263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264     $self->{line_prev} = $self->{line};
8265     $self->{column_prev} = $self->{column};
8266     $self->{column}++;
8267     $self->{nc}
8268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269     } else {
8270     $self->{set_nc}->($self);
8271     }
8272    
8273     return ($self->{ct}); # ELEMENT
8274     redo A;
8275     } else {
8276     push @{$self->{ct}->{content}}, chr $self->{nc};
8277     $self->{state} = CM_ELEMENT_NAME_STATE;
8278    
8279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280     $self->{line_prev} = $self->{line};
8281     $self->{column_prev} = $self->{column};
8282     $self->{column}++;
8283     $self->{nc}
8284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285     } else {
8286     $self->{set_nc}->($self);
8287     }
8288    
8289     redo A;
8290     }
8291     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8292     if ($is_space->{$self->{nc}}) {
8293     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8294    
8295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8296     $self->{line_prev} = $self->{line};
8297     $self->{column_prev} = $self->{column};
8298     $self->{column}++;
8299     $self->{nc}
8300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8301     } else {
8302     $self->{set_nc}->($self);
8303     }
8304    
8305     redo A;
8306     } elsif ($self->{nc} == 0x002A or # *
8307     $self->{nc} == 0x002B or # +
8308     $self->{nc} == 0x003F) { # ?
8309     push @{$self->{ct}->{content}}, chr $self->{nc};
8310     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8311    
8312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8313     $self->{line_prev} = $self->{line};
8314     $self->{column_prev} = $self->{column};
8315     $self->{column}++;
8316     $self->{nc}
8317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8318     } else {
8319     $self->{set_nc}->($self);
8320     }
8321    
8322     redo A;
8323     } elsif ($self->{nc} == 0x007C or # |
8324     $self->{nc} == 0x002C) { # ,
8325     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8326     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8327    
8328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8329     $self->{line_prev} = $self->{line};
8330     $self->{column_prev} = $self->{column};
8331     $self->{column}++;
8332     $self->{nc}
8333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8334     } else {
8335     $self->{set_nc}->($self);
8336     }
8337    
8338     redo A;
8339     } elsif ($self->{nc} == 0x0029) { # )
8340     $self->{group_depth}--;
8341     push @{$self->{ct}->{content}}, chr $self->{nc};
8342     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8343    
8344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8345     $self->{line_prev} = $self->{line};
8346     $self->{column_prev} = $self->{column};
8347     $self->{column}++;
8348     $self->{nc}
8349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8350     } else {
8351     $self->{set_nc}->($self);
8352     }
8353    
8354     redo A;
8355     } elsif ($self->{nc} == 0x003E) { # >
8356     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8357     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8358     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8359    
8360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361     $self->{line_prev} = $self->{line};
8362     $self->{column_prev} = $self->{column};
8363     $self->{column}++;
8364     $self->{nc}
8365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366     } else {
8367     $self->{set_nc}->($self);
8368     }
8369    
8370     return ($self->{ct}); # ELEMENT
8371     redo A;
8372     } elsif ($self->{nc} == -1) {
8373     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8374     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8375     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8376    
8377     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8378     $self->{line_prev} = $self->{line};
8379     $self->{column_prev} = $self->{column};
8380     $self->{column}++;
8381     $self->{nc}
8382     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8383     } else {
8384     $self->{set_nc}->($self);
8385     }
8386    
8387     return ($self->{ct}); # ELEMENT
8388     redo A;
8389     } else {
8390     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8391     ## Stay in the state.
8392    
8393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8394     $self->{line_prev} = $self->{line};
8395     $self->{column_prev} = $self->{column};
8396     $self->{column}++;
8397     $self->{nc}
8398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8399     } else {
8400     $self->{set_nc}->($self);
8401     }
8402    
8403     redo A;
8404     }
8405     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8406     if ($is_space->{$self->{nc}}) {
8407     ## Stay in the state.
8408    
8409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8410     $self->{line_prev} = $self->{line};
8411     $self->{column_prev} = $self->{column};
8412     $self->{column}++;
8413     $self->{nc}
8414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8415     } else {
8416     $self->{set_nc}->($self);
8417     }
8418    
8419     redo A;
8420     } elsif ($self->{nc} == 0x007C or # |
8421     $self->{nc} == 0x002C) { # ,
8422     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8423     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8424    
8425     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8426     $self->{line_prev} = $self->{line};
8427     $self->{column_prev} = $self->{column};
8428     $self->{column}++;
8429     $self->{nc}
8430     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8431     } else {
8432     $self->{set_nc}->($self);
8433     }
8434    
8435     redo A;
8436     } elsif ($self->{nc} == 0x0029) { # )
8437     $self->{group_depth}--;
8438     push @{$self->{ct}->{content}}, chr $self->{nc};
8439     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8440    
8441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8442     $self->{line_prev} = $self->{line};
8443     $self->{column_prev} = $self->{column};
8444     $self->{column}++;
8445     $self->{nc}
8446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8447     } else {
8448     $self->{set_nc}->($self);
8449     }
8450    
8451     redo A;
8452     } elsif ($self->{nc} == 0x003E) { # >
8453     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8454     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8455     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8456    
8457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8458     $self->{line_prev} = $self->{line};
8459     $self->{column_prev} = $self->{column};
8460     $self->{column}++;
8461     $self->{nc}
8462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8463     } else {
8464     $self->{set_nc}->($self);
8465     }
8466    
8467     return ($self->{ct}); # ELEMENT
8468     redo A;
8469     } elsif ($self->{nc} == -1) {
8470     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8471     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8472     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8473    
8474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8475     $self->{line_prev} = $self->{line};
8476     $self->{column_prev} = $self->{column};
8477     $self->{column}++;
8478     $self->{nc}
8479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8480     } else {
8481     $self->{set_nc}->($self);
8482     }
8483    
8484     return ($self->{ct}); # ELEMENT
8485     redo A;
8486     } else {
8487     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8488     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8489     $self->{state} = BOGUS_MD_STATE;
8490    
8491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8492     $self->{line_prev} = $self->{line};
8493     $self->{column_prev} = $self->{column};
8494     $self->{column}++;
8495     $self->{nc}
8496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8497     } else {
8498     $self->{set_nc}->($self);
8499     }
8500    
8501     redo A;
8502     }
8503     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8504     if ($is_space->{$self->{nc}}) {
8505     if ($self->{group_depth}) {
8506     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8507     } else {
8508     $self->{state} = AFTER_MD_DEF_STATE;
8509     }
8510    
8511     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8512     $self->{line_prev} = $self->{line};
8513     $self->{column_prev} = $self->{column};
8514     $self->{column}++;
8515     $self->{nc}
8516     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8517     } else {
8518     $self->{set_nc}->($self);
8519     }
8520    
8521     redo A;
8522     } elsif ($self->{nc} == 0x002A or # *
8523     $self->{nc} == 0x002B or # +
8524     $self->{nc} == 0x003F) { # ?
8525     push @{$self->{ct}->{content}}, chr $self->{nc};
8526     if ($self->{group_depth}) {
8527     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8528     } else {
8529     $self->{state} = AFTER_MD_DEF_STATE;
8530     }
8531    
8532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8533     $self->{line_prev} = $self->{line};
8534     $self->{column_prev} = $self->{column};
8535     $self->{column}++;
8536     $self->{nc}
8537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8538     } else {
8539     $self->{set_nc}->($self);
8540     }
8541    
8542     redo A;
8543     } elsif ($self->{nc} == 0x0029) { # )
8544     if ($self->{group_depth}) {
8545     $self->{group_depth}--;
8546     push @{$self->{ct}->{content}}, chr $self->{nc};
8547     ## Stay in the state.
8548    
8549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8550     $self->{line_prev} = $self->{line};
8551     $self->{column_prev} = $self->{column};
8552     $self->{column}++;
8553     $self->{nc}
8554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8555     } else {
8556     $self->{set_nc}->($self);
8557     }
8558    
8559     redo A;
8560     } else {
8561     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8562     $self->{state} = BOGUS_MD_STATE;
8563     ## Reconsume.
8564     redo A;
8565     }
8566     } elsif ($self->{nc} == 0x003E) { # >
8567     if ($self->{group_depth}) {
8568     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8569     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8570     }
8571     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8572    
8573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8574     $self->{line_prev} = $self->{line};
8575     $self->{column_prev} = $self->{column};
8576     $self->{column}++;
8577     $self->{nc}
8578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8579     } else {
8580     $self->{set_nc}->($self);
8581     }
8582    
8583     return ($self->{ct}); # ELEMENT
8584     redo A;
8585     } elsif ($self->{nc} == -1) {
8586     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8587     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8588     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8589    
8590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8591     $self->{line_prev} = $self->{line};
8592     $self->{column_prev} = $self->{column};
8593     $self->{column}++;
8594     $self->{nc}
8595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8596     } else {
8597     $self->{set_nc}->($self);
8598     }
8599    
8600     return ($self->{ct}); # ELEMENT
8601     redo A;
8602     } else {
8603     if ($self->{group_depth}) {
8604     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8605     } else {
8606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8607     $self->{state} = BOGUS_MD_STATE;
8608     }
8609     ## Reconsume.
8610     redo A;
8611     }
8612     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8613 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8614     ## Stay in the state.
8615    
8616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617     $self->{line_prev} = $self->{line};
8618     $self->{column_prev} = $self->{column};
8619     $self->{column}++;
8620     $self->{nc}
8621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622     } else {
8623     $self->{set_nc}->($self);
8624     }
8625    
8626     redo A;
8627     } elsif ($self->{nc} == 0x003E) { # >
8628     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8629    
8630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8631     $self->{line_prev} = $self->{line};
8632     $self->{column_prev} = $self->{column};
8633     $self->{column}++;
8634     $self->{nc}
8635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8636     } else {
8637     $self->{set_nc}->($self);
8638     }
8639    
8640 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8641 wakaba 1.18 redo A;
8642     } elsif ($self->{nc} == -1) {
8643     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8644     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8645    
8646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8647     $self->{line_prev} = $self->{line};
8648     $self->{column_prev} = $self->{column};
8649     $self->{column}++;
8650     $self->{nc}
8651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8652     } else {
8653     $self->{set_nc}->($self);
8654     }
8655    
8656 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8657 wakaba 1.18 redo A;
8658     } else {
8659 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8660 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8661     ## Reconsume.
8662     redo A;
8663     }
8664 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8665     if ($self->{nc} == 0x003E) { # >
8666     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8667    
8668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8669     $self->{line_prev} = $self->{line};
8670     $self->{column_prev} = $self->{column};
8671     $self->{column}++;
8672     $self->{nc}
8673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8674     } else {
8675     $self->{set_nc}->($self);
8676     }
8677    
8678     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8679     redo A;
8680     } elsif ($self->{nc} == -1) {
8681     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8682     ## Reconsume.
8683     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8684     redo A;
8685     } else {
8686     ## Stay in the state.
8687    
8688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8689     $self->{line_prev} = $self->{line};
8690     $self->{column_prev} = $self->{column};
8691     $self->{column}++;
8692     $self->{nc}
8693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8694     } else {
8695     $self->{set_nc}->($self);
8696     }
8697    
8698     redo A;
8699     }
8700 wakaba 1.1 } else {
8701     die "$0: $self->{state}: Unknown state";
8702     }
8703     } # A
8704    
8705     die "$0: _get_next_token: unexpected case";
8706     } # _get_next_token
8707    
8708     1;
8709 wakaba 1.30 ## $Date: 2009/08/16 04:06:34 $
8710 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24