/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.26 - (hide annotations) (download)
Thu Jul 2 21:42:43 2009 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.25: +4 -3 lines
++ whatpm/t/ChangeLog	2 Jul 2009 21:41:03 -0000
2009-07-03  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: < in unquoted attribute value is no
	longer allowed (HTML5 revision 3206).

++ whatpm/Whatpm/HTML/ChangeLog	2 Jul 2009 21:42:34 -0000
2009-07-03  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: "<" in unquoted attribute values is now
	treated as parse error (HTML5 revision 3206).

package Whatpm::HTML::Tokenizer;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.25 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export setup.  Every token type constant can be imported either by
## name (via |@EXPORT_OK|) or all at once through the |:token| tag.
## Nothing is exported by default.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The token type constant names, in the order of their numeric
  ## values.  The same list feeds both |@EXPORT_OK| and the |:token|
  ## tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;

  ## A separate array reference, so that the tag list and |@EXPORT_OK|
  ## do not share storage.
  our %EXPORT_TAGS = (
    token => [@token_type_names],
  );
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric codes stored in the |type| field of a token.  Each one is
## a constant subroutine with an empty prototype.

## Token types used for HTML (and XML) documents.
sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }

## NOTE: For internal processing.
sub ABORT_TOKEN () { 8 }

## NOTE: The token types below are XML only.
sub PI_TOKEN () { 7 }
sub END_OF_DOCTYPE_TOKEN () { 9 }
sub ATTLIST_TOKEN () { 10 }
sub ELEMENT_TOKEN () { 11 }
sub GENERAL_ENTITY_TOKEN () { 12 }
sub PARAMETER_ENTITY_TOKEN () { 13 }
sub NOTATION_TOKEN () { 14 }
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## Individual capability bits of a content model.
sub CM_ENTITY () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the bits above.
## PLAINTEXT enables none of them.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL () { CM_FULL_MARKUP | CM_ENTITY }
87    
## Tokenizer states
##
## Numeric identifiers compared against |$self->{state}|.  The numbers
## 1 and 12 are not assigned; the states that used to own them are kept
## below as comments.

## Data and tag states
sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }

## Attribute states
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }

## Markup declaration and comment states
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }

## DOCTYPE states
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }

## Remaining states
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }

## States that subdivide a single state of the spec; the trailing
## comment names the spec state concerned.
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec

## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }

sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
## XML-only states
##
## Numbering continues from the states shared with HTML (51 onwards).

## Processing instruction states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }

## DOCTYPE internal subset states
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }

## Markup declaration keyword and name states
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }

## ATTLIST declaration states
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }

## NDATA, notation name, and entity value states
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }

## ELEMENT declaration (content model) states
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }

## End of a markup declaration
sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## Both constants are the single bit 11 (2048).
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
sub FOREIGN_EL () { 1 << 11 }
203    
204     ## Character reference mappings
205    
## Maps a code point to the code point that replaces it.  Judging by
## the section heading this is applied to character references; the
## code that consults the table is not shown here.
my $charref_map = {
  0x0D => 0x000A,
};

## Replacements for 0x80..0x9F, listed in code point order (eight per
## row).
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
);

## Code points that are always replaced by U+FFFD.
for my $code_point (
  0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
  0xD800 .. 0xDFFF,
  0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
  ## The last two code points (xxFFFE and xxFFFF) of each of the
  ## planes 0x00..0x10.
  (map {
    my $plane_base = $_ << 16;
    ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
  } 0x00 .. 0x10),
) {
  $charref_map->{$code_point} = 0xFFFD;
}
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Resets the tokenizer fields of the parser object and reads the
## first input character.
##
## Argument: the parser object (a hash reference).  Its |set_nc| field
## must hold a code reference; it is invoked once, with the object as
## its only argument, to obtain the first input character.
##
## Returns the value of the last assignment, i.e. the new (empty)
## |$self->{token}| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to consume at this point: the first input
  ## character always comes from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
312     ## while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## Lookup table of the code points treated as space characters; a
## member maps to 1.  U+000B (VT) and U+000D (CR) are not members.
my $is_space = +{
  map { ($_ => 1) }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
};
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not explicitly
926     ## state that the next input character is to be reconsumed,
927     ## it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "</{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
## $before_leave: closure invoked by the branches of the attribute name
## state just before they switch to another state or emit the tag token.
## It commits the attribute under construction ($self->{ca}) to the
## current tag token ($self->{ct}).  Takes no arguments; it reads and
## updates $self->{ca} and $self->{ct} and may call $self->{parse_error}.
1281 wakaba 1.1 my $before_leave = sub {
         ## Has an attribute with this name already been recorded on the
         ## token?  NOTE: Testing |exists| on the nested element makes Perl
         ## autovivify $self->{ct}->{attributes} when it is not yet
         ## present, so later truth tests of that slot succeed once this
         ## closure has run.
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
         ## Duplicate: report the error at the position stored in the new
         ## attribute and do not store it; the first occurrence is kept.
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
1287     } else {
1288    
         ## First occurrence: store the attribute under its name and
         ## assign it the next value of the token's |last_index| counter.
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1744 wakaba 1.1
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819 wakaba 1.25 } elsif ($self->{is_xml} and
1820     $is_space->{$self->{nc}}) {
1821    
1822     $self->{ca}->{value} .= ' ';
1823     ## Stay in the state.
1824    
1825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826     $self->{line_prev} = $self->{line};
1827     $self->{column_prev} = $self->{column};
1828     $self->{column}++;
1829     $self->{nc}
1830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831     } else {
1832     $self->{set_nc}->($self);
1833     }
1834    
1835     redo A;
1836 wakaba 1.1 } elsif ($self->{nc} == -1) {
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839    
1840     $self->{last_stag_name} = $self->{ct}->{tag_name};
1841 wakaba 1.15
1842     $self->{state} = DATA_STATE;
1843     $self->{s_kwd} = '';
1844     ## reconsume
1845     return ($self->{ct}); # start tag
1846     redo A;
1847 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849     if ($self->{ct}->{attributes}) {
1850    
1851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1852     } else {
1853     ## NOTE: This state should never be reached.
1854    
1855     }
1856 wakaba 1.15
1857     $self->{state} = DATA_STATE;
1858     $self->{s_kwd} = '';
1859     ## reconsume
1860     return ($self->{ct}); # end tag
1861     redo A;
1862     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863     ## XML5: No parse error above; not defined yet.
1864     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866     ## Reconsume.
1867     return ($self->{ct}); # ATTLIST
1868     redo A;
1869 wakaba 1.1 } else {
1870     die "$0: $self->{ct}->{type}: Unknown token type";
1871     }
1872     } else {
1873 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1874 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875    
1876     ## XML5: Not a parse error.
1877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878     } else {
1879    
1880     }
1881 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1882     $self->{read_until}->($self->{ca}->{value},
1883 wakaba 1.25 qq["&<\x09\x0C\x20],
1884 wakaba 1.1 length $self->{ca}->{value});
1885    
1886     ## Stay in the state
1887    
1888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1889     $self->{line_prev} = $self->{line};
1890     $self->{column_prev} = $self->{column};
1891     $self->{column}++;
1892     $self->{nc}
1893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1894     } else {
1895     $self->{set_nc}->($self);
1896     }
1897    
1898     redo A;
1899     }
1900     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902     ## ATTLIST attribute value single quoted state".
1903 wakaba 1.11
1904 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1905 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906    
1907     ## XML5: "DOCTYPE ATTLIST name after state".
1908     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910     } else {
1911    
1912     ## XML5: "Before attribute name state" (sic).
1913     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914     }
1915 wakaba 1.1
1916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917     $self->{line_prev} = $self->{line};
1918     $self->{column_prev} = $self->{column};
1919     $self->{column}++;
1920     $self->{nc}
1921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1922     } else {
1923     $self->{set_nc}->($self);
1924     }
1925    
1926     redo A;
1927     } elsif ($self->{nc} == 0x0026) { # &
1928    
1929 wakaba 1.11 ## XML5: Not defined yet.
1930    
1931 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1932     ## "entity in attribute value state". In this implementation, the
1933     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1934     ## implementation of the "consume a character reference" algorithm.
1935     $self->{entity_add} = 0x0027; # '
1936     $self->{prev_state} = $self->{state};
1937     $self->{state} = ENTITY_STATE;
1938    
1939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940     $self->{line_prev} = $self->{line};
1941     $self->{column_prev} = $self->{column};
1942     $self->{column}++;
1943     $self->{nc}
1944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945     } else {
1946     $self->{set_nc}->($self);
1947     }
1948    
1949     redo A;
1950 wakaba 1.25 } elsif ($self->{is_xml} and
1951     $is_space->{$self->{nc}}) {
1952    
1953     $self->{ca}->{value} .= ' ';
1954     ## Stay in the state.
1955    
1956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957     $self->{line_prev} = $self->{line};
1958     $self->{column_prev} = $self->{column};
1959     $self->{column}++;
1960     $self->{nc}
1961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962     } else {
1963     $self->{set_nc}->($self);
1964     }
1965    
1966     redo A;
1967 wakaba 1.1 } elsif ($self->{nc} == -1) {
1968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970    
1971     $self->{last_stag_name} = $self->{ct}->{tag_name};
1972 wakaba 1.15
1973     $self->{state} = DATA_STATE;
1974     $self->{s_kwd} = '';
1975     ## reconsume
1976     return ($self->{ct}); # start tag
1977     redo A;
1978 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980     if ($self->{ct}->{attributes}) {
1981    
1982     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1983     } else {
1984     ## NOTE: This state should never be reached.
1985    
1986     }
1987 wakaba 1.15
1988     $self->{state} = DATA_STATE;
1989     $self->{s_kwd} = '';
1990     ## reconsume
1991     return ($self->{ct}); # end tag
1992     redo A;
1993     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994     ## XML5: No parse error above; not defined yet.
1995     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997     ## Reconsume.
1998     return ($self->{ct}); # ATTLIST
1999     redo A;
2000 wakaba 1.1 } else {
2001     die "$0: $self->{ct}->{type}: Unknown token type";
2002     }
2003     } else {
2004 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2005 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006    
2007     ## XML5: Not a parse error.
2008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009     } else {
2010    
2011     }
2012 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2013     $self->{read_until}->($self->{ca}->{value},
2014 wakaba 1.25 qq['&<\x09\x0C\x20],
2015 wakaba 1.1 length $self->{ca}->{value});
2016    
2017     ## Stay in the state
2018    
2019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020     $self->{line_prev} = $self->{line};
2021     $self->{column_prev} = $self->{column};
2022     $self->{column}++;
2023     $self->{nc}
2024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025     } else {
2026     $self->{set_nc}->($self);
2027     }
2028    
2029     redo A;
2030     }
2031     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2033    
2034 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2035 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036    
2037     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039     } else {
2040    
2041     ## XML5: "Tag attribute name before state".
2042     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043     }
2044 wakaba 1.1
2045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046     $self->{line_prev} = $self->{line};
2047     $self->{column_prev} = $self->{column};
2048     $self->{column}++;
2049     $self->{nc}
2050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051     } else {
2052     $self->{set_nc}->($self);
2053     }
2054    
2055     redo A;
2056     } elsif ($self->{nc} == 0x0026) { # &
2057    
2058 wakaba 1.11
2059     ## XML5: Not defined yet.
2060    
2061 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2062     ## "entity in attribute value state". In this implementation, the
2063     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2064     ## implementation of the "consume a character reference" algorithm.
2065     $self->{entity_add} = -1;
2066     $self->{prev_state} = $self->{state};
2067     $self->{state} = ENTITY_STATE;
2068    
2069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070     $self->{line_prev} = $self->{line};
2071     $self->{column_prev} = $self->{column};
2072     $self->{column}++;
2073     $self->{nc}
2074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075     } else {
2076     $self->{set_nc}->($self);
2077     }
2078    
2079     redo A;
2080     } elsif ($self->{nc} == 0x003E) { # >
2081     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082    
2083     $self->{last_stag_name} = $self->{ct}->{tag_name};
2084 wakaba 1.15
2085     $self->{state} = DATA_STATE;
2086     $self->{s_kwd} = '';
2087    
2088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089     $self->{line_prev} = $self->{line};
2090     $self->{column_prev} = $self->{column};
2091     $self->{column}++;
2092     $self->{nc}
2093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094     } else {
2095     $self->{set_nc}->($self);
2096     }
2097    
2098     return ($self->{ct}); # start tag
2099     redo A;
2100 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102     if ($self->{ct}->{attributes}) {
2103    
2104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2105     } else {
2106     ## NOTE: This state should never be reached.
2107    
2108     }
2109 wakaba 1.15
2110     $self->{state} = DATA_STATE;
2111     $self->{s_kwd} = '';
2112    
2113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114     $self->{line_prev} = $self->{line};
2115     $self->{column_prev} = $self->{column};
2116     $self->{column}++;
2117     $self->{nc}
2118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2119     } else {
2120     $self->{set_nc}->($self);
2121     }
2122    
2123     return ($self->{ct}); # end tag
2124     redo A;
2125     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128    
2129 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130     $self->{line_prev} = $self->{line};
2131     $self->{column_prev} = $self->{column};
2132     $self->{column}++;
2133     $self->{nc}
2134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135     } else {
2136     $self->{set_nc}->($self);
2137     }
2138    
2139 wakaba 1.15 return ($self->{ct}); # ATTLIST
2140     redo A;
2141     } else {
2142     die "$0: $self->{ct}->{type}: Unknown token type";
2143     }
2144 wakaba 1.1 } elsif ($self->{nc} == -1) {
2145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146    
2147 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2149 wakaba 1.15
2150     $self->{state} = DATA_STATE;
2151     $self->{s_kwd} = '';
2152     ## reconsume
2153     return ($self->{ct}); # start tag
2154     redo A;
2155 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158     if ($self->{ct}->{attributes}) {
2159    
2160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2161     } else {
2162     ## NOTE: This state should never be reached.
2163    
2164     }
2165 wakaba 1.15
2166     $self->{state} = DATA_STATE;
2167     $self->{s_kwd} = '';
2168     ## reconsume
2169     return ($self->{ct}); # end tag
2170     redo A;
2171     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175     ## Reconsume.
2176     return ($self->{ct}); # ATTLIST
2177     redo A;
2178 wakaba 1.1 } else {
2179     die "$0: $self->{ct}->{type}: Unknown token type";
2180     }
2181     } else {
2182     if ({
2183     0x0022 => 1, # "
2184     0x0027 => 1, # '
2185     0x003D => 1, # =
2186 wakaba 1.26 0x003C => 1, # <
2187 wakaba 1.1 }->{$self->{nc}}) {
2188    
2189 wakaba 1.11 ## XML5: Not a parse error.
2190 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2191     } else {
2192    
2193     }
2194     $self->{ca}->{value} .= chr ($self->{nc});
2195     $self->{read_until}->($self->{ca}->{value},
2196 wakaba 1.25 qq["'=& \x09\x0C>],
2197 wakaba 1.1 length $self->{ca}->{value});
2198    
2199     ## Stay in the state
2200    
2201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2202     $self->{line_prev} = $self->{line};
2203     $self->{column_prev} = $self->{column};
2204     $self->{column}++;
2205     $self->{nc}
2206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2207     } else {
2208     $self->{set_nc}->($self);
2209     }
2210    
2211     redo A;
2212     }
2213     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2214     if ($is_space->{$self->{nc}}) {
2215    
2216     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2217    
2218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2219     $self->{line_prev} = $self->{line};
2220     $self->{column_prev} = $self->{column};
2221     $self->{column}++;
2222     $self->{nc}
2223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2224     } else {
2225     $self->{set_nc}->($self);
2226     }
2227    
2228     redo A;
2229     } elsif ($self->{nc} == 0x003E) { # >
2230     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2231    
2232     $self->{last_stag_name} = $self->{ct}->{tag_name};
2233     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2234     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2235     if ($self->{ct}->{attributes}) {
2236    
2237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2238     } else {
2239     ## NOTE: This state should never be reached.
2240    
2241     }
2242     } else {
2243     die "$0: $self->{ct}->{type}: Unknown token type";
2244     }
2245     $self->{state} = DATA_STATE;
2246 wakaba 1.5 $self->{s_kwd} = '';
2247 wakaba 1.1
2248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249     $self->{line_prev} = $self->{line};
2250     $self->{column_prev} = $self->{column};
2251     $self->{column}++;
2252     $self->{nc}
2253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254     } else {
2255     $self->{set_nc}->($self);
2256     }
2257    
2258    
2259     return ($self->{ct}); # start tag or end tag
2260    
2261     redo A;
2262     } elsif ($self->{nc} == 0x002F) { # /
2263    
2264     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2265    
2266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2267     $self->{line_prev} = $self->{line};
2268     $self->{column_prev} = $self->{column};
2269     $self->{column}++;
2270     $self->{nc}
2271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2272     } else {
2273     $self->{set_nc}->($self);
2274     }
2275    
2276     redo A;
2277     } elsif ($self->{nc} == -1) {
2278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2279     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2280    
2281     $self->{last_stag_name} = $self->{ct}->{tag_name};
2282     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2283     if ($self->{ct}->{attributes}) {
2284    
2285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2286     } else {
2287     ## NOTE: This state should never be reached.
2288    
2289     }
2290     } else {
2291     die "$0: $self->{ct}->{type}: Unknown token type";
2292     }
2293     $self->{state} = DATA_STATE;
2294 wakaba 1.5 $self->{s_kwd} = '';
2295 wakaba 1.1 ## Reconsume.
2296     return ($self->{ct}); # start tag or end tag
2297     redo A;
2298     } else {
2299    
2300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2301     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2302     ## reconsume
2303     redo A;
2304     }
2305     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2306 wakaba 1.11 ## XML5: "Empty tag state".
2307    
2308 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2309     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2310    
2311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2312     ## TODO: Different type than slash in start tag
2313     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318    
2319     }
2320     ## TODO: Test |<title></title/>|
2321     } else {
2322    
2323     $self->{self_closing} = 1;
2324     }
2325    
2326     $self->{state} = DATA_STATE;
2327 wakaba 1.5 $self->{s_kwd} = '';
2328 wakaba 1.1
2329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2330     $self->{line_prev} = $self->{line};
2331     $self->{column_prev} = $self->{column};
2332     $self->{column}++;
2333     $self->{nc}
2334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2335     } else {
2336     $self->{set_nc}->($self);
2337     }
2338    
2339    
2340     return ($self->{ct}); # start tag or end tag
2341    
2342     redo A;
2343     } elsif ($self->{nc} == -1) {
2344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2345     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2346    
2347     $self->{last_stag_name} = $self->{ct}->{tag_name};
2348     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2349     if ($self->{ct}->{attributes}) {
2350    
2351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2352     } else {
2353     ## NOTE: This state should never be reached.
2354    
2355     }
2356     } else {
2357     die "$0: $self->{ct}->{type}: Unknown token type";
2358     }
2359 wakaba 1.11 ## XML5: "Tag attribute name before state".
2360 wakaba 1.1 $self->{state} = DATA_STATE;
2361 wakaba 1.5 $self->{s_kwd} = '';
2362 wakaba 1.1 ## Reconsume.
2363     return ($self->{ct}); # start tag or end tag
2364     redo A;
2365     } else {
2366    
2367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2368     ## TODO: This error type is wrong.
2369     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2370     ## Reconsume.
2371     redo A;
2372     }
2373     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2374 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2375    
2376 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2377     ## consumes characters on a one-by-one basis.
2378    
2379     if ($self->{nc} == 0x003E) { # >
2380 wakaba 1.13 if ($self->{in_subset}) {
2381    
2382     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2383     } else {
2384    
2385     $self->{state} = DATA_STATE;
2386     $self->{s_kwd} = '';
2387     }
2388 wakaba 1.1
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399    
2400     return ($self->{ct}); # comment
2401     redo A;
2402     } elsif ($self->{nc} == -1) {
2403 wakaba 1.13 if ($self->{in_subset}) {
2404    
2405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2406     } else {
2407    
2408     $self->{state} = DATA_STATE;
2409     $self->{s_kwd} = '';
2410     }
2411 wakaba 1.1 ## reconsume
2412    
2413     return ($self->{ct}); # comment
2414     redo A;
2415     } else {
2416    
2417     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2418     $self->{read_until}->($self->{ct}->{data},
2419     q[>],
2420     length $self->{ct}->{data});
2421    
2422     ## Stay in the state.
2423    
2424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2425     $self->{line_prev} = $self->{line};
2426     $self->{column_prev} = $self->{column};
2427     $self->{column}++;
2428     $self->{nc}
2429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2430     } else {
2431     $self->{set_nc}->($self);
2432     }
2433    
2434     redo A;
2435     }
2436     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2437 wakaba 1.14 ## XML5: "Markup declaration state".
2438 wakaba 1.1
2439     if ($self->{nc} == 0x002D) { # -
2440    
2441     $self->{state} = MD_HYPHEN_STATE;
2442    
2443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2444     $self->{line_prev} = $self->{line};
2445     $self->{column_prev} = $self->{column};
2446     $self->{column}++;
2447     $self->{nc}
2448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2449     } else {
2450     $self->{set_nc}->($self);
2451     }
2452    
2453     redo A;
2454     } elsif ($self->{nc} == 0x0044 or # D
2455     $self->{nc} == 0x0064) { # d
2456     ## ASCII case-insensitive.
2457    
2458     $self->{state} = MD_DOCTYPE_STATE;
2459 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2460 wakaba 1.1
2461     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2462     $self->{line_prev} = $self->{line};
2463     $self->{column_prev} = $self->{column};
2464     $self->{column}++;
2465     $self->{nc}
2466     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2467     } else {
2468     $self->{set_nc}->($self);
2469     }
2470    
2471     redo A;
2472 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2473     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2474     $self->{is_xml}) and
2475 wakaba 1.1 $self->{nc} == 0x005B) { # [
2476    
2477     $self->{state} = MD_CDATA_STATE;
2478 wakaba 1.12 $self->{kwd} = '[';
2479 wakaba 1.1
2480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481     $self->{line_prev} = $self->{line};
2482     $self->{column_prev} = $self->{column};
2483     $self->{column}++;
2484     $self->{nc}
2485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2486     } else {
2487     $self->{set_nc}->($self);
2488     }
2489    
2490     redo A;
2491     } else {
2492    
2493     }
2494    
2495     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2496     line => $self->{line_prev},
2497     column => $self->{column_prev} - 1);
2498     ## Reconsume.
2499     $self->{state} = BOGUS_COMMENT_STATE;
2500     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2501     line => $self->{line_prev},
2502     column => $self->{column_prev} - 1,
2503     };
2504     redo A;
2505     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2506     if ($self->{nc} == 0x002D) { # -
2507    
2508     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2509     line => $self->{line_prev},
2510     column => $self->{column_prev} - 2,
2511     };
2512 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2513 wakaba 1.1
2514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515     $self->{line_prev} = $self->{line};
2516     $self->{column_prev} = $self->{column};
2517     $self->{column}++;
2518     $self->{nc}
2519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520     } else {
2521     $self->{set_nc}->($self);
2522     }
2523    
2524     redo A;
2525     } else {
2526    
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2528     line => $self->{line_prev},
2529     column => $self->{column_prev} - 2);
2530     $self->{state} = BOGUS_COMMENT_STATE;
2531     ## Reconsume.
2532     $self->{ct} = {type => COMMENT_TOKEN,
2533     data => '-',
2534     line => $self->{line_prev},
2535     column => $self->{column_prev} - 2,
2536     };
2537     redo A;
2538     }
2539     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2540     ## ASCII case-insensitive.
2541     if ($self->{nc} == [
2542     undef,
2543     0x004F, # O
2544     0x0043, # C
2545     0x0054, # T
2546     0x0059, # Y
2547     0x0050, # P
2548 wakaba 1.12 ]->[length $self->{kwd}] or
2549 wakaba 1.1 $self->{nc} == [
2550     undef,
2551     0x006F, # o
2552     0x0063, # c
2553     0x0074, # t
2554     0x0079, # y
2555     0x0070, # p
2556 wakaba 1.12 ]->[length $self->{kwd}]) {
2557 wakaba 1.1
2558     ## Stay in the state.
2559 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2560 wakaba 1.1
2561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562     $self->{line_prev} = $self->{line};
2563     $self->{column_prev} = $self->{column};
2564     $self->{column}++;
2565     $self->{nc}
2566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2567     } else {
2568     $self->{set_nc}->($self);
2569     }
2570    
2571     redo A;
2572 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2573 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2574     $self->{nc} == 0x0065)) { # e
2575 wakaba 1.12 if ($self->{is_xml} and
2576     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2577 wakaba 1.10
2578     ## XML5: case-sensitive.
2579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2580     text => 'DOCTYPE',
2581     line => $self->{line_prev},
2582     column => $self->{column_prev} - 5);
2583     } else {
2584    
2585     }
2586 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2587     $self->{ct} = {type => DOCTYPE_TOKEN,
2588     quirks => 1,
2589     line => $self->{line_prev},
2590     column => $self->{column_prev} - 7,
2591     };
2592    
2593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2594     $self->{line_prev} = $self->{line};
2595     $self->{column_prev} = $self->{column};
2596     $self->{column}++;
2597     $self->{nc}
2598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2599     } else {
2600     $self->{set_nc}->($self);
2601     }
2602    
2603     redo A;
2604     } else {
2605    
2606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2607     line => $self->{line_prev},
2608 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2609 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2610     ## Reconsume.
2611     $self->{ct} = {type => COMMENT_TOKEN,
2612 wakaba 1.12 data => $self->{kwd},
2613 wakaba 1.1 line => $self->{line_prev},
2614 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2615 wakaba 1.1 };
2616     redo A;
2617     }
2618     } elsif ($self->{state} == MD_CDATA_STATE) {
2619     if ($self->{nc} == {
2620     '[' => 0x0043, # C
2621     '[C' => 0x0044, # D
2622     '[CD' => 0x0041, # A
2623     '[CDA' => 0x0054, # T
2624     '[CDAT' => 0x0041, # A
2625 wakaba 1.12 }->{$self->{kwd}}) {
2626 wakaba 1.1
2627     ## Stay in the state.
2628 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2629 wakaba 1.1
2630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2631     $self->{line_prev} = $self->{line};
2632     $self->{column_prev} = $self->{column};
2633     $self->{column}++;
2634     $self->{nc}
2635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2636     } else {
2637     $self->{set_nc}->($self);
2638     }
2639    
2640     redo A;
2641 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2642 wakaba 1.1 $self->{nc} == 0x005B) { # [
2643 wakaba 1.6 if ($self->{is_xml} and
2644     not $self->{tainted} and
2645     @{$self->{open_elements} or []} == 0) {
2646 wakaba 1.8
2647 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2648     line => $self->{line_prev},
2649     column => $self->{column_prev} - 7);
2650     $self->{tainted} = 1;
2651 wakaba 1.8 } else {
2652    
2653 wakaba 1.6 }
2654    
2655 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2656     data => '',
2657     line => $self->{line_prev},
2658     column => $self->{column_prev} - 7};
2659     $self->{state} = CDATA_SECTION_STATE;
2660    
2661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662     $self->{line_prev} = $self->{line};
2663     $self->{column_prev} = $self->{column};
2664     $self->{column}++;
2665     $self->{nc}
2666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667     } else {
2668     $self->{set_nc}->($self);
2669     }
2670    
2671     redo A;
2672     } else {
2673    
2674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2675     line => $self->{line_prev},
2676 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2677 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2678     ## Reconsume.
2679     $self->{ct} = {type => COMMENT_TOKEN,
2680 wakaba 1.12 data => $self->{kwd},
2681 wakaba 1.1 line => $self->{line_prev},
2682 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2683 wakaba 1.1 };
2684     redo A;
2685     }
2686     } elsif ($self->{state} == COMMENT_START_STATE) {
2687     if ($self->{nc} == 0x002D) { # -
2688    
2689     $self->{state} = COMMENT_START_DASH_STATE;
2690    
2691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2692     $self->{line_prev} = $self->{line};
2693     $self->{column_prev} = $self->{column};
2694     $self->{column}++;
2695     $self->{nc}
2696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2697     } else {
2698     $self->{set_nc}->($self);
2699     }
2700    
2701     redo A;
2702     } elsif ($self->{nc} == 0x003E) { # >
2703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2704 wakaba 1.13 if ($self->{in_subset}) {
2705    
2706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707     } else {
2708    
2709     $self->{state} = DATA_STATE;
2710     $self->{s_kwd} = '';
2711     }
2712 wakaba 1.1
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723    
2724     return ($self->{ct}); # comment
2725    
2726     redo A;
2727     } elsif ($self->{nc} == -1) {
2728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729 wakaba 1.13 if ($self->{in_subset}) {
2730    
2731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732     } else {
2733    
2734     $self->{state} = DATA_STATE;
2735     $self->{s_kwd} = '';
2736     }
2737 wakaba 1.1 ## reconsume
2738    
2739     return ($self->{ct}); # comment
2740    
2741     redo A;
2742     } else {
2743    
2744     $self->{ct}->{data} # comment
2745     .= chr ($self->{nc});
2746     $self->{state} = COMMENT_STATE;
2747    
2748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2749     $self->{line_prev} = $self->{line};
2750     $self->{column_prev} = $self->{column};
2751     $self->{column}++;
2752     $self->{nc}
2753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2754     } else {
2755     $self->{set_nc}->($self);
2756     }
2757    
2758     redo A;
2759     }
2760     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2761     if ($self->{nc} == 0x002D) { # -
2762    
2763     $self->{state} = COMMENT_END_STATE;
2764    
2765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2766     $self->{line_prev} = $self->{line};
2767     $self->{column_prev} = $self->{column};
2768     $self->{column}++;
2769     $self->{nc}
2770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2771     } else {
2772     $self->{set_nc}->($self);
2773     }
2774    
2775     redo A;
2776     } elsif ($self->{nc} == 0x003E) { # >
2777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2778 wakaba 1.13 if ($self->{in_subset}) {
2779    
2780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781     } else {
2782    
2783     $self->{state} = DATA_STATE;
2784     $self->{s_kwd} = '';
2785     }
2786 wakaba 1.1
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797    
2798     return ($self->{ct}); # comment
2799    
2800     redo A;
2801     } elsif ($self->{nc} == -1) {
2802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803 wakaba 1.13 if ($self->{in_subset}) {
2804    
2805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806     } else {
2807    
2808     $self->{state} = DATA_STATE;
2809     $self->{s_kwd} = '';
2810     }
2811 wakaba 1.1 ## reconsume
2812    
2813     return ($self->{ct}); # comment
2814    
2815     redo A;
2816     } else {
2817    
2818     $self->{ct}->{data} # comment
2819     .= '-' . chr ($self->{nc});
2820     $self->{state} = COMMENT_STATE;
2821    
2822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2823     $self->{line_prev} = $self->{line};
2824     $self->{column_prev} = $self->{column};
2825     $self->{column}++;
2826     $self->{nc}
2827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2828     } else {
2829     $self->{set_nc}->($self);
2830     }
2831    
2832     redo A;
2833     }
2834     } elsif ($self->{state} == COMMENT_STATE) {
2835 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2836    
2837 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2838    
2839     $self->{state} = COMMENT_END_DASH_STATE;
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     } elsif ($self->{nc} == -1) {
2853     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2854 wakaba 1.13 if ($self->{in_subset}) {
2855    
2856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2857     } else {
2858    
2859     $self->{state} = DATA_STATE;
2860     $self->{s_kwd} = '';
2861     }
2862 wakaba 1.1 ## reconsume
2863    
2864     return ($self->{ct}); # comment
2865    
2866     redo A;
2867     } else {
2868    
2869     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2870     $self->{read_until}->($self->{ct}->{data},
2871     q[-],
2872     length $self->{ct}->{data});
2873    
2874     ## Stay in the state
2875    
2876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877     $self->{line_prev} = $self->{line};
2878     $self->{column_prev} = $self->{column};
2879     $self->{column}++;
2880     $self->{nc}
2881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882     } else {
2883     $self->{set_nc}->($self);
2884     }
2885    
2886     redo A;
2887     }
2888     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2889 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2890 wakaba 1.10
2891 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2892    
2893     $self->{state} = COMMENT_END_STATE;
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } elsif ($self->{nc} == -1) {
2907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2908 wakaba 1.13 if ($self->{in_subset}) {
2909    
2910     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2911     } else {
2912    
2913     $self->{state} = DATA_STATE;
2914     $self->{s_kwd} = '';
2915     }
2916 wakaba 1.1 ## reconsume
2917    
2918     return ($self->{ct}); # comment
2919    
2920     redo A;
2921     } else {
2922    
2923     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2924     $self->{state} = COMMENT_STATE;
2925    
2926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2927     $self->{line_prev} = $self->{line};
2928     $self->{column_prev} = $self->{column};
2929     $self->{column}++;
2930     $self->{nc}
2931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2932     } else {
2933     $self->{set_nc}->($self);
2934     }
2935    
2936     redo A;
2937     }
2938     } elsif ($self->{state} == COMMENT_END_STATE) {
2939 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2940    
2941 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2942 wakaba 1.13 if ($self->{in_subset}) {
2943    
2944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945     } else {
2946    
2947     $self->{state} = DATA_STATE;
2948     $self->{s_kwd} = '';
2949     }
2950 wakaba 1.1
2951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952     $self->{line_prev} = $self->{line};
2953     $self->{column_prev} = $self->{column};
2954     $self->{column}++;
2955     $self->{nc}
2956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2957     } else {
2958     $self->{set_nc}->($self);
2959     }
2960    
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } elsif ($self->{nc} == 0x002D) { # -
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '-'; # comment
2972     ## Stay in the state
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     } elsif ($self->{nc} == -1) {
2986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2987 wakaba 1.13 if ($self->{in_subset}) {
2988    
2989     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2990     } else {
2991    
2992     $self->{state} = DATA_STATE;
2993     $self->{s_kwd} = '';
2994     }
2995 wakaba 1.1 ## reconsume
2996    
2997     return ($self->{ct}); # comment
2998    
2999     redo A;
3000     } else {
3001    
3002 wakaba 1.10 ## XML5: Not a parse error.
3003 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3004     line => $self->{line_prev},
3005     column => $self->{column_prev});
3006     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3007     $self->{state} = COMMENT_STATE;
3008    
3009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010     $self->{line_prev} = $self->{line};
3011     $self->{column_prev} = $self->{column};
3012     $self->{column}++;
3013     $self->{nc}
3014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3015     } else {
3016     $self->{set_nc}->($self);
3017     }
3018    
3019     redo A;
3020     }
3021     } elsif ($self->{state} == DOCTYPE_STATE) {
3022     if ($is_space->{$self->{nc}}) {
3023    
3024     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3025    
3026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027     $self->{line_prev} = $self->{line};
3028     $self->{column_prev} = $self->{column};
3029     $self->{column}++;
3030     $self->{nc}
3031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032     } else {
3033     $self->{set_nc}->($self);
3034     }
3035    
3036     redo A;
3037     } else {
3038    
3039 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3040 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3041     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3042     ## reconsume
3043     redo A;
3044     }
3045     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3046 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3047    
3048 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3049    
3050     ## Stay in the state
3051    
3052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3053     $self->{line_prev} = $self->{line};
3054     $self->{column_prev} = $self->{column};
3055     $self->{column}++;
3056     $self->{nc}
3057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3058     } else {
3059     $self->{set_nc}->($self);
3060     }
3061    
3062     redo A;
3063     } elsif ($self->{nc} == 0x003E) { # >
3064    
3065 wakaba 1.12 ## XML5: No parse error.
3066 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3067     $self->{state} = DATA_STATE;
3068 wakaba 1.5 $self->{s_kwd} = '';
3069 wakaba 1.1
3070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3071     $self->{line_prev} = $self->{line};
3072     $self->{column_prev} = $self->{column};
3073     $self->{column}++;
3074     $self->{nc}
3075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3076     } else {
3077     $self->{set_nc}->($self);
3078     }
3079    
3080    
3081     return ($self->{ct}); # DOCTYPE (quirks)
3082    
3083     redo A;
3084     } elsif ($self->{nc} == -1) {
3085    
3086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3087     $self->{state} = DATA_STATE;
3088 wakaba 1.5 $self->{s_kwd} = '';
3089 wakaba 1.1 ## reconsume
3090    
3091     return ($self->{ct}); # DOCTYPE (quirks)
3092    
3093     redo A;
3094 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3095    
3096     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3097     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3098 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3099     $self->{in_subset} = 1;
3100 wakaba 1.12
3101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102     $self->{line_prev} = $self->{line};
3103     $self->{column_prev} = $self->{column};
3104     $self->{column}++;
3105     $self->{nc}
3106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107     } else {
3108     $self->{set_nc}->($self);
3109     }
3110    
3111 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3112 wakaba 1.12 redo A;
3113 wakaba 1.1 } else {
3114    
3115     $self->{ct}->{name} = chr $self->{nc};
3116     delete $self->{ct}->{quirks};
3117     $self->{state} = DOCTYPE_NAME_STATE;
3118    
3119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3120     $self->{line_prev} = $self->{line};
3121     $self->{column_prev} = $self->{column};
3122     $self->{column}++;
3123     $self->{nc}
3124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3125     } else {
3126     $self->{set_nc}->($self);
3127     }
3128    
3129     redo A;
3130     }
3131     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3132 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3133    
3134     ## ISSUE: Redundant "First," in the spec.
3135    
3136 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3137    
3138     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3139    
3140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3141     $self->{line_prev} = $self->{line};
3142     $self->{column_prev} = $self->{column};
3143     $self->{column}++;
3144     $self->{nc}
3145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3146     } else {
3147     $self->{set_nc}->($self);
3148     }
3149    
3150     redo A;
3151     } elsif ($self->{nc} == 0x003E) { # >
3152    
3153     $self->{state} = DATA_STATE;
3154 wakaba 1.5 $self->{s_kwd} = '';
3155 wakaba 1.1
3156     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3157     $self->{line_prev} = $self->{line};
3158     $self->{column_prev} = $self->{column};
3159     $self->{column}++;
3160     $self->{nc}
3161     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3162     } else {
3163     $self->{set_nc}->($self);
3164     }
3165    
3166    
3167     return ($self->{ct}); # DOCTYPE
3168    
3169     redo A;
3170     } elsif ($self->{nc} == -1) {
3171    
3172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3173     $self->{state} = DATA_STATE;
3174 wakaba 1.5 $self->{s_kwd} = '';
3175 wakaba 1.1 ## reconsume
3176    
3177     $self->{ct}->{quirks} = 1;
3178     return ($self->{ct}); # DOCTYPE
3179    
3180     redo A;
3181 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3182    
3183     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3184 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3185     $self->{in_subset} = 1;
3186 wakaba 1.12
3187     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3188     $self->{line_prev} = $self->{line};
3189     $self->{column_prev} = $self->{column};
3190     $self->{column}++;
3191     $self->{nc}
3192     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3193     } else {
3194     $self->{set_nc}->($self);
3195     }
3196    
3197 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3198 wakaba 1.12 redo A;
3199 wakaba 1.1 } else {
3200    
3201     $self->{ct}->{name}
3202     .= chr ($self->{nc}); # DOCTYPE
3203     ## Stay in the state
3204    
3205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3206     $self->{line_prev} = $self->{line};
3207     $self->{column_prev} = $self->{column};
3208     $self->{column}++;
3209     $self->{nc}
3210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3211     } else {
3212     $self->{set_nc}->($self);
3213     }
3214    
3215     redo A;
3216     }
3217     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3218 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3219     ## state", but implemented differently.
3220    
3221 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3222    
3223     ## Stay in the state
3224    
3225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3226     $self->{line_prev} = $self->{line};
3227     $self->{column_prev} = $self->{column};
3228     $self->{column}++;
3229     $self->{nc}
3230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3231     } else {
3232     $self->{set_nc}->($self);
3233     }
3234    
3235     redo A;
3236     } elsif ($self->{nc} == 0x003E) { # >
3237 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3238    
3239     $self->{state} = DATA_STATE;
3240     $self->{s_kwd} = '';
3241     } else {
3242    
3243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3244     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3245     }
3246 wakaba 1.1
3247    
3248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3249     $self->{line_prev} = $self->{line};
3250     $self->{column_prev} = $self->{column};
3251     $self->{column}++;
3252     $self->{nc}
3253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3254     } else {
3255     $self->{set_nc}->($self);
3256     }
3257    
3258 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3259 wakaba 1.1 redo A;
3260     } elsif ($self->{nc} == -1) {
3261 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3262    
3263     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3264     $self->{state} = DATA_STATE;
3265     $self->{s_kwd} = '';
3266     $self->{ct}->{quirks} = 1;
3267     } else {
3268    
3269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3270     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3271     }
3272 wakaba 1.1
3273 wakaba 1.16 ## Reconsume.
3274     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3275 wakaba 1.1 redo A;
3276     } elsif ($self->{nc} == 0x0050 or # P
3277     $self->{nc} == 0x0070) { # p
3278 wakaba 1.12
3279 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3280 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3281 wakaba 1.1
3282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283     $self->{line_prev} = $self->{line};
3284     $self->{column_prev} = $self->{column};
3285     $self->{column}++;
3286     $self->{nc}
3287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288     } else {
3289     $self->{set_nc}->($self);
3290     }
3291    
3292     redo A;
3293     } elsif ($self->{nc} == 0x0053 or # S
3294     $self->{nc} == 0x0073) { # s
3295 wakaba 1.12
3296 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3297 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3298    
3299     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3300     $self->{line_prev} = $self->{line};
3301     $self->{column_prev} = $self->{column};
3302     $self->{column}++;
3303     $self->{nc}
3304     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3305     } else {
3306     $self->{set_nc}->($self);
3307     }
3308    
3309     redo A;
3310 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3311     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3312     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3313    
3314     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3315     $self->{ct}->{value} = ''; # ENTITY
3316    
3317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3318     $self->{line_prev} = $self->{line};
3319     $self->{column_prev} = $self->{column};
3320     $self->{column}++;
3321     $self->{nc}
3322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3323     } else {
3324     $self->{set_nc}->($self);
3325     }
3326    
3327     redo A;
3328     } elsif ($self->{nc} == 0x0027 and # '
3329     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3330     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3331    
3332     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3333     $self->{ct}->{value} = ''; # ENTITY
3334    
3335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3336     $self->{line_prev} = $self->{line};
3337     $self->{column_prev} = $self->{column};
3338     $self->{column}++;
3339     $self->{nc}
3340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3341     } else {
3342     $self->{set_nc}->($self);
3343     }
3344    
3345     redo A;
3346 wakaba 1.16 } elsif ($self->{is_xml} and
3347     $self->{ct}->{type} == DOCTYPE_TOKEN and
3348     $self->{nc} == 0x005B) { # [
3349 wakaba 1.12
3350     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3351     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3352 wakaba 1.13 $self->{in_subset} = 1;
3353 wakaba 1.1
3354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3355     $self->{line_prev} = $self->{line};
3356     $self->{column_prev} = $self->{column};
3357     $self->{column}++;
3358     $self->{nc}
3359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3360     } else {
3361     $self->{set_nc}->($self);
3362     }
3363    
3364 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3365 wakaba 1.1 redo A;
3366     } else {
3367 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3368    
3369     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3370    
3371     $self->{ct}->{quirks} = 1;
3372     $self->{state} = BOGUS_DOCTYPE_STATE;
3373     } else {
3374    
3375     $self->{state} = BOGUS_MD_STATE;
3376     }
3377 wakaba 1.1
3378    
3379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3380     $self->{line_prev} = $self->{line};
3381     $self->{column_prev} = $self->{column};
3382     $self->{column}++;
3383     $self->{nc}
3384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3385     } else {
3386     $self->{set_nc}->($self);
3387     }
3388    
3389     redo A;
3390     }
3391     } elsif ($self->{state} == PUBLIC_STATE) {
3392     ## ASCII case-insensitive
3393     if ($self->{nc} == [
3394     undef,
3395     0x0055, # U
3396     0x0042, # B
3397     0x004C, # L
3398     0x0049, # I
3399 wakaba 1.12 ]->[length $self->{kwd}] or
3400 wakaba 1.1 $self->{nc} == [
3401     undef,
3402     0x0075, # u
3403     0x0062, # b
3404     0x006C, # l
3405     0x0069, # i
3406 wakaba 1.12 ]->[length $self->{kwd}]) {
3407 wakaba 1.1
3408     ## Stay in the state.
3409 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3410 wakaba 1.1
3411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3412     $self->{line_prev} = $self->{line};
3413     $self->{column_prev} = $self->{column};
3414     $self->{column}++;
3415     $self->{nc}
3416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3417     } else {
3418     $self->{set_nc}->($self);
3419     }
3420    
3421     redo A;
3422 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3423 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3424     $self->{nc} == 0x0063)) { # c
3425 wakaba 1.12 if ($self->{is_xml} and
3426     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3427    
3428     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3429     text => 'PUBLIC',
3430     line => $self->{line_prev},
3431     column => $self->{column_prev} - 4);
3432     } else {
3433    
3434     }
3435 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3436    
3437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3438     $self->{line_prev} = $self->{line};
3439     $self->{column_prev} = $self->{column};
3440     $self->{column}++;
3441     $self->{nc}
3442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3443     } else {
3444     $self->{set_nc}->($self);
3445     }
3446    
3447     redo A;
3448     } else {
3449 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3450 wakaba 1.1 line => $self->{line_prev},
3451 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3452 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3453    
3454     $self->{ct}->{quirks} = 1;
3455     $self->{state} = BOGUS_DOCTYPE_STATE;
3456     } else {
3457    
3458     $self->{state} = BOGUS_MD_STATE;
3459     }
3460 wakaba 1.1 ## Reconsume.
3461     redo A;
3462     }
3463     } elsif ($self->{state} == SYSTEM_STATE) {
3464     ## ASCII case-insensitive
3465     if ($self->{nc} == [
3466     undef,
3467     0x0059, # Y
3468     0x0053, # S
3469     0x0054, # T
3470     0x0045, # E
3471 wakaba 1.12 ]->[length $self->{kwd}] or
3472 wakaba 1.1 $self->{nc} == [
3473     undef,
3474     0x0079, # y
3475     0x0073, # s
3476     0x0074, # t
3477     0x0065, # e
3478 wakaba 1.12 ]->[length $self->{kwd}]) {
3479 wakaba 1.1
3480     ## Stay in the state.
3481 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3482 wakaba 1.1
3483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3484     $self->{line_prev} = $self->{line};
3485     $self->{column_prev} = $self->{column};
3486     $self->{column}++;
3487     $self->{nc}
3488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3489     } else {
3490     $self->{set_nc}->($self);
3491     }
3492    
3493     redo A;
3494 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3495 wakaba 1.1 ($self->{nc} == 0x004D or # M
3496     $self->{nc} == 0x006D)) { # m
3497 wakaba 1.12 if ($self->{is_xml} and
3498     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3499    
3500     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3501     text => 'SYSTEM',
3502     line => $self->{line_prev},
3503     column => $self->{column_prev} - 4);
3504     } else {
3505    
3506     }
3507 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3508    
3509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3510     $self->{line_prev} = $self->{line};
3511     $self->{column_prev} = $self->{column};
3512     $self->{column}++;
3513     $self->{nc}
3514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3515     } else {
3516     $self->{set_nc}->($self);
3517     }
3518    
3519     redo A;
3520     } else {
3521 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3522 wakaba 1.1 line => $self->{line_prev},
3523 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3524 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3525    
3526     $self->{ct}->{quirks} = 1;
3527     $self->{state} = BOGUS_DOCTYPE_STATE;
3528     } else {
3529    
3530     $self->{state} = BOGUS_MD_STATE;
3531     }
3532 wakaba 1.1 ## Reconsume.
3533     redo A;
3534     }
3535     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3536     if ($is_space->{$self->{nc}}) {
3537    
3538     ## Stay in the state
3539    
3540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3541     $self->{line_prev} = $self->{line};
3542     $self->{column_prev} = $self->{column};
3543     $self->{column}++;
3544     $self->{nc}
3545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3546     } else {
3547     $self->{set_nc}->($self);
3548     }
3549    
3550     redo A;
3551     } elsif ($self->{nc} eq 0x0022) { # "
3552    
3553     $self->{ct}->{pubid} = ''; # DOCTYPE
3554     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3555    
3556     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3557     $self->{line_prev} = $self->{line};
3558     $self->{column_prev} = $self->{column};
3559     $self->{column}++;
3560     $self->{nc}
3561     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3562     } else {
3563     $self->{set_nc}->($self);
3564     }
3565    
3566     redo A;
3567     } elsif ($self->{nc} eq 0x0027) { # '
3568    
3569     $self->{ct}->{pubid} = ''; # DOCTYPE
3570     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3571    
3572     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3573     $self->{line_prev} = $self->{line};
3574     $self->{column_prev} = $self->{column};
3575     $self->{column}++;
3576     $self->{nc}
3577     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3578     } else {
3579     $self->{set_nc}->($self);
3580     }
3581    
3582     redo A;
3583     } elsif ($self->{nc} eq 0x003E) { # >
3584 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3585    
3586     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3587    
3588     $self->{state} = DATA_STATE;
3589     $self->{s_kwd} = '';
3590     $self->{ct}->{quirks} = 1;
3591     } else {
3592    
3593     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3594     }
3595 wakaba 1.1
3596    
3597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3598     $self->{line_prev} = $self->{line};
3599     $self->{column_prev} = $self->{column};
3600     $self->{column}++;
3601     $self->{nc}
3602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3603     } else {
3604     $self->{set_nc}->($self);
3605     }
3606    
3607 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3608 wakaba 1.1 redo A;
3609     } elsif ($self->{nc} == -1) {
3610 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611    
3612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3613     $self->{state} = DATA_STATE;
3614     $self->{s_kwd} = '';
3615     $self->{ct}->{quirks} = 1;
3616     } else {
3617    
3618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3619     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3620     }
3621 wakaba 1.1
3622     ## reconsume
3623     return ($self->{ct}); # DOCTYPE
3624     redo A;
3625 wakaba 1.16 } elsif ($self->{is_xml} and
3626     $self->{ct}->{type} == DOCTYPE_TOKEN and
3627     $self->{nc} == 0x005B) { # [
3628 wakaba 1.12
3629     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3630     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3631     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3632 wakaba 1.13 $self->{in_subset} = 1;
3633 wakaba 1.12
3634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3635     $self->{line_prev} = $self->{line};
3636     $self->{column_prev} = $self->{column};
3637     $self->{column}++;
3638     $self->{nc}
3639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3640     } else {
3641     $self->{set_nc}->($self);
3642     }
3643    
3644 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3645 wakaba 1.12 redo A;
3646 wakaba 1.1 } else {
3647     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3648    
3649 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3650    
3651     $self->{ct}->{quirks} = 1;
3652     $self->{state} = BOGUS_DOCTYPE_STATE;
3653     } else {
3654    
3655     $self->{state} = BOGUS_MD_STATE;
3656     }
3657    
3658 wakaba 1.1
3659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3660     $self->{line_prev} = $self->{line};
3661     $self->{column_prev} = $self->{column};
3662     $self->{column}++;
3663     $self->{nc}
3664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3665     } else {
3666     $self->{set_nc}->($self);
3667     }
3668    
3669     redo A;
3670     }
3671     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3672     if ($self->{nc} == 0x0022) { # "
3673    
3674     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3675    
3676     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3677     $self->{line_prev} = $self->{line};
3678     $self->{column_prev} = $self->{column};
3679     $self->{column}++;
3680     $self->{nc}
3681     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3682     } else {
3683     $self->{set_nc}->($self);
3684     }
3685    
3686     redo A;
3687     } elsif ($self->{nc} == 0x003E) { # >
3688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3689    
3690 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3691    
3692     $self->{state} = DATA_STATE;
3693     $self->{s_kwd} = '';
3694     $self->{ct}->{quirks} = 1;
3695     } else {
3696    
3697     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3698     }
3699    
3700 wakaba 1.1
3701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3702     $self->{line_prev} = $self->{line};
3703     $self->{column_prev} = $self->{column};
3704     $self->{column}++;
3705     $self->{nc}
3706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3707     } else {
3708     $self->{set_nc}->($self);
3709     }
3710    
3711 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3712 wakaba 1.1 redo A;
3713     } elsif ($self->{nc} == -1) {
3714     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3715    
3716 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3717    
3718     $self->{state} = DATA_STATE;
3719     $self->{s_kwd} = '';
3720     $self->{ct}->{quirks} = 1;
3721     } else {
3722    
3723     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3724     }
3725    
3726     ## Reconsume.
3727 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3728     redo A;
3729     } else {
3730    
3731 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3732 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3733     length $self->{ct}->{pubid});
3734    
3735     ## Stay in the state
3736    
3737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3738     $self->{line_prev} = $self->{line};
3739     $self->{column_prev} = $self->{column};
3740     $self->{column}++;
3741     $self->{nc}
3742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3743     } else {
3744     $self->{set_nc}->($self);
3745     }
3746    
3747     redo A;
3748     }
3749     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3750     if ($self->{nc} == 0x0027) { # '
3751    
3752     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3753    
3754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3755     $self->{line_prev} = $self->{line};
3756     $self->{column_prev} = $self->{column};
3757     $self->{column}++;
3758     $self->{nc}
3759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3760     } else {
3761     $self->{set_nc}->($self);
3762     }
3763    
3764     redo A;
3765     } elsif ($self->{nc} == 0x003E) { # >
3766     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3767    
3768 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3769    
3770     $self->{state} = DATA_STATE;
3771     $self->{s_kwd} = '';
3772     $self->{ct}->{quirks} = 1;
3773     } else {
3774    
3775     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3776     }
3777    
3778 wakaba 1.1
3779     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3780     $self->{line_prev} = $self->{line};
3781     $self->{column_prev} = $self->{column};
3782     $self->{column}++;
3783     $self->{nc}
3784     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3785     } else {
3786     $self->{set_nc}->($self);
3787     }
3788    
3789 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3790 wakaba 1.1 redo A;
3791     } elsif ($self->{nc} == -1) {
3792     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3793    
3794 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3795    
3796     $self->{state} = DATA_STATE;
3797     $self->{s_kwd} = '';
3798     $self->{ct}->{quirks} = 1;
3799     } else {
3800    
3801     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3802     }
3803    
3804 wakaba 1.1 ## reconsume
3805 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3806 wakaba 1.1 redo A;
3807     } else {
3808    
3809 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3810 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3811     length $self->{ct}->{pubid});
3812    
3813     ## Stay in the state
3814    
3815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3816     $self->{line_prev} = $self->{line};
3817     $self->{column_prev} = $self->{column};
3818     $self->{column}++;
3819     $self->{nc}
3820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3821     } else {
3822     $self->{set_nc}->($self);
3823     }
3824    
3825     redo A;
3826     }
3827     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3828     if ($is_space->{$self->{nc}}) {
3829    
3830     ## Stay in the state
3831    
3832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3833     $self->{line_prev} = $self->{line};
3834     $self->{column_prev} = $self->{column};
3835     $self->{column}++;
3836     $self->{nc}
3837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3838     } else {
3839     $self->{set_nc}->($self);
3840     }
3841    
3842     redo A;
3843     } elsif ($self->{nc} == 0x0022) { # "
3844    
3845 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3846 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3847    
3848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849     $self->{line_prev} = $self->{line};
3850     $self->{column_prev} = $self->{column};
3851     $self->{column}++;
3852     $self->{nc}
3853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854     } else {
3855     $self->{set_nc}->($self);
3856     }
3857    
3858     redo A;
3859     } elsif ($self->{nc} == 0x0027) { # '
3860    
3861 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3862 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3863    
3864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3865     $self->{line_prev} = $self->{line};
3866     $self->{column_prev} = $self->{column};
3867     $self->{column}++;
3868     $self->{nc}
3869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3870     } else {
3871     $self->{set_nc}->($self);
3872     }
3873    
3874     redo A;
3875     } elsif ($self->{nc} == 0x003E) { # >
3876 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3877     if ($self->{is_xml}) {
3878    
3879     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3880     } else {
3881    
3882     }
3883     $self->{state} = DATA_STATE;
3884     $self->{s_kwd} = '';
3885 wakaba 1.12 } else {
3886 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3887    
3888     } else {
3889    
3890     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3891     }
3892     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3893 wakaba 1.12 }
3894 wakaba 1.16
3895 wakaba 1.1
3896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3897     $self->{line_prev} = $self->{line};
3898     $self->{column_prev} = $self->{column};
3899     $self->{column}++;
3900     $self->{nc}
3901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3902     } else {
3903     $self->{set_nc}->($self);
3904     }
3905    
3906 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3907 wakaba 1.1 redo A;
3908     } elsif ($self->{nc} == -1) {
3909 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3910    
3911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3912    
3913     $self->{state} = DATA_STATE;
3914     $self->{s_kwd} = '';
3915     $self->{ct}->{quirks} = 1;
3916     } else {
3917     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3918     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3919     }
3920 wakaba 1.1
3921     ## reconsume
3922 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3923 wakaba 1.1 redo A;
3924 wakaba 1.16 } elsif ($self->{is_xml} and
3925     $self->{ct}->{type} == DOCTYPE_TOKEN and
3926     $self->{nc} == 0x005B) { # [
3927 wakaba 1.12
3928     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3929     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3930     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3931 wakaba 1.13 $self->{in_subset} = 1;
3932 wakaba 1.12
3933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3934     $self->{line_prev} = $self->{line};
3935     $self->{column_prev} = $self->{column};
3936     $self->{column}++;
3937     $self->{nc}
3938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3939     } else {
3940     $self->{set_nc}->($self);
3941     }
3942    
3943 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3944 wakaba 1.12 redo A;
3945 wakaba 1.1 } else {
3946     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3947    
3948 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3949    
3950     $self->{ct}->{quirks} = 1;
3951     $self->{state} = BOGUS_DOCTYPE_STATE;
3952     } else {
3953    
3954     $self->{state} = BOGUS_MD_STATE;
3955     }
3956    
3957 wakaba 1.1
3958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3959     $self->{line_prev} = $self->{line};
3960     $self->{column_prev} = $self->{column};
3961     $self->{column}++;
3962     $self->{nc}
3963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3964     } else {
3965     $self->{set_nc}->($self);
3966     }
3967    
3968     redo A;
3969     }
3970     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3971     if ($is_space->{$self->{nc}}) {
3972    
3973     ## Stay in the state
3974    
3975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3976     $self->{line_prev} = $self->{line};
3977     $self->{column_prev} = $self->{column};
3978     $self->{column}++;
3979     $self->{nc}
3980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3981     } else {
3982     $self->{set_nc}->($self);
3983     }
3984    
3985     redo A;
3986     } elsif ($self->{nc} == 0x0022) { # "
3987    
3988     $self->{ct}->{sysid} = ''; # DOCTYPE
3989     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3990    
3991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3992     $self->{line_prev} = $self->{line};
3993     $self->{column_prev} = $self->{column};
3994     $self->{column}++;
3995     $self->{nc}
3996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3997     } else {
3998     $self->{set_nc}->($self);
3999     }
4000    
4001     redo A;
4002     } elsif ($self->{nc} == 0x0027) { # '
4003    
4004     $self->{ct}->{sysid} = ''; # DOCTYPE
4005     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4006    
4007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4008     $self->{line_prev} = $self->{line};
4009     $self->{column_prev} = $self->{column};
4010     $self->{column}++;
4011     $self->{nc}
4012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4013     } else {
4014     $self->{set_nc}->($self);
4015     }
4016    
4017     redo A;
4018     } elsif ($self->{nc} == 0x003E) { # >
4019     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4020    
4021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4022     $self->{line_prev} = $self->{line};
4023     $self->{column_prev} = $self->{column};
4024     $self->{column}++;
4025     $self->{nc}
4026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4027     } else {
4028     $self->{set_nc}->($self);
4029     }
4030    
4031    
4032 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4033    
4034     $self->{state} = DATA_STATE;
4035     $self->{s_kwd} = '';
4036     $self->{ct}->{quirks} = 1;
4037     } else {
4038    
4039     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4040     }
4041 wakaba 1.1
4042 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4043 wakaba 1.1 redo A;
4044     } elsif ($self->{nc} == -1) {
4045 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4046    
4047     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4048     $self->{state} = DATA_STATE;
4049     $self->{s_kwd} = '';
4050     $self->{ct}->{quirks} = 1;
4051     } else {
4052    
4053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4054     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4055     }
4056 wakaba 1.1
4057     ## reconsume
4058 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4059 wakaba 1.1 redo A;
4060 wakaba 1.16 } elsif ($self->{is_xml} and
4061     $self->{ct}->{type} == DOCTYPE_TOKEN and
4062     $self->{nc} == 0x005B) { # [
4063 wakaba 1.12
4064     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4065    
4066     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4067     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4068 wakaba 1.13 $self->{in_subset} = 1;
4069 wakaba 1.12
4070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4071     $self->{line_prev} = $self->{line};
4072     $self->{column_prev} = $self->{column};
4073     $self->{column}++;
4074     $self->{nc}
4075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4076     } else {
4077     $self->{set_nc}->($self);
4078     }
4079    
4080 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4081 wakaba 1.12 redo A;
4082 wakaba 1.1 } else {
4083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4084    
4085 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4086    
4087     $self->{ct}->{quirks} = 1;
4088     $self->{state} = BOGUS_DOCTYPE_STATE;
4089     } else {
4090    
4091     $self->{state} = BOGUS_MD_STATE;
4092     }
4093    
4094 wakaba 1.1
4095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4096     $self->{line_prev} = $self->{line};
4097     $self->{column_prev} = $self->{column};
4098     $self->{column}++;
4099     $self->{nc}
4100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4101     } else {
4102     $self->{set_nc}->($self);
4103     }
4104    
4105     redo A;
4106     }
4107     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4108     if ($self->{nc} == 0x0022) { # "
4109    
4110     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4111    
4112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4113     $self->{line_prev} = $self->{line};
4114     $self->{column_prev} = $self->{column};
4115     $self->{column}++;
4116     $self->{nc}
4117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4118     } else {
4119     $self->{set_nc}->($self);
4120     }
4121    
4122     redo A;
4123 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4124 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4125    
4126 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4127    
4128     $self->{state} = DATA_STATE;
4129     $self->{s_kwd} = '';
4130     $self->{ct}->{quirks} = 1;
4131     } else {
4132    
4133     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4134     }
4135    
4136 wakaba 1.1
4137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4138     $self->{line_prev} = $self->{line};
4139     $self->{column_prev} = $self->{column};
4140     $self->{column}++;
4141     $self->{nc}
4142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4143     } else {
4144     $self->{set_nc}->($self);
4145     }
4146    
4147 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4148 wakaba 1.1 redo A;
4149     } elsif ($self->{nc} == -1) {
4150     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4151    
4152 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4153    
4154     $self->{state} = DATA_STATE;
4155     $self->{s_kwd} = '';
4156     $self->{ct}->{quirks} = 1;
4157     } else {
4158    
4159     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4160     }
4161    
4162 wakaba 1.1 ## reconsume
4163 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4164 wakaba 1.1 redo A;
4165     } else {
4166    
4167 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4168 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4169     length $self->{ct}->{sysid});
4170    
4171     ## Stay in the state
4172    
4173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174     $self->{line_prev} = $self->{line};
4175     $self->{column_prev} = $self->{column};
4176     $self->{column}++;
4177     $self->{nc}
4178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179     } else {
4180     $self->{set_nc}->($self);
4181     }
4182    
4183     redo A;
4184     }
4185     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4186     if ($self->{nc} == 0x0027) { # '
4187    
4188     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4189    
4190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4191     $self->{line_prev} = $self->{line};
4192     $self->{column_prev} = $self->{column};
4193     $self->{column}++;
4194     $self->{nc}
4195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4196     } else {
4197     $self->{set_nc}->($self);
4198     }
4199    
4200     redo A;
4201 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4202 wakaba 1.1
4203     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4204    
4205     $self->{state} = DATA_STATE;
4206 wakaba 1.5 $self->{s_kwd} = '';
4207 wakaba 1.1
4208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4209     $self->{line_prev} = $self->{line};
4210     $self->{column_prev} = $self->{column};
4211     $self->{column}++;
4212     $self->{nc}
4213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4214     } else {
4215     $self->{set_nc}->($self);
4216     }
4217    
4218    
4219     $self->{ct}->{quirks} = 1;
4220     return ($self->{ct}); # DOCTYPE
4221    
4222     redo A;
4223     } elsif ($self->{nc} == -1) {
4224     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4225    
4226 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4227    
4228     $self->{state} = DATA_STATE;
4229     $self->{s_kwd} = '';
4230     $self->{ct}->{quirks} = 1;
4231     } else {
4232    
4233     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234     }
4235    
4236 wakaba 1.1 ## reconsume
4237 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4238 wakaba 1.1 redo A;
4239     } else {
4240    
4241 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4242 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4243     length $self->{ct}->{sysid});
4244    
4245     ## Stay in the state
4246    
4247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4248     $self->{line_prev} = $self->{line};
4249     $self->{column_prev} = $self->{column};
4250     $self->{column}++;
4251     $self->{nc}
4252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4253     } else {
4254     $self->{set_nc}->($self);
4255     }
4256    
4257     redo A;
4258     }
4259     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4260     if ($is_space->{$self->{nc}}) {
4261 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4262    
4263     $self->{state} = BEFORE_NDATA_STATE;
4264     } else {
4265    
4266     ## Stay in the state
4267     }
4268 wakaba 1.1
4269     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4270     $self->{line_prev} = $self->{line};
4271     $self->{column_prev} = $self->{column};
4272     $self->{column}++;
4273     $self->{nc}
4274     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4275     } else {
4276     $self->{set_nc}->($self);
4277     }
4278    
4279     redo A;
4280     } elsif ($self->{nc} == 0x003E) { # >
4281 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282    
4283     $self->{state} = DATA_STATE;
4284     $self->{s_kwd} = '';
4285     } else {
4286    
4287     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4288     }
4289    
4290 wakaba 1.1
4291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4292     $self->{line_prev} = $self->{line};
4293     $self->{column_prev} = $self->{column};
4294     $self->{column}++;
4295     $self->{nc}
4296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4297     } else {
4298     $self->{set_nc}->($self);
4299     }
4300    
4301 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 wakaba 1.1 redo A;
4303 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4304     ($self->{nc} == 0x004E or # N
4305     $self->{nc} == 0x006E)) { # n
4306    
4307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4308     $self->{state} = NDATA_STATE;
4309     $self->{kwd} = chr $self->{nc};
4310    
4311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312     $self->{line_prev} = $self->{line};
4313     $self->{column_prev} = $self->{column};
4314     $self->{column}++;
4315     $self->{nc}
4316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317     } else {
4318     $self->{set_nc}->($self);
4319     }
4320    
4321     redo A;
4322 wakaba 1.1 } elsif ($self->{nc} == -1) {
4323 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4324    
4325     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4326     $self->{state} = DATA_STATE;
4327     $self->{s_kwd} = '';
4328     $self->{ct}->{quirks} = 1;
4329     } else {
4330    
4331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4332     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4333     }
4334    
4335 wakaba 1.1 ## reconsume
4336 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4337 wakaba 1.1 redo A;
4338 wakaba 1.16 } elsif ($self->{is_xml} and
4339     $self->{ct}->{type} == DOCTYPE_TOKEN and
4340     $self->{nc} == 0x005B) { # [
4341 wakaba 1.12
4342     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4343     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4344 wakaba 1.13 $self->{in_subset} = 1;
4345 wakaba 1.12
4346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4347     $self->{line_prev} = $self->{line};
4348     $self->{column_prev} = $self->{column};
4349     $self->{column}++;
4350     $self->{nc}
4351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4352     } else {
4353     $self->{set_nc}->($self);
4354     }
4355    
4356 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4357 wakaba 1.12 redo A;
4358 wakaba 1.1 } else {
4359     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4360    
4361 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4362    
4363     #$self->{ct}->{quirks} = 1;
4364     $self->{state} = BOGUS_DOCTYPE_STATE;
4365     } else {
4366    
4367     $self->{state} = BOGUS_MD_STATE;
4368     }
4369    
4370 wakaba 1.1
4371     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4372     $self->{line_prev} = $self->{line};
4373     $self->{column_prev} = $self->{column};
4374     $self->{column}++;
4375     $self->{nc}
4376     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4377     } else {
4378     $self->{set_nc}->($self);
4379     }
4380    
4381     redo A;
4382     }
4383 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4384     if ($is_space->{$self->{nc}}) {
4385    
4386     ## Stay in the state.
4387    
4388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4389     $self->{line_prev} = $self->{line};
4390     $self->{column_prev} = $self->{column};
4391     $self->{column}++;
4392     $self->{nc}
4393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4394     } else {
4395     $self->{set_nc}->($self);
4396     }
4397    
4398     redo A;
4399     } elsif ($self->{nc} == 0x003E) { # >
4400    
4401     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4402    
4403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4404     $self->{line_prev} = $self->{line};
4405     $self->{column_prev} = $self->{column};
4406     $self->{column}++;
4407     $self->{nc}
4408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4409     } else {
4410     $self->{set_nc}->($self);
4411     }
4412    
4413     return ($self->{ct}); # ENTITY
4414     redo A;
4415     } elsif ($self->{nc} == 0x004E or # N
4416     $self->{nc} == 0x006E) { # n
4417    
4418     $self->{state} = NDATA_STATE;
4419     $self->{kwd} = chr $self->{nc};
4420    
4421     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4422     $self->{line_prev} = $self->{line};
4423     $self->{column_prev} = $self->{column};
4424     $self->{column}++;
4425     $self->{nc}
4426     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4427     } else {
4428     $self->{set_nc}->($self);
4429     }
4430    
4431     redo A;
4432     } elsif ($self->{nc} == -1) {
4433    
4434     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4435     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4436     ## reconsume
4437     return ($self->{ct}); # ENTITY
4438     redo A;
4439     } else {
4440    
4441     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4442     $self->{state} = BOGUS_MD_STATE;
4443    
4444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4445     $self->{line_prev} = $self->{line};
4446     $self->{column_prev} = $self->{column};
4447     $self->{column}++;
4448     $self->{nc}
4449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4450     } else {
4451     $self->{set_nc}->($self);
4452     }
4453    
4454     redo A;
4455     }
4456 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4457     if ($self->{nc} == 0x003E) { # >
4458    
4459     $self->{state} = DATA_STATE;
4460 wakaba 1.5 $self->{s_kwd} = '';
4461 wakaba 1.1
4462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4463     $self->{line_prev} = $self->{line};
4464     $self->{column_prev} = $self->{column};
4465     $self->{column}++;
4466     $self->{nc}
4467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4468     } else {
4469     $self->{set_nc}->($self);
4470     }
4471    
4472    
4473     return ($self->{ct}); # DOCTYPE
4474    
4475     redo A;
4476 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4477 wakaba 1.13
4478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4479     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4480     $self->{in_subset} = 1;
4481    
4482 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4483     $self->{line_prev} = $self->{line};
4484     $self->{column_prev} = $self->{column};
4485     $self->{column}++;
4486     $self->{nc}
4487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4488     } else {
4489     $self->{set_nc}->($self);
4490     }
4491    
4492 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4493     redo A;
4494 wakaba 1.1 } elsif ($self->{nc} == -1) {
4495    
4496     $self->{state} = DATA_STATE;
4497 wakaba 1.5 $self->{s_kwd} = '';
4498 wakaba 1.1 ## reconsume
4499    
4500     return ($self->{ct}); # DOCTYPE
4501    
4502     redo A;
4503     } else {
4504    
4505     my $s = '';
4506 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4507 wakaba 1.1
4508     ## Stay in the state
4509    
4510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4511     $self->{line_prev} = $self->{line};
4512     $self->{column_prev} = $self->{column};
4513     $self->{column}++;
4514     $self->{nc}
4515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4516     } else {
4517     $self->{set_nc}->($self);
4518     }
4519    
4520     redo A;
4521     }
4522     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4523     ## NOTE: "CDATA section state" in the spec is jointly implemented
4524     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4525     ## and |CDATA_SECTION_MSE2_STATE|.
4526 wakaba 1.10
4527     ## XML5: "CDATA state".
4528 wakaba 1.1
4529     if ($self->{nc} == 0x005D) { # ]
4530    
4531     $self->{state} = CDATA_SECTION_MSE1_STATE;
4532    
4533     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4534     $self->{line_prev} = $self->{line};
4535     $self->{column_prev} = $self->{column};
4536     $self->{column}++;
4537     $self->{nc}
4538     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4539     } else {
4540     $self->{set_nc}->($self);
4541     }
4542    
4543     redo A;
4544     } elsif ($self->{nc} == -1) {
4545 wakaba 1.6 if ($self->{is_xml}) {
4546 wakaba 1.8
4547 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4548 wakaba 1.8 } else {
4549    
4550 wakaba 1.6 }
4551    
4552 wakaba 1.1 $self->{state} = DATA_STATE;
4553 wakaba 1.5 $self->{s_kwd} = '';
4554 wakaba 1.10 ## Reconsume.
4555 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4556    
4557     return ($self->{ct}); # character
4558     } else {
4559    
4560     ## No token to emit. $self->{ct} is discarded.
4561     }
4562     redo A;
4563     } else {
4564    
4565     $self->{ct}->{data} .= chr $self->{nc};
4566     $self->{read_until}->($self->{ct}->{data},
4567     q<]>,
4568     length $self->{ct}->{data});
4569    
4570     ## Stay in the state.
4571    
4572     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4573     $self->{line_prev} = $self->{line};
4574     $self->{column_prev} = $self->{column};
4575     $self->{column}++;
4576     $self->{nc}
4577     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4578     } else {
4579     $self->{set_nc}->($self);
4580     }
4581    
4582     redo A;
4583     }
4584    
4585     ## ISSUE: "text tokens" in spec.
4586     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4587 wakaba 1.10 ## XML5: "CDATA bracket state".
4588    
4589 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4590    
4591     $self->{state} = CDATA_SECTION_MSE2_STATE;
4592    
4593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4594     $self->{line_prev} = $self->{line};
4595     $self->{column_prev} = $self->{column};
4596     $self->{column}++;
4597     $self->{nc}
4598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4599     } else {
4600     $self->{set_nc}->($self);
4601     }
4602    
4603     redo A;
4604     } else {
4605    
4606 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4607 wakaba 1.1 $self->{ct}->{data} .= ']';
4608 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4609 wakaba 1.1 ## Reconsume.
4610     redo A;
4611     }
4612     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4613 wakaba 1.10 ## XML5: "CDATA end state".
4614    
4615 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4616     $self->{state} = DATA_STATE;
4617 wakaba 1.5 $self->{s_kwd} = '';
4618 wakaba 1.1
4619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4620     $self->{line_prev} = $self->{line};
4621     $self->{column_prev} = $self->{column};
4622     $self->{column}++;
4623     $self->{nc}
4624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4625     } else {
4626     $self->{set_nc}->($self);
4627     }
4628    
4629     if (length $self->{ct}->{data}) { # character
4630    
4631     return ($self->{ct}); # character
4632     } else {
4633    
4634     ## No token to emit. $self->{ct} is discarded.
4635     }
4636     redo A;
4637     } elsif ($self->{nc} == 0x005D) { # ]
4638     # character
4639     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4640     ## Stay in the state.
4641    
4642     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4643     $self->{line_prev} = $self->{line};
4644     $self->{column_prev} = $self->{column};
4645     $self->{column}++;
4646     $self->{nc}
4647     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4648     } else {
4649     $self->{set_nc}->($self);
4650     }
4651    
4652     redo A;
4653     } else {
4654    
4655     $self->{ct}->{data} .= ']]'; # character
4656     $self->{state} = CDATA_SECTION_STATE;
4657 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4658 wakaba 1.1 redo A;
4659     }
4660     } elsif ($self->{state} == ENTITY_STATE) {
4661     if ($is_space->{$self->{nc}} or
4662     {
4663     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4664     $self->{entity_add} => 1,
4665     }->{$self->{nc}}) {
4666 wakaba 1.22 if ($self->{is_xml}) {
4667    
4668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4669     line => $self->{line_prev},
4670     column => $self->{column_prev}
4671     + ($self->{nc} == -1 ? 1 : 0));
4672     } else {
4673    
4674     ## No error
4675     }
4676 wakaba 1.1 ## Don't consume
4677     ## Return nothing.
4678     #
4679     } elsif ($self->{nc} == 0x0023) { # #
4680    
4681     $self->{state} = ENTITY_HASH_STATE;
4682 wakaba 1.12 $self->{kwd} = '#';
4683 wakaba 1.1
4684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4685     $self->{line_prev} = $self->{line};
4686     $self->{column_prev} = $self->{column};
4687     $self->{column}++;
4688     $self->{nc}
4689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4690     } else {
4691     $self->{set_nc}->($self);
4692     }
4693    
4694     redo A;
4695 wakaba 1.22 } elsif ($self->{is_xml} or
4696     (0x0041 <= $self->{nc} and
4697 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4698     (0x0061 <= $self->{nc} and
4699     $self->{nc} <= 0x007A)) { # a..z
4700    
4701     require Whatpm::_NamedEntityList;
4702     $self->{state} = ENTITY_NAME_STATE;
4703 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4704     $self->{entity__value} = $self->{kwd};
4705 wakaba 1.1 $self->{entity__match} = 0;
4706    
4707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4708     $self->{line_prev} = $self->{line};
4709     $self->{column_prev} = $self->{column};
4710     $self->{column}++;
4711     $self->{nc}
4712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4713     } else {
4714     $self->{set_nc}->($self);
4715     }
4716    
4717     redo A;
4718     } else {
4719    
4720     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4721     ## Return nothing.
4722     #
4723     }
4724    
4725     ## NOTE: No character is consumed by the "consume a character
4726     ## reference" algorithm. In other words, there is an "&" character
4727     ## that does not introduce a character reference, which would be
4728     ## appended to the parent element or the attribute value in later
4729     ## processing by the tokenizer.
4730    
4731     if ($self->{prev_state} == DATA_STATE) {
4732    
4733     $self->{state} = $self->{prev_state};
4734 wakaba 1.5 $self->{s_kwd} = '';
4735 wakaba 1.1 ## Reconsume.
4736     return ({type => CHARACTER_TOKEN, data => '&',
4737     line => $self->{line_prev},
4738     column => $self->{column_prev},
4739     });
4740     redo A;
4741     } else {
4742    
4743     $self->{ca}->{value} .= '&';
4744     $self->{state} = $self->{prev_state};
4745 wakaba 1.5 $self->{s_kwd} = '';
4746 wakaba 1.1 ## Reconsume.
4747     redo A;
4748     }
4749     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4750 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4751 wakaba 1.1
4752     $self->{state} = HEXREF_X_STATE;
4753 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4754 wakaba 1.1
4755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4756     $self->{line_prev} = $self->{line};
4757     $self->{column_prev} = $self->{column};
4758     $self->{column}++;
4759     $self->{nc}
4760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4761     } else {
4762     $self->{set_nc}->($self);
4763     }
4764    
4765     redo A;
4766 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4767    
4768     if ($self->{is_xml}) {
4769     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4770     }
4771     $self->{state} = HEXREF_X_STATE;
4772     $self->{kwd} .= chr $self->{nc};
4773    
4774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4775     $self->{line_prev} = $self->{line};
4776     $self->{column_prev} = $self->{column};
4777     $self->{column}++;
4778     $self->{nc}
4779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4780     } else {
4781     $self->{set_nc}->($self);
4782     }
4783    
4784     redo A;
4785 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4786     $self->{nc} <= 0x0039) { # 0..9
4787    
4788     $self->{state} = NCR_NUM_STATE;
4789 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4790 wakaba 1.1
4791     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4792     $self->{line_prev} = $self->{line};
4793     $self->{column_prev} = $self->{column};
4794     $self->{column}++;
4795     $self->{nc}
4796     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4797     } else {
4798     $self->{set_nc}->($self);
4799     }
4800    
4801     redo A;
4802     } else {
4803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4804     line => $self->{line_prev},
4805     column => $self->{column_prev} - 1);
4806    
4807     ## NOTE: According to the spec algorithm, nothing is returned,
4808     ## and then "&#" is appended to the parent element or the attribute
4809     ## value in the later processing.
4810    
4811     if ($self->{prev_state} == DATA_STATE) {
4812    
4813     $self->{state} = $self->{prev_state};
4814 wakaba 1.5 $self->{s_kwd} = '';
4815 wakaba 1.1 ## Reconsume.
4816     return ({type => CHARACTER_TOKEN,
4817     data => '&#',
4818     line => $self->{line_prev},
4819     column => $self->{column_prev} - 1,
4820     });
4821     redo A;
4822     } else {
4823    
4824     $self->{ca}->{value} .= '&#';
4825     $self->{state} = $self->{prev_state};
4826 wakaba 1.5 $self->{s_kwd} = '';
4827 wakaba 1.1 ## Reconsume.
4828     redo A;
4829     }
4830     }
4831     } elsif ($self->{state} == NCR_NUM_STATE) {
4832     if (0x0030 <= $self->{nc} and
4833     $self->{nc} <= 0x0039) { # 0..9
4834    
4835 wakaba 1.12 $self->{kwd} *= 10;
4836     $self->{kwd} += $self->{nc} - 0x0030;
4837 wakaba 1.1
4838     ## Stay in the state.
4839    
4840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4841     $self->{line_prev} = $self->{line};
4842     $self->{column_prev} = $self->{column};
4843     $self->{column}++;
4844     $self->{nc}
4845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4846     } else {
4847     $self->{set_nc}->($self);
4848     }
4849    
4850     redo A;
4851     } elsif ($self->{nc} == 0x003B) { # ;
4852    
4853    
4854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4855     $self->{line_prev} = $self->{line};
4856     $self->{column_prev} = $self->{column};
4857     $self->{column}++;
4858     $self->{nc}
4859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4860     } else {
4861     $self->{set_nc}->($self);
4862     }
4863    
4864     #
4865     } else {
4866    
4867     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4868     ## Reconsume.
4869     #
4870     }
4871    
4872 wakaba 1.12 my $code = $self->{kwd};
4873 wakaba 1.1 my $l = $self->{line_prev};
4874     my $c = $self->{column_prev};
4875 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4876     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4877     ($self->{is_xml} and $code == 0x0000)) {
4878 wakaba 1.1
4879     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4880     text => (sprintf 'U+%04X', $code),
4881     line => $l, column => $c);
4882     $code = $charref_map->{$code};
4883     } elsif ($code > 0x10FFFF) {
4884    
4885     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4886     text => (sprintf 'U-%08X', $code),
4887     line => $l, column => $c);
4888     $code = 0xFFFD;
4889     }
4890    
4891     if ($self->{prev_state} == DATA_STATE) {
4892    
4893     $self->{state} = $self->{prev_state};
4894 wakaba 1.5 $self->{s_kwd} = '';
4895 wakaba 1.1 ## Reconsume.
4896     return ({type => CHARACTER_TOKEN, data => chr $code,
4897 wakaba 1.7 has_reference => 1,
4898 wakaba 1.1 line => $l, column => $c,
4899     });
4900     redo A;
4901     } else {
4902    
4903     $self->{ca}->{value} .= chr $code;
4904     $self->{ca}->{has_reference} = 1;
4905     $self->{state} = $self->{prev_state};
4906 wakaba 1.5 $self->{s_kwd} = '';
4907 wakaba 1.1 ## Reconsume.
4908     redo A;
4909     }
4910     } elsif ($self->{state} == HEXREF_X_STATE) {
4911     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4912     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4913     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4914     # 0..9, A..F, a..f
4915    
4916     $self->{state} = HEXREF_HEX_STATE;
4917 wakaba 1.12 $self->{kwd} = 0;
4918 wakaba 1.1 ## Reconsume.
4919     redo A;
4920     } else {
4921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4922     line => $self->{line_prev},
4923     column => $self->{column_prev} - 2);
4924    
4925     ## NOTE: According to the spec algorithm, nothing is returned,
4926     ## and then "&#" followed by "X" or "x" is appended to the parent
4927     ## element or the attribute value in the later processing.
4928    
4929     if ($self->{prev_state} == DATA_STATE) {
4930    
4931     $self->{state} = $self->{prev_state};
4932 wakaba 1.5 $self->{s_kwd} = '';
4933 wakaba 1.1 ## Reconsume.
4934     return ({type => CHARACTER_TOKEN,
4935 wakaba 1.12 data => '&' . $self->{kwd},
4936 wakaba 1.1 line => $self->{line_prev},
4937 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4938 wakaba 1.1 });
4939     redo A;
4940     } else {
4941    
4942 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4943 wakaba 1.1 $self->{state} = $self->{prev_state};
4944 wakaba 1.5 $self->{s_kwd} = '';
4945 wakaba 1.1 ## Reconsume.
4946     redo A;
4947     }
4948     }
4949     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4950     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4951     # 0..9
4952    
4953 wakaba 1.12 $self->{kwd} *= 0x10;
4954     $self->{kwd} += $self->{nc} - 0x0030;
4955 wakaba 1.1 ## Stay in the state.
4956    
4957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4958     $self->{line_prev} = $self->{line};
4959     $self->{column_prev} = $self->{column};
4960     $self->{column}++;
4961     $self->{nc}
4962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4963     } else {
4964     $self->{set_nc}->($self);
4965     }
4966    
4967     redo A;
4968     } elsif (0x0061 <= $self->{nc} and
4969     $self->{nc} <= 0x0066) { # a..f
4970    
4971 wakaba 1.12 $self->{kwd} *= 0x10;
4972     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4973 wakaba 1.1 ## Stay in the state.
4974    
4975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4976     $self->{line_prev} = $self->{line};
4977     $self->{column_prev} = $self->{column};
4978     $self->{column}++;
4979     $self->{nc}
4980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4981     } else {
4982     $self->{set_nc}->($self);
4983     }
4984    
4985     redo A;
4986     } elsif (0x0041 <= $self->{nc} and
4987     $self->{nc} <= 0x0046) { # A..F
4988    
4989 wakaba 1.12 $self->{kwd} *= 0x10;
4990     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4991 wakaba 1.1 ## Stay in the state.
4992    
4993     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4994     $self->{line_prev} = $self->{line};
4995     $self->{column_prev} = $self->{column};
4996     $self->{column}++;
4997     $self->{nc}
4998     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4999     } else {
5000     $self->{set_nc}->($self);
5001     }
5002    
5003     redo A;
5004     } elsif ($self->{nc} == 0x003B) { # ;
5005    
5006    
5007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5008     $self->{line_prev} = $self->{line};
5009     $self->{column_prev} = $self->{column};
5010     $self->{column}++;
5011     $self->{nc}
5012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5013     } else {
5014     $self->{set_nc}->($self);
5015     }
5016    
5017     #
5018     } else {
5019    
5020     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5021     line => $self->{line},
5022     column => $self->{column});
5023     ## Reconsume.
5024     #
5025     }
5026    
5027 wakaba 1.12 my $code = $self->{kwd};
5028 wakaba 1.1 my $l = $self->{line_prev};
5029     my $c = $self->{column_prev};
5030 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5031     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5032     ($self->{is_xml} and $code == 0x0000)) {
5033 wakaba 1.1
5034     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5035     text => (sprintf 'U+%04X', $code),
5036     line => $l, column => $c);
5037     $code = $charref_map->{$code};
5038     } elsif ($code > 0x10FFFF) {
5039    
5040     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5041     text => (sprintf 'U-%08X', $code),
5042     line => $l, column => $c);
5043     $code = 0xFFFD;
5044     }
5045    
5046     if ($self->{prev_state} == DATA_STATE) {
5047    
5048     $self->{state} = $self->{prev_state};
5049 wakaba 1.5 $self->{s_kwd} = '';
5050 wakaba 1.1 ## Reconsume.
5051     return ({type => CHARACTER_TOKEN, data => chr $code,
5052 wakaba 1.7 has_reference => 1,
5053 wakaba 1.1 line => $l, column => $c,
5054     });
5055     redo A;
5056     } else {
5057    
5058     $self->{ca}->{value} .= chr $code;
5059     $self->{ca}->{has_reference} = 1;
5060     $self->{state} = $self->{prev_state};
5061 wakaba 1.5 $self->{s_kwd} = '';
5062 wakaba 1.1 ## Reconsume.
5063     redo A;
5064     }
5065     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5066 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5067     $self->{nc} <= 0x005A) or # Z
5068     (0x0061 <= $self->{nc} and # a
5069     $self->{nc} <= 0x007A) or # z
5070     (0x0030 <= $self->{nc} and # 0
5071     $self->{nc} <= 0x0039) or # 9
5072 wakaba 1.22 $self->{nc} == 0x003B or # ;
5073     ($self->{is_xml} and
5074     not ($is_space->{$self->{nc}} or
5075     {
5076     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5077     $self->{entity_add} => 1,
5078     }->{$self->{nc}}))) {
5079 wakaba 1.1 our $EntityChar;
5080 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5081 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5082     $self->{ge}->{$self->{kwd}}) {
5083 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5084 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5085     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5086    
5087     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5088     } else {
5089     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5090    
5091     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5092     value => $self->{kwd});
5093     } else {
5094    
5095     }
5096     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5097     }
5098     } else {
5099     if ($self->{is_xml}) {
5100    
5101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5102     value => $self->{kwd},
5103     level => {
5104     'amp;' => $self->{level}->{warn},
5105     'quot;' => $self->{level}->{warn},
5106     'lt;' => $self->{level}->{warn},
5107     'gt;' => $self->{level}->{warn},
5108     'apos;' => $self->{level}->{warn},
5109     }->{$self->{kwd}} ||
5110     $self->{level}->{must});
5111     } else {
5112    
5113     }
5114     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5115     }
5116 wakaba 1.1 $self->{entity__match} = 1;
5117    
5118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5119     $self->{line_prev} = $self->{line};
5120     $self->{column_prev} = $self->{column};
5121     $self->{column}++;
5122     $self->{nc}
5123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5124     } else {
5125     $self->{set_nc}->($self);
5126     }
5127    
5128     #
5129     } else {
5130    
5131 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5132 wakaba 1.1 $self->{entity__match} = -1;
5133     ## Stay in the state.
5134    
5135     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5136     $self->{line_prev} = $self->{line};
5137     $self->{column_prev} = $self->{column};
5138     $self->{column}++;
5139     $self->{nc}
5140     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5141     } else {
5142     $self->{set_nc}->($self);
5143     }
5144    
5145     redo A;
5146     }
5147     } else {
5148    
5149     $self->{entity__value} .= chr $self->{nc};
5150     $self->{entity__match} *= 2;
5151     ## Stay in the state.
5152    
5153     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5154     $self->{line_prev} = $self->{line};
5155     $self->{column_prev} = $self->{column};
5156     $self->{column}++;
5157     $self->{nc}
5158     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5159     } else {
5160     $self->{set_nc}->($self);
5161     }
5162    
5163     redo A;
5164     }
5165     }
5166    
5167     my $data;
5168     my $has_ref;
5169     if ($self->{entity__match} > 0) {
5170    
5171     $data = $self->{entity__value};
5172     $has_ref = 1;
5173     #
5174     } elsif ($self->{entity__match} < 0) {
5175     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5176     if ($self->{prev_state} != DATA_STATE and # in attribute
5177     $self->{entity__match} < -1) {
5178    
5179 wakaba 1.12 $data = '&' . $self->{kwd};
5180 wakaba 1.1 #
5181     } else {
5182    
5183     $data = $self->{entity__value};
5184     $has_ref = 1;
5185     #
5186     }
5187     } else {
5188    
5189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5190     line => $self->{line_prev},
5191 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5192     $data = '&' . $self->{kwd};
5193 wakaba 1.1 #
5194     }
5195    
5196     ## NOTE: In these cases, when a character reference is found,
5197     ## it is consumed and a character token is returned, or, otherwise,
5198     ## nothing is consumed and returned, according to the spec algorithm.
5199     ## In this implementation, anything that has been examined by the
5200     ## tokenizer is appended to the parent element or the attribute value
5201     ## as string, either literal string when no character reference or
5202     ## entity-replaced string otherwise, in this stage, since any characters
5203     ## that would not be consumed are appended in the data state or in an
5204     ## appropriate attribute value state anyway.
5205    
5206     if ($self->{prev_state} == DATA_STATE) {
5207    
5208     $self->{state} = $self->{prev_state};
5209 wakaba 1.5 $self->{s_kwd} = '';
5210 wakaba 1.1 ## Reconsume.
5211     return ({type => CHARACTER_TOKEN,
5212     data => $data,
5213 wakaba 1.7 has_reference => $has_ref,
5214 wakaba 1.1 line => $self->{line_prev},
5215 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5216 wakaba 1.1 });
5217     redo A;
5218     } else {
5219    
5220     $self->{ca}->{value} .= $data;
5221     $self->{ca}->{has_reference} = 1 if $has_ref;
5222     $self->{state} = $self->{prev_state};
5223 wakaba 1.5 $self->{s_kwd} = '';
5224 wakaba 1.1 ## Reconsume.
5225     redo A;
5226     }
5227 wakaba 1.8
5228     ## XML-only states
5229    
5230     } elsif ($self->{state} == PI_STATE) {
5231 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5232    
5233 wakaba 1.8 if ($is_space->{$self->{nc}} or
5234 wakaba 1.14 $self->{nc} == 0x003F or # ?
5235 wakaba 1.8 $self->{nc} == -1) {
5236 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5237     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5238     ## "DOCTYPE pi state": Parse error, switch to the "data
5239     ## state".
5240 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5241     line => $self->{line_prev},
5242     column => $self->{column_prev}
5243     - 1 * ($self->{nc} != -1));
5244     $self->{state} = BOGUS_COMMENT_STATE;
5245     ## Reconsume.
5246     $self->{ct} = {type => COMMENT_TOKEN,
5247     data => '?',
5248     line => $self->{line_prev},
5249     column => $self->{column_prev}
5250     - 1 * ($self->{nc} != -1),
5251     };
5252     redo A;
5253     } else {
5254 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5255 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5256     target => chr $self->{nc},
5257     data => '',
5258     line => $self->{line_prev},
5259     column => $self->{column_prev} - 1,
5260     };
5261     $self->{state} = PI_TARGET_STATE;
5262    
5263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5264     $self->{line_prev} = $self->{line};
5265     $self->{column_prev} = $self->{column};
5266     $self->{column}++;
5267     $self->{nc}
5268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5269     } else {
5270     $self->{set_nc}->($self);
5271     }
5272    
5273     redo A;
5274     }
5275     } elsif ($self->{state} == PI_TARGET_STATE) {
5276     if ($is_space->{$self->{nc}}) {
5277     $self->{state} = PI_TARGET_AFTER_STATE;
5278    
5279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5280     $self->{line_prev} = $self->{line};
5281     $self->{column_prev} = $self->{column};
5282     $self->{column}++;
5283     $self->{nc}
5284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5285     } else {
5286     $self->{set_nc}->($self);
5287     }
5288    
5289     redo A;
5290     } elsif ($self->{nc} == -1) {
5291     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5292 wakaba 1.13 if ($self->{in_subset}) {
5293     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5294     } else {
5295     $self->{state} = DATA_STATE;
5296     $self->{s_kwd} = '';
5297     }
5298 wakaba 1.8 ## Reconsume.
5299     return ($self->{ct}); # pi
5300     redo A;
5301     } elsif ($self->{nc} == 0x003F) { # ?
5302     $self->{state} = PI_AFTER_STATE;
5303    
5304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5305     $self->{line_prev} = $self->{line};
5306     $self->{column_prev} = $self->{column};
5307     $self->{column}++;
5308     $self->{nc}
5309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5310     } else {
5311     $self->{set_nc}->($self);
5312     }
5313    
5314     redo A;
5315     } else {
5316     ## XML5: typo ("tag name" -> "target")
5317     $self->{ct}->{target} .= chr $self->{nc}; # pi
5318    
5319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5320     $self->{line_prev} = $self->{line};
5321     $self->{column_prev} = $self->{column};
5322     $self->{column}++;
5323     $self->{nc}
5324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5325     } else {
5326     $self->{set_nc}->($self);
5327     }
5328    
5329     redo A;
5330     }
5331     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5332     if ($is_space->{$self->{nc}}) {
5333     ## Stay in the state.
5334    
5335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5336     $self->{line_prev} = $self->{line};
5337     $self->{column_prev} = $self->{column};
5338     $self->{column}++;
5339     $self->{nc}
5340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5341     } else {
5342     $self->{set_nc}->($self);
5343     }
5344    
5345     redo A;
5346     } else {
5347     $self->{state} = PI_DATA_STATE;
5348     ## Reprocess.
5349     redo A;
5350     }
5351     } elsif ($self->{state} == PI_DATA_STATE) {
5352     if ($self->{nc} == 0x003F) { # ?
5353     $self->{state} = PI_DATA_AFTER_STATE;
5354    
5355     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5356     $self->{line_prev} = $self->{line};
5357     $self->{column_prev} = $self->{column};
5358     $self->{column}++;
5359     $self->{nc}
5360     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5361     } else {
5362     $self->{set_nc}->($self);
5363     }
5364    
5365     redo A;
5366     } elsif ($self->{nc} == -1) {
5367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5368 wakaba 1.13 if ($self->{in_subset}) {
5369 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5370 wakaba 1.13 } else {
5371     $self->{state} = DATA_STATE;
5372     $self->{s_kwd} = '';
5373     }
5374 wakaba 1.8 ## Reprocess.
5375     return ($self->{ct}); # pi
5376     redo A;
5377     } else {
5378     $self->{ct}->{data} .= chr $self->{nc}; # pi
5379     $self->{read_until}->($self->{ct}->{data}, q[?],
5380     length $self->{ct}->{data});
5381     ## Stay in the state.
5382    
5383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384     $self->{line_prev} = $self->{line};
5385     $self->{column_prev} = $self->{column};
5386     $self->{column}++;
5387     $self->{nc}
5388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389     } else {
5390     $self->{set_nc}->($self);
5391     }
5392    
5393     ## Reprocess.
5394     redo A;
5395     }
5396     } elsif ($self->{state} == PI_AFTER_STATE) {
5397 wakaba 1.14 ## XML5: Part of "Pi after state".
5398    
5399 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5400 wakaba 1.13 if ($self->{in_subset}) {
5401     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5402     } else {
5403     $self->{state} = DATA_STATE;
5404     $self->{s_kwd} = '';
5405     }
5406 wakaba 1.8
5407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5408     $self->{line_prev} = $self->{line};
5409     $self->{column_prev} = $self->{column};
5410     $self->{column}++;
5411     $self->{nc}
5412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5413     } else {
5414     $self->{set_nc}->($self);
5415     }
5416    
5417     return ($self->{ct}); # pi
5418     redo A;
5419     } elsif ($self->{nc} == 0x003F) { # ?
5420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5421     line => $self->{line_prev},
5422     column => $self->{column_prev}); ## XML5: no error
5423     $self->{ct}->{data} .= '?';
5424     $self->{state} = PI_DATA_AFTER_STATE;
5425    
5426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5427     $self->{line_prev} = $self->{line};
5428     $self->{column_prev} = $self->{column};
5429     $self->{column}++;
5430     $self->{nc}
5431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5432     } else {
5433     $self->{set_nc}->($self);
5434     }
5435    
5436     redo A;
5437     } else {
5438     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5439     line => $self->{line_prev},
5440     column => $self->{column_prev}
5441     + 1 * ($self->{nc} == -1)); ## XML5: no error
5442     $self->{ct}->{data} .= '?'; ## XML5: not appended
5443     $self->{state} = PI_DATA_STATE;
5444     ## Reprocess.
5445     redo A;
5446     }
5447     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5448 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5449    
5450 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5451 wakaba 1.13 if ($self->{in_subset}) {
5452     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5453     } else {
5454     $self->{state} = DATA_STATE;
5455     $self->{s_kwd} = '';
5456     }
5457 wakaba 1.8
5458     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5459     $self->{line_prev} = $self->{line};
5460     $self->{column_prev} = $self->{column};
5461     $self->{column}++;
5462     $self->{nc}
5463     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5464     } else {
5465     $self->{set_nc}->($self);
5466     }
5467    
5468     return ($self->{ct}); # pi
5469     redo A;
5470     } elsif ($self->{nc} == 0x003F) { # ?
5471     $self->{ct}->{data} .= '?';
5472     ## Stay in the state.
5473    
5474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475     $self->{line_prev} = $self->{line};
5476     $self->{column_prev} = $self->{column};
5477     $self->{column}++;
5478     $self->{nc}
5479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480     } else {
5481     $self->{set_nc}->($self);
5482     }
5483    
5484     redo A;
5485     } else {
5486     $self->{ct}->{data} .= '?'; ## XML5: not appended
5487     $self->{state} = PI_DATA_STATE;
5488     ## Reprocess.
5489     redo A;
5490     }
5491 wakaba 1.12
5492     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5493     if ($self->{nc} == 0x003C) { # <
5494 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5495 wakaba 1.12
5496     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5497     $self->{line_prev} = $self->{line};
5498     $self->{column_prev} = $self->{column};
5499     $self->{column}++;
5500     $self->{nc}
5501     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5502     } else {
5503     $self->{set_nc}->($self);
5504     }
5505    
5506     redo A;
5507     } elsif ($self->{nc} == 0x0025) { # %
5508     ## XML5: Not defined yet.
5509    
5510     ## TODO:
5511 wakaba 1.24
5512     if (not $self->{stop_processing} and
5513     not $self->{document}->xml_standalone) {
5514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5515     level => $self->{level}->{info});
5516     $self->{stop_processing} = 1;
5517     }
5518    
5519 wakaba 1.12
5520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5521     $self->{line_prev} = $self->{line};
5522     $self->{column_prev} = $self->{column};
5523     $self->{column}++;
5524     $self->{nc}
5525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5526     } else {
5527     $self->{set_nc}->($self);
5528     }
5529    
5530     redo A;
5531     } elsif ($self->{nc} == 0x005D) { # ]
5532 wakaba 1.13 delete $self->{in_subset};
5533 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5534    
5535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5536     $self->{line_prev} = $self->{line};
5537     $self->{column_prev} = $self->{column};
5538     $self->{column}++;
5539     $self->{nc}
5540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5541     } else {
5542     $self->{set_nc}->($self);
5543     }
5544    
5545     redo A;
5546     } elsif ($is_space->{$self->{nc}}) {
5547     ## Stay in the state.
5548    
5549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5550     $self->{line_prev} = $self->{line};
5551     $self->{column_prev} = $self->{column};
5552     $self->{column}++;
5553     $self->{nc}
5554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5555     } else {
5556     $self->{set_nc}->($self);
5557     }
5558    
5559     redo A;
5560     } elsif ($self->{nc} == -1) {
5561     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5562 wakaba 1.13 delete $self->{in_subset};
5563 wakaba 1.12 $self->{state} = DATA_STATE;
5564     $self->{s_kwd} = '';
5565     ## Reconsume.
5566 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5567 wakaba 1.12 redo A;
5568     } else {
5569     unless ($self->{internal_subset_tainted}) {
5570     ## XML5: No parse error.
5571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5572     $self->{internal_subset_tainted} = 1;
5573     }
5574     ## Stay in the state.
5575    
5576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5577     $self->{line_prev} = $self->{line};
5578     $self->{column_prev} = $self->{column};
5579     $self->{column}++;
5580     $self->{nc}
5581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5582     } else {
5583     $self->{set_nc}->($self);
5584     }
5585    
5586     redo A;
5587     }
5588     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5589     if ($self->{nc} == 0x003E) { # >
5590     $self->{state} = DATA_STATE;
5591     $self->{s_kwd} = '';
5592    
5593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5594     $self->{line_prev} = $self->{line};
5595     $self->{column_prev} = $self->{column};
5596     $self->{column}++;
5597     $self->{nc}
5598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5599     } else {
5600     $self->{set_nc}->($self);
5601     }
5602    
5603 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5604 wakaba 1.12 redo A;
5605     } elsif ($self->{nc} == -1) {
5606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5607     $self->{state} = DATA_STATE;
5608     $self->{s_kwd} = '';
5609     ## Reconsume.
5610 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5611 wakaba 1.12 redo A;
5612     } else {
5613     ## XML5: No parse error and stay in the state.
5614     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5615    
5616 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5617    
5618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5619     $self->{line_prev} = $self->{line};
5620     $self->{column_prev} = $self->{column};
5621     $self->{column}++;
5622     $self->{nc}
5623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5624     } else {
5625     $self->{set_nc}->($self);
5626     }
5627    
5628     redo A;
5629     }
5630     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5631     if ($self->{nc} == 0x003E) { # >
5632     $self->{state} = DATA_STATE;
5633     $self->{s_kwd} = '';
5634    
5635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5636     $self->{line_prev} = $self->{line};
5637     $self->{column_prev} = $self->{column};
5638     $self->{column}++;
5639     $self->{nc}
5640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5641     } else {
5642     $self->{set_nc}->($self);
5643     }
5644    
5645     return ({type => END_OF_DOCTYPE_TOKEN});
5646     redo A;
5647     } elsif ($self->{nc} == -1) {
5648     $self->{state} = DATA_STATE;
5649     $self->{s_kwd} = '';
5650     ## Reconsume.
5651     return ({type => END_OF_DOCTYPE_TOKEN});
5652     redo A;
5653     } else {
5654     ## Stay in the state.
5655    
5656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5657     $self->{line_prev} = $self->{line};
5658     $self->{column_prev} = $self->{column};
5659     $self->{column}++;
5660     $self->{nc}
5661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5662     } else {
5663     $self->{set_nc}->($self);
5664     }
5665    
5666     redo A;
5667     }
5668     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5669     if ($self->{nc} == 0x0021) { # !
5670 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5671 wakaba 1.13
5672     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5673     $self->{line_prev} = $self->{line};
5674     $self->{column_prev} = $self->{column};
5675     $self->{column}++;
5676     $self->{nc}
5677     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5678     } else {
5679     $self->{set_nc}->($self);
5680     }
5681    
5682     redo A;
5683     } elsif ($self->{nc} == 0x003F) { # ?
5684     $self->{state} = PI_STATE;
5685    
5686     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5687     $self->{line_prev} = $self->{line};
5688     $self->{column_prev} = $self->{column};
5689     $self->{column}++;
5690     $self->{nc}
5691     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5692     } else {
5693     $self->{set_nc}->($self);
5694     }
5695    
5696     redo A;
5697     } elsif ($self->{nc} == -1) {
5698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5699     $self->{state} = DATA_STATE;
5700     $self->{s_kwd} = '';
5701     ## Reconsume.
5702     redo A;
5703     } else {
5704     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5705     line => $self->{line_prev},
5706     column => $self->{column_prev});
5707     $self->{state} = BOGUS_COMMENT_STATE;
5708     $self->{ct} = {type => COMMENT_TOKEN,
5709     data => '',
5710     }; ## NOTE: Will be discarded.
5711 wakaba 1.12
5712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5713     $self->{line_prev} = $self->{line};
5714     $self->{column_prev} = $self->{column};
5715     $self->{column}++;
5716     $self->{nc}
5717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5718     } else {
5719     $self->{set_nc}->($self);
5720     }
5721    
5722     redo A;
5723     }
5724 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5725     ## XML5: "DOCTYPE markup declaration state".
5726    
5727     if ($self->{nc} == 0x002D) { # -
5728     $self->{state} = MD_HYPHEN_STATE;
5729    
5730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5731     $self->{line_prev} = $self->{line};
5732     $self->{column_prev} = $self->{column};
5733     $self->{column}++;
5734     $self->{nc}
5735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5736     } else {
5737     $self->{set_nc}->($self);
5738     }
5739    
5740     redo A;
5741 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5742     $self->{nc} == 0x0065) { # e
5743 wakaba 1.14 $self->{state} = MD_E_STATE;
5744     $self->{kwd} = chr $self->{nc};
5745    
5746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5747     $self->{line_prev} = $self->{line};
5748     $self->{column_prev} = $self->{column};
5749     $self->{column}++;
5750     $self->{nc}
5751     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5752     } else {
5753     $self->{set_nc}->($self);
5754     }
5755    
5756     redo A;
5757 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5758     $self->{nc} == 0x0061) { # a
5759 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5760     $self->{kwd} = chr $self->{nc};
5761    
5762     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5763     $self->{line_prev} = $self->{line};
5764     $self->{column_prev} = $self->{column};
5765     $self->{column}++;
5766     $self->{nc}
5767     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5768     } else {
5769     $self->{set_nc}->($self);
5770     }
5771    
5772     redo A;
5773 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5774     $self->{nc} == 0x006E) { # n
5775 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5776     $self->{kwd} = chr $self->{nc};
5777    
5778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5779     $self->{line_prev} = $self->{line};
5780     $self->{column_prev} = $self->{column};
5781     $self->{column}++;
5782     $self->{nc}
5783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5784     } else {
5785     $self->{set_nc}->($self);
5786     }
5787    
5788     redo A;
5789     } else {
5790     #
5791     }
5792    
5793     ## XML5: No parse error.
5794     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5795     line => $self->{line_prev},
5796     column => $self->{column_prev} - 1);
5797     ## Reconsume.
5798     $self->{state} = BOGUS_COMMENT_STATE;
5799     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5800     redo A;
5801     } elsif ($self->{state} == MD_E_STATE) {
5802 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5803     $self->{nc} == 0x006E) { # n
5804 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5805     $self->{kwd} .= chr $self->{nc};
5806    
5807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808     $self->{line_prev} = $self->{line};
5809     $self->{column_prev} = $self->{column};
5810     $self->{column}++;
5811     $self->{nc}
5812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813     } else {
5814     $self->{set_nc}->($self);
5815     }
5816    
5817     redo A;
5818 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5819     $self->{nc} == 0x006C) { # l
5820 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5821     $self->{state} = MD_ELEMENT_STATE;
5822     $self->{kwd} .= chr $self->{nc};
5823    
5824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5825     $self->{line_prev} = $self->{line};
5826     $self->{column_prev} = $self->{column};
5827     $self->{column}++;
5828     $self->{nc}
5829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5830     } else {
5831     $self->{set_nc}->($self);
5832     }
5833    
5834     redo A;
5835     } else {
5836     ## XML5: No parse error.
5837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5838     line => $self->{line_prev},
5839     column => $self->{column_prev} - 2
5840     + 1 * ($self->{nc} == -1));
5841     ## Reconsume.
5842     $self->{state} = BOGUS_COMMENT_STATE;
5843     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5844     redo A;
5845     }
5846     } elsif ($self->{state} == MD_ENTITY_STATE) {
5847 wakaba 1.17 if ($self->{nc} == [
5848     undef,
5849     undef,
5850     0x0054, # T
5851     0x0049, # I
5852     0x0054, # T
5853     ]->[length $self->{kwd}] or
5854     $self->{nc} == [
5855     undef,
5856     undef,
5857     0x0074, # t
5858     0x0069, # i
5859     0x0074, # t
5860     ]->[length $self->{kwd}]) {
5861 wakaba 1.14 ## Stay in the state.
5862     $self->{kwd} .= chr $self->{nc};
5863    
5864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5865     $self->{line_prev} = $self->{line};
5866     $self->{column_prev} = $self->{column};
5867     $self->{column}++;
5868     $self->{nc}
5869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5870     } else {
5871     $self->{set_nc}->($self);
5872     }
5873    
5874     redo A;
5875 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5876     ($self->{nc} == 0x0059 or # Y
5877     $self->{nc} == 0x0079)) { # y
5878     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5879     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5880     text => 'ENTITY',
5881     line => $self->{line_prev},
5882     column => $self->{column_prev} - 4);
5883     }
5884     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5885 wakaba 1.14 line => $self->{line_prev},
5886     column => $self->{column_prev} - 6};
5887     $self->{state} = DOCTYPE_MD_STATE;
5888    
5889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5890     $self->{line_prev} = $self->{line};
5891     $self->{column_prev} = $self->{column};
5892     $self->{column}++;
5893     $self->{nc}
5894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5895     } else {
5896     $self->{set_nc}->($self);
5897     }
5898    
5899     redo A;
5900     } else {
5901     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5902     line => $self->{line_prev},
5903     column => $self->{column_prev} - 1
5904     - (length $self->{kwd})
5905     + 1 * ($self->{nc} == -1));
5906     $self->{state} = BOGUS_COMMENT_STATE;
5907     ## Reconsume.
5908     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5909     redo A;
5910     }
5911     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5912 wakaba 1.17 if ($self->{nc} == [
5913     undef,
5914     undef,
5915     0x0045, # E
5916     0x004D, # M
5917     0x0045, # E
5918     0x004E, # N
5919     ]->[length $self->{kwd}] or
5920     $self->{nc} == [
5921     undef,
5922     undef,
5923     0x0065, # e
5924     0x006D, # m
5925     0x0065, # e
5926     0x006E, # n
5927     ]->[length $self->{kwd}]) {
5928 wakaba 1.14 ## Stay in the state.
5929     $self->{kwd} .= chr $self->{nc};
5930    
5931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5932     $self->{line_prev} = $self->{line};
5933     $self->{column_prev} = $self->{column};
5934     $self->{column}++;
5935     $self->{nc}
5936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5937     } else {
5938     $self->{set_nc}->($self);
5939     }
5940    
5941     redo A;
5942 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5943     ($self->{nc} == 0x0054 or # T
5944     $self->{nc} == 0x0074)) { # t
5945     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5946     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5947     text => 'ELEMENT',
5948     line => $self->{line_prev},
5949     column => $self->{column_prev} - 5);
5950     }
5951 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5952     line => $self->{line_prev},
5953 wakaba 1.23 column => $self->{column_prev} - 7};
5954 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5955    
5956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5957     $self->{line_prev} = $self->{line};
5958     $self->{column_prev} = $self->{column};
5959     $self->{column}++;
5960     $self->{nc}
5961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5962     } else {
5963     $self->{set_nc}->($self);
5964     }
5965    
5966     redo A;
5967     } else {
5968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5969     line => $self->{line_prev},
5970     column => $self->{column_prev} - 1
5971     - (length $self->{kwd})
5972     + 1 * ($self->{nc} == -1));
5973     $self->{state} = BOGUS_COMMENT_STATE;
5974     ## Reconsume.
5975     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5976     redo A;
5977     }
5978     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5979 wakaba 1.17 if ($self->{nc} == [
5980     undef,
5981     0x0054, # T
5982     0x0054, # T
5983     0x004C, # L
5984     0x0049, # I
5985     0x0053, # S
5986     ]->[length $self->{kwd}] or
5987     $self->{nc} == [
5988     undef,
5989     0x0074, # t
5990     0x0074, # t
5991     0x006C, # l
5992     0x0069, # i
5993     0x0073, # s
5994     ]->[length $self->{kwd}]) {
5995 wakaba 1.14 ## Stay in the state.
5996     $self->{kwd} .= chr $self->{nc};
5997    
5998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5999     $self->{line_prev} = $self->{line};
6000     $self->{column_prev} = $self->{column};
6001     $self->{column}++;
6002     $self->{nc}
6003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6004     } else {
6005     $self->{set_nc}->($self);
6006     }
6007    
6008     redo A;
6009 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6010     ($self->{nc} == 0x0054 or # T
6011     $self->{nc} == 0x0074)) { # t
6012     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6013     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6014     text => 'ATTLIST',
6015     line => $self->{line_prev},
6016     column => $self->{column_prev} - 5);
6017     }
6018 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6019 wakaba 1.15 attrdefs => [],
6020 wakaba 1.14 line => $self->{line_prev},
6021 wakaba 1.23 column => $self->{column_prev} - 7};
6022 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6023    
6024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6025     $self->{line_prev} = $self->{line};
6026     $self->{column_prev} = $self->{column};
6027     $self->{column}++;
6028     $self->{nc}
6029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6030     } else {
6031     $self->{set_nc}->($self);
6032     }
6033    
6034     redo A;
6035     } else {
6036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6037     line => $self->{line_prev},
6038     column => $self->{column_prev} - 1
6039     - (length $self->{kwd})
6040     + 1 * ($self->{nc} == -1));
6041     $self->{state} = BOGUS_COMMENT_STATE;
6042     ## Reconsume.
6043     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6044     redo A;
6045     }
6046     } elsif ($self->{state} == MD_NOTATION_STATE) {
6047 wakaba 1.17 if ($self->{nc} == [
6048     undef,
6049     0x004F, # O
6050     0x0054, # T
6051     0x0041, # A
6052     0x0054, # T
6053     0x0049, # I
6054     0x004F, # O
6055     ]->[length $self->{kwd}] or
6056     $self->{nc} == [
6057     undef,
6058     0x006F, # o
6059     0x0074, # t
6060     0x0061, # a
6061     0x0074, # t
6062     0x0069, # i
6063     0x006F, # o
6064     ]->[length $self->{kwd}]) {
6065 wakaba 1.14 ## Stay in the state.
6066     $self->{kwd} .= chr $self->{nc};
6067    
6068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6069     $self->{line_prev} = $self->{line};
6070     $self->{column_prev} = $self->{column};
6071     $self->{column}++;
6072     $self->{nc}
6073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6074     } else {
6075     $self->{set_nc}->($self);
6076     }
6077    
6078     redo A;
6079 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6080     ($self->{nc} == 0x004E or # N
6081     $self->{nc} == 0x006E)) { # n
6082     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6084     text => 'NOTATION',
6085     line => $self->{line_prev},
6086     column => $self->{column_prev} - 6);
6087     }
6088 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6089     line => $self->{line_prev},
6090 wakaba 1.23 column => $self->{column_prev} - 8};
6091 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6092    
6093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6094     $self->{line_prev} = $self->{line};
6095     $self->{column_prev} = $self->{column};
6096     $self->{column}++;
6097     $self->{nc}
6098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6099     } else {
6100     $self->{set_nc}->($self);
6101     }
6102    
6103     redo A;
6104     } else {
6105     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6106     line => $self->{line_prev},
6107     column => $self->{column_prev} - 1
6108     - (length $self->{kwd})
6109     + 1 * ($self->{nc} == -1));
6110     $self->{state} = BOGUS_COMMENT_STATE;
6111     ## Reconsume.
6112     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6113     redo A;
6114     }
6115     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6116     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6117     ## "DOCTYPE NOTATION state".
6118    
6119     if ($is_space->{$self->{nc}}) {
6120     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6121     $self->{state} = BEFORE_MD_NAME_STATE;
6122    
6123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6124     $self->{line_prev} = $self->{line};
6125     $self->{column_prev} = $self->{column};
6126     $self->{column}++;
6127     $self->{nc}
6128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6129     } else {
6130     $self->{set_nc}->($self);
6131     }
6132    
6133     redo A;
6134     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6135     $self->{nc} == 0x0025) { # %
6136     ## XML5: Switch to the "DOCTYPE bogus comment state".
6137     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6138     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6139    
6140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6141     $self->{line_prev} = $self->{line};
6142     $self->{column_prev} = $self->{column};
6143     $self->{column}++;
6144     $self->{nc}
6145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6146     } else {
6147     $self->{set_nc}->($self);
6148     }
6149    
6150     redo A;
6151     } elsif ($self->{nc} == -1) {
6152     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6153     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6154     ## Reconsume.
6155     redo A;
6156     } elsif ($self->{nc} == 0x003E) { # >
6157     ## XML5: Switch to the "DOCTYPE bogus comment state".
6158     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6159     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6160    
6161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6162     $self->{line_prev} = $self->{line};
6163     $self->{column_prev} = $self->{column};
6164     $self->{column}++;
6165     $self->{nc}
6166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6167     } else {
6168     $self->{set_nc}->($self);
6169     }
6170    
6171     redo A;
6172     } else {
6173     ## XML5: Switch to the "DOCTYPE bogus comment state".
6174     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6175     $self->{state} = BEFORE_MD_NAME_STATE;
6176     redo A;
6177     }
6178     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6179     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6180     ## before state", "DOCTYPE ATTLIST name before state".
6181    
6182     if ($is_space->{$self->{nc}}) {
6183     ## Stay in the state.
6184    
6185     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6186     $self->{line_prev} = $self->{line};
6187     $self->{column_prev} = $self->{column};
6188     $self->{column}++;
6189     $self->{nc}
6190     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6191     } else {
6192     $self->{set_nc}->($self);
6193     }
6194    
6195     redo A;
6196     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6197     $self->{nc} == 0x0025) { # %
6198     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6199    
6200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6201     $self->{line_prev} = $self->{line};
6202     $self->{column_prev} = $self->{column};
6203     $self->{column}++;
6204     $self->{nc}
6205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6206     } else {
6207     $self->{set_nc}->($self);
6208     }
6209    
6210     redo A;
6211     } elsif ($self->{nc} == 0x003E) { # >
6212     ## XML5: Same as "Anything else".
6213     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6214     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6215    
6216     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6217     $self->{line_prev} = $self->{line};
6218     $self->{column_prev} = $self->{column};
6219     $self->{column}++;
6220     $self->{nc}
6221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6222     } else {
6223     $self->{set_nc}->($self);
6224     }
6225    
6226     redo A;
6227     } elsif ($self->{nc} == -1) {
6228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6229     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6230     ## Reconsume.
6231     redo A;
6232     } else {
6233     ## XML5: [ATTLIST] Not defined yet.
6234     $self->{ct}->{name} .= chr $self->{nc};
6235     $self->{state} = MD_NAME_STATE;
6236    
6237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6238     $self->{line_prev} = $self->{line};
6239     $self->{column_prev} = $self->{column};
6240     $self->{column}++;
6241     $self->{nc}
6242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6243     } else {
6244     $self->{set_nc}->($self);
6245     }
6246    
6247     redo A;
6248     }
6249     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6250     if ($is_space->{$self->{nc}}) {
6251     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6252     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6253     $self->{state} = BEFORE_MD_NAME_STATE;
6254 wakaba 1.8
6255 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6256     $self->{line_prev} = $self->{line};
6257     $self->{column_prev} = $self->{column};
6258     $self->{column}++;
6259     $self->{nc}
6260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6261     } else {
6262     $self->{set_nc}->($self);
6263     }
6264    
6265     redo A;
6266     } elsif ($self->{nc} == 0x003E) { # >
6267     ## XML5: Same as "Anything else".
6268     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6269     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6270    
6271     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6272     $self->{line_prev} = $self->{line};
6273     $self->{column_prev} = $self->{column};
6274     $self->{column}++;
6275     $self->{nc}
6276     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6277     } else {
6278     $self->{set_nc}->($self);
6279     }
6280    
6281     redo A;
6282     } elsif ($self->{nc} == -1) {
6283     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6284     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6285     ## Reconsume.
6286     redo A;
6287     } else {
6288     ## XML5: No parse error.
6289     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6290     $self->{state} = BOGUS_COMMENT_STATE;
6291     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6292     ## Reconsume.
6293     redo A;
6294     }
6295     } elsif ($self->{state} == MD_NAME_STATE) {
6296     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6297    
6298     if ($is_space->{$self->{nc}}) {
6299 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6300     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6301     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6302 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6303 wakaba 1.16 } else { # ENTITY/NOTATION
6304     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6305     }
6306 wakaba 1.14
6307     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6308     $self->{line_prev} = $self->{line};
6309     $self->{column_prev} = $self->{column};
6310     $self->{column}++;
6311     $self->{nc}
6312     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6313     } else {
6314     $self->{set_nc}->($self);
6315     }
6316    
6317     redo A;
6318     } elsif ($self->{nc} == 0x003E) { # >
6319     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6320     #
6321     } else {
6322 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6323 wakaba 1.14 }
6324     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6325    
6326     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6327     $self->{line_prev} = $self->{line};
6328     $self->{column_prev} = $self->{column};
6329     $self->{column}++;
6330     $self->{nc}
6331     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6332     } else {
6333     $self->{set_nc}->($self);
6334     }
6335    
6336     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6337     redo A;
6338     } elsif ($self->{nc} == -1) {
6339     ## XML5: [ATTLIST] No parse error.
6340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6341     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6342     ## Reconsume.
6343     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6344     redo A;
6345     } else {
6346     ## XML5: [ATTLIST] Not defined yet.
6347     $self->{ct}->{name} .= chr $self->{nc};
6348     ## Stay in the state.
6349    
6350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6351     $self->{line_prev} = $self->{line};
6352     $self->{column_prev} = $self->{column};
6353     $self->{column}++;
6354     $self->{nc}
6355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6356     } else {
6357     $self->{set_nc}->($self);
6358     }
6359    
6360     redo A;
6361     }
6362     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6363     if ($is_space->{$self->{nc}}) {
6364     ## Stay in the state.
6365    
6366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367     $self->{line_prev} = $self->{line};
6368     $self->{column_prev} = $self->{column};
6369     $self->{column}++;
6370     $self->{nc}
6371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372     } else {
6373     $self->{set_nc}->($self);
6374     }
6375    
6376     redo A;
6377     } elsif ($self->{nc} == 0x003E) { # >
6378     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6379    
6380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6381     $self->{line_prev} = $self->{line};
6382     $self->{column_prev} = $self->{column};
6383     $self->{column}++;
6384     $self->{nc}
6385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6386     } else {
6387     $self->{set_nc}->($self);
6388     }
6389    
6390     return ($self->{ct}); # ATTLIST
6391     redo A;
6392     } elsif ($self->{nc} == -1) {
6393     ## XML5: No parse error.
6394     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6395     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6396 wakaba 1.15 return ($self->{ct});
6397 wakaba 1.14 redo A;
6398     } else {
6399     ## XML5: Not defined yet.
6400 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6401     tokens => [],
6402     line => $self->{line}, column => $self->{column}};
6403     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6404    
6405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6406     $self->{line_prev} = $self->{line};
6407     $self->{column_prev} = $self->{column};
6408     $self->{column}++;
6409     $self->{nc}
6410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6411     } else {
6412     $self->{set_nc}->($self);
6413     }
6414    
6415     redo A;
6416     }
6417     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6418     if ($is_space->{$self->{nc}}) {
6419     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6420    
6421     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6422     $self->{line_prev} = $self->{line};
6423     $self->{column_prev} = $self->{column};
6424     $self->{column}++;
6425     $self->{nc}
6426     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6427     } else {
6428     $self->{set_nc}->($self);
6429     }
6430    
6431     redo A;
6432     } elsif ($self->{nc} == 0x003E) { # >
6433     ## XML5: Same as "anything else".
6434     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6435     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6436    
6437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6438     $self->{line_prev} = $self->{line};
6439     $self->{column_prev} = $self->{column};
6440     $self->{column}++;
6441     $self->{nc}
6442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6443     } else {
6444     $self->{set_nc}->($self);
6445     }
6446    
6447     return ($self->{ct}); # ATTLIST
6448     redo A;
6449     } elsif ($self->{nc} == 0x0028) { # (
6450     ## XML5: Same as "anything else".
6451     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6452     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6453    
6454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6455     $self->{line_prev} = $self->{line};
6456     $self->{column_prev} = $self->{column};
6457     $self->{column}++;
6458     $self->{nc}
6459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6460     } else {
6461     $self->{set_nc}->($self);
6462     }
6463    
6464     redo A;
6465     } elsif ($self->{nc} == -1) {
6466     ## XML5: No parse error.
6467     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6468     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6469    
6470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6471     $self->{line_prev} = $self->{line};
6472     $self->{column_prev} = $self->{column};
6473     $self->{column}++;
6474     $self->{nc}
6475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6476     } else {
6477     $self->{set_nc}->($self);
6478     }
6479    
6480     return ($self->{ct}); # ATTLIST
6481     redo A;
6482     } else {
6483     ## XML5: Not defined yet.
6484     $self->{ca}->{name} .= chr $self->{nc};
6485     ## Stay in the state.
6486    
6487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6488     $self->{line_prev} = $self->{line};
6489     $self->{column_prev} = $self->{column};
6490     $self->{column}++;
6491     $self->{nc}
6492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6493     } else {
6494     $self->{set_nc}->($self);
6495     }
6496    
6497 wakaba 1.14 redo A;
6498     }
6499 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6500     if ($is_space->{$self->{nc}}) {
6501     ## Stay in the state.
6502    
6503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6504     $self->{line_prev} = $self->{line};
6505     $self->{column_prev} = $self->{column};
6506     $self->{column}++;
6507     $self->{nc}
6508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6509     } else {
6510     $self->{set_nc}->($self);
6511     }
6512    
6513     redo A;
6514     } elsif ($self->{nc} == 0x003E) { # >
6515     ## XML5: Same as "anything else".
6516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6517     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6518    
6519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6520     $self->{line_prev} = $self->{line};
6521     $self->{column_prev} = $self->{column};
6522     $self->{column}++;
6523     $self->{nc}
6524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6525     } else {
6526     $self->{set_nc}->($self);
6527     }
6528    
6529     return ($self->{ct}); # ATTLIST
6530     redo A;
6531     } elsif ($self->{nc} == 0x0028) { # (
6532     ## XML5: Same as "anything else".
6533     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6534    
6535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536     $self->{line_prev} = $self->{line};
6537     $self->{column_prev} = $self->{column};
6538     $self->{column}++;
6539     $self->{nc}
6540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541     } else {
6542     $self->{set_nc}->($self);
6543     }
6544    
6545     redo A;
6546     } elsif ($self->{nc} == -1) {
6547     ## XML5: No parse error.
6548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6550    
6551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6552     $self->{line_prev} = $self->{line};
6553     $self->{column_prev} = $self->{column};
6554     $self->{column}++;
6555     $self->{nc}
6556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6557     } else {
6558     $self->{set_nc}->($self);
6559     }
6560    
6561     return ($self->{ct});
6562     redo A;
6563     } else {
6564     ## XML5: Not defined yet.
6565     $self->{ca}->{type} = chr $self->{nc};
6566     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6567    
6568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6569     $self->{line_prev} = $self->{line};
6570     $self->{column_prev} = $self->{column};
6571     $self->{column}++;
6572     $self->{nc}
6573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6574     } else {
6575     $self->{set_nc}->($self);
6576     }
6577    
6578     redo A;
6579     }
6580     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6581     if ($is_space->{$self->{nc}}) {
6582     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6583    
6584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6585     $self->{line_prev} = $self->{line};
6586     $self->{column_prev} = $self->{column};
6587     $self->{column}++;
6588     $self->{nc}
6589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6590     } else {
6591     $self->{set_nc}->($self);
6592     }
6593    
6594     redo A;
6595     } elsif ($self->{nc} == 0x0023) { # #
6596     ## XML5: Same as "anything else".
6597     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6598     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6599    
6600     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6601     $self->{line_prev} = $self->{line};
6602     $self->{column_prev} = $self->{column};
6603     $self->{column}++;
6604     $self->{nc}
6605     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6606     } else {
6607     $self->{set_nc}->($self);
6608     }
6609    
6610     redo A;
6611     } elsif ($self->{nc} == 0x0022) { # "
6612     ## XML5: Same as "anything else".
6613     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6614     $self->{ca}->{value} = '';
6615     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6616    
6617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6618     $self->{line_prev} = $self->{line};
6619     $self->{column_prev} = $self->{column};
6620     $self->{column}++;
6621     $self->{nc}
6622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6623     } else {
6624     $self->{set_nc}->($self);
6625     }
6626    
6627     redo A;
6628     } elsif ($self->{nc} == 0x0027) { # '
6629     ## XML5: Same as "anything else".
6630     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6631     $self->{ca}->{value} = '';
6632     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6633    
6634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6635     $self->{line_prev} = $self->{line};
6636     $self->{column_prev} = $self->{column};
6637     $self->{column}++;
6638     $self->{nc}
6639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6640     } else {
6641     $self->{set_nc}->($self);
6642     }
6643    
6644     redo A;
6645     } elsif ($self->{nc} == 0x003E) { # >
6646     ## XML5: Same as "anything else".
6647     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6648     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6649    
6650     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6651     $self->{line_prev} = $self->{line};
6652     $self->{column_prev} = $self->{column};
6653     $self->{column}++;
6654     $self->{nc}
6655     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6656     } else {
6657     $self->{set_nc}->($self);
6658     }
6659    
6660     return ($self->{ct}); # ATTLIST
6661     redo A;
6662     } elsif ($self->{nc} == 0x0028) { # (
6663     ## XML5: Same as "anything else".
6664     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6665     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6666    
6667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6668     $self->{line_prev} = $self->{line};
6669     $self->{column_prev} = $self->{column};
6670     $self->{column}++;
6671     $self->{nc}
6672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6673     } else {
6674     $self->{set_nc}->($self);
6675     }
6676    
6677     redo A;
6678     } elsif ($self->{nc} == -1) {
6679     ## XML5: No parse error.
6680     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6681     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6682    
6683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6684     $self->{line_prev} = $self->{line};
6685     $self->{column_prev} = $self->{column};
6686     $self->{column}++;
6687     $self->{nc}
6688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6689     } else {
6690     $self->{set_nc}->($self);
6691     }
6692    
6693     return ($self->{ct});
6694     redo A;
6695     } else {
6696     ## XML5: Not defined yet.
6697     $self->{ca}->{type} .= chr $self->{nc};
6698     ## Stay in the state.
6699    
6700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6701     $self->{line_prev} = $self->{line};
6702     $self->{column_prev} = $self->{column};
6703     $self->{column}++;
6704     $self->{nc}
6705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6706     } else {
6707     $self->{set_nc}->($self);
6708     }
6709    
6710     redo A;
6711     }
6712     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6713     if ($is_space->{$self->{nc}}) {
6714     ## Stay in the state.
6715    
6716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6717     $self->{line_prev} = $self->{line};
6718     $self->{column_prev} = $self->{column};
6719     $self->{column}++;
6720     $self->{nc}
6721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6722     } else {
6723     $self->{set_nc}->($self);
6724     }
6725    
6726     redo A;
6727     } elsif ($self->{nc} == 0x0028) { # (
6728     ## XML5: Same as "anything else".
6729     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6730    
6731     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732     $self->{line_prev} = $self->{line};
6733     $self->{column_prev} = $self->{column};
6734     $self->{column}++;
6735     $self->{nc}
6736     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737     } else {
6738     $self->{set_nc}->($self);
6739     }
6740    
6741     redo A;
6742     } elsif ($self->{nc} == 0x0023) { # #
6743     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6744    
6745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6746     $self->{line_prev} = $self->{line};
6747     $self->{column_prev} = $self->{column};
6748     $self->{column}++;
6749     $self->{nc}
6750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6751     } else {
6752     $self->{set_nc}->($self);
6753     }
6754    
6755     redo A;
6756     } elsif ($self->{nc} == 0x0022) { # "
6757     ## XML5: Same as "anything else".
6758     $self->{ca}->{value} = '';
6759     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6760    
6761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6762     $self->{line_prev} = $self->{line};
6763     $self->{column_prev} = $self->{column};
6764     $self->{column}++;
6765     $self->{nc}
6766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6767     } else {
6768     $self->{set_nc}->($self);
6769     }
6770    
6771     redo A;
6772     } elsif ($self->{nc} == 0x0027) { # '
6773     ## XML5: Same as "anything else".
6774     $self->{ca}->{value} = '';
6775     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6776    
6777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6778     $self->{line_prev} = $self->{line};
6779     $self->{column_prev} = $self->{column};
6780     $self->{column}++;
6781     $self->{nc}
6782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6783     } else {
6784     $self->{set_nc}->($self);
6785     }
6786    
6787     redo A;
6788     } elsif ($self->{nc} == 0x003E) { # >
6789     ## XML5: Same as "anything else".
6790     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6791     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6792    
6793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794     $self->{line_prev} = $self->{line};
6795     $self->{column_prev} = $self->{column};
6796     $self->{column}++;
6797     $self->{nc}
6798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799     } else {
6800     $self->{set_nc}->($self);
6801     }
6802    
6803     return ($self->{ct}); # ATTLIST
6804     redo A;
6805     } elsif ($self->{nc} == -1) {
6806     ## XML5: No parse error.
6807     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6808     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6809    
6810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6811     $self->{line_prev} = $self->{line};
6812     $self->{column_prev} = $self->{column};
6813     $self->{column}++;
6814     $self->{nc}
6815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6816     } else {
6817     $self->{set_nc}->($self);
6818     }
6819    
6820     return ($self->{ct});
6821     redo A;
6822     } else {
6823     ## XML5: Switch to the "DOCTYPE bogus comment state".
6824     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6825     $self->{ca}->{value} = '';
6826     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6827     ## Reconsume.
6828     redo A;
6829     }
6830     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6831     if ($is_space->{$self->{nc}}) {
6832     ## Stay in the state.
6833    
6834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6835     $self->{line_prev} = $self->{line};
6836     $self->{column_prev} = $self->{column};
6837     $self->{column}++;
6838     $self->{nc}
6839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6840     } else {
6841     $self->{set_nc}->($self);
6842     }
6843    
6844     redo A;
6845     } elsif ($self->{nc} == 0x007C) { # |
6846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6847     ## Stay in the state.
6848    
6849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6850     $self->{line_prev} = $self->{line};
6851     $self->{column_prev} = $self->{column};
6852     $self->{column}++;
6853     $self->{nc}
6854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6855     } else {
6856     $self->{set_nc}->($self);
6857     }
6858    
6859     redo A;
6860     } elsif ($self->{nc} == 0x0029) { # )
6861     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6862     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6863    
6864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6865     $self->{line_prev} = $self->{line};
6866     $self->{column_prev} = $self->{column};
6867     $self->{column}++;
6868     $self->{nc}
6869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6870     } else {
6871     $self->{set_nc}->($self);
6872     }
6873    
6874     redo A;
6875     } elsif ($self->{nc} == 0x003E) { # >
6876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6877     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6878    
6879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6880     $self->{line_prev} = $self->{line};
6881     $self->{column_prev} = $self->{column};
6882     $self->{column}++;
6883     $self->{nc}
6884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6885     } else {
6886     $self->{set_nc}->($self);
6887     }
6888    
6889     return ($self->{ct}); # ATTLIST
6890     redo A;
6891     } elsif ($self->{nc} == -1) {
6892     ## XML5: No parse error.
6893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6894     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6895    
6896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6897     $self->{line_prev} = $self->{line};
6898     $self->{column_prev} = $self->{column};
6899     $self->{column}++;
6900     $self->{nc}
6901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6902     } else {
6903     $self->{set_nc}->($self);
6904     }
6905    
6906     return ($self->{ct});
6907     redo A;
6908     } else {
6909     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6910     $self->{state} = ALLOWED_TOKEN_STATE;
6911    
6912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6913     $self->{line_prev} = $self->{line};
6914     $self->{column_prev} = $self->{column};
6915     $self->{column}++;
6916     $self->{nc}
6917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6918     } else {
6919     $self->{set_nc}->($self);
6920     }
6921    
6922     redo A;
6923     }
6924     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6925     if ($is_space->{$self->{nc}}) {
6926     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6927    
6928     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6929     $self->{line_prev} = $self->{line};
6930     $self->{column_prev} = $self->{column};
6931     $self->{column}++;
6932     $self->{nc}
6933     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6934     } else {
6935     $self->{set_nc}->($self);
6936     }
6937    
6938     redo A;
6939     } elsif ($self->{nc} == 0x007C) { # |
6940     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6941    
6942     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6943     $self->{line_prev} = $self->{line};
6944     $self->{column_prev} = $self->{column};
6945     $self->{column}++;
6946     $self->{nc}
6947     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6948     } else {
6949     $self->{set_nc}->($self);
6950     }
6951    
6952     redo A;
6953     } elsif ($self->{nc} == 0x0029) { # )
6954     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6955    
6956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6957     $self->{line_prev} = $self->{line};
6958     $self->{column_prev} = $self->{column};
6959     $self->{column}++;
6960     $self->{nc}
6961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6962     } else {
6963     $self->{set_nc}->($self);
6964     }
6965    
6966     redo A;
6967     } elsif ($self->{nc} == 0x003E) { # >
6968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6970    
6971     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6972     $self->{line_prev} = $self->{line};
6973     $self->{column_prev} = $self->{column};
6974     $self->{column}++;
6975     $self->{nc}
6976     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6977     } else {
6978     $self->{set_nc}->($self);
6979     }
6980    
6981     return ($self->{ct}); # ATTLIST
6982     redo A;
6983     } elsif ($self->{nc} == -1) {
6984     ## XML5: No parse error.
6985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6986     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6987    
6988     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6989     $self->{line_prev} = $self->{line};
6990     $self->{column_prev} = $self->{column};
6991     $self->{column}++;
6992     $self->{nc}
6993     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6994     } else {
6995     $self->{set_nc}->($self);
6996     }
6997    
6998     return ($self->{ct});
6999     redo A;
7000     } else {
7001     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7002     ## Stay in the state.
7003    
7004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005     $self->{line_prev} = $self->{line};
7006     $self->{column_prev} = $self->{column};
7007     $self->{column}++;
7008     $self->{nc}
7009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010     } else {
7011     $self->{set_nc}->($self);
7012     }
7013    
7014     redo A;
7015     }
7016     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7017     if ($is_space->{$self->{nc}}) {
7018     ## Stay in the state.
7019    
7020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7021     $self->{line_prev} = $self->{line};
7022     $self->{column_prev} = $self->{column};
7023     $self->{column}++;
7024     $self->{nc}
7025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7026     } else {
7027     $self->{set_nc}->($self);
7028     }
7029    
7030     redo A;
7031     } elsif ($self->{nc} == 0x007C) { # |
7032     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7033    
7034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7035     $self->{line_prev} = $self->{line};
7036     $self->{column_prev} = $self->{column};
7037     $self->{column}++;
7038     $self->{nc}
7039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7040     } else {
7041     $self->{set_nc}->($self);
7042     }
7043    
7044     redo A;
7045     } elsif ($self->{nc} == 0x0029) { # )
7046     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7047    
7048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7049     $self->{line_prev} = $self->{line};
7050     $self->{column_prev} = $self->{column};
7051     $self->{column}++;
7052     $self->{nc}
7053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7054     } else {
7055     $self->{set_nc}->($self);
7056     }
7057    
7058     redo A;
7059     } elsif ($self->{nc} == 0x003E) { # >
7060     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7061     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7062    
7063     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7064     $self->{line_prev} = $self->{line};
7065     $self->{column_prev} = $self->{column};
7066     $self->{column}++;
7067     $self->{nc}
7068     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7069     } else {
7070     $self->{set_nc}->($self);
7071     }
7072    
7073     return ($self->{ct}); # ATTLIST
7074     redo A;
7075     } elsif ($self->{nc} == -1) {
7076     ## XML5: No parse error.
7077     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7078     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7079    
7080     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7081     $self->{line_prev} = $self->{line};
7082     $self->{column_prev} = $self->{column};
7083     $self->{column}++;
7084     $self->{nc}
7085     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7086     } else {
7087     $self->{set_nc}->($self);
7088     }
7089    
7090     return ($self->{ct});
7091     redo A;
7092     } else {
7093     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7094     line => $self->{line_prev},
7095     column => $self->{column_prev});
7096     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7097     $self->{state} = ALLOWED_TOKEN_STATE;
7098    
7099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7100     $self->{line_prev} = $self->{line};
7101     $self->{column_prev} = $self->{column};
7102     $self->{column}++;
7103     $self->{nc}
7104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7105     } else {
7106     $self->{set_nc}->($self);
7107     }
7108    
7109     redo A;
7110     }
7111     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7112     if ($is_space->{$self->{nc}}) {
7113     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7114    
7115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7116     $self->{line_prev} = $self->{line};
7117     $self->{column_prev} = $self->{column};
7118     $self->{column}++;
7119     $self->{nc}
7120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7121     } else {
7122     $self->{set_nc}->($self);
7123     }
7124    
7125     redo A;
7126     } elsif ($self->{nc} == 0x0023) { # #
7127     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7128     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7129    
7130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7131     $self->{line_prev} = $self->{line};
7132     $self->{column_prev} = $self->{column};
7133     $self->{column}++;
7134     $self->{nc}
7135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7136     } else {
7137     $self->{set_nc}->($self);
7138     }
7139    
7140     redo A;
7141     } elsif ($self->{nc} == 0x0022) { # "
7142     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7143     $self->{ca}->{value} = '';
7144     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7145    
7146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7147     $self->{line_prev} = $self->{line};
7148     $self->{column_prev} = $self->{column};
7149     $self->{column}++;
7150     $self->{nc}
7151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7152     } else {
7153     $self->{set_nc}->($self);
7154     }
7155    
7156     redo A;
7157     } elsif ($self->{nc} == 0x0027) { # '
7158     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7159     $self->{ca}->{value} = '';
7160     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7161    
7162     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7163     $self->{line_prev} = $self->{line};
7164     $self->{column_prev} = $self->{column};
7165     $self->{column}++;
7166     $self->{nc}
7167     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7168     } else {
7169     $self->{set_nc}->($self);
7170     }
7171    
7172     redo A;
7173     } elsif ($self->{nc} == 0x003E) { # >
7174     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7175     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7176    
7177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7178     $self->{line_prev} = $self->{line};
7179     $self->{column_prev} = $self->{column};
7180     $self->{column}++;
7181     $self->{nc}
7182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7183     } else {
7184     $self->{set_nc}->($self);
7185     }
7186    
7187     return ($self->{ct}); # ATTLIST
7188     redo A;
7189     } elsif ($self->{nc} == -1) {
7190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7192    
7193     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7194     $self->{line_prev} = $self->{line};
7195     $self->{column_prev} = $self->{column};
7196     $self->{column}++;
7197     $self->{nc}
7198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7199     } else {
7200     $self->{set_nc}->($self);
7201     }
7202    
7203     return ($self->{ct});
7204     redo A;
7205     } else {
7206     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7207     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7208     ## Reconsume.
7209     redo A;
7210     }
7211     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
         ## BEFORE_ATTR_DEFAULT_STATE: dispatch on the current input character
         ## (nc) to decide how the default part of an attribute definition
         ## begins.
         ##   - character listed in $is_space: skip it, stay in this state.
         ##   - number sign (U+0023): go to
         ##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
         ##   - double or single quote: reset the value of the attribute
         ##     definition (ca) to the empty string and go to the matching
         ##     quoted attribute value state.
         ##   - greater-than sign: report a parse error (no attr default),
         ##     go to DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md) and return ct the same way.
         ##   - anything else: report a parse error (unquoted attr value) and
         ##     reprocess the same character in ATTRIBUTE_VALUE_UNQUOTED_STATE.
         ## In the two returning branches ca is not pushed onto the attrdefs
         ## list of ct.  The redo A written after each return is never reached.
         ## Every consuming branch reads the next character from char_buffer
         ## while unread characters remain there and otherwise calls the
         ## set_nc callback.
7212     if ($is_space->{$self->{nc}}) {
7213     ## Stay in the state.
7214    
7215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7216     $self->{line_prev} = $self->{line};
7217     $self->{column_prev} = $self->{column};
7218     $self->{column}++;
7219     $self->{nc}
7220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7221     } else {
7222     $self->{set_nc}->($self);
7223     }
7224    
7225     redo A;
7226     } elsif ($self->{nc} == 0x0023) { # #
7227     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7228    
7229     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7230     $self->{line_prev} = $self->{line};
7231     $self->{column_prev} = $self->{column};
7232     $self->{column}++;
7233     $self->{nc}
7234     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7235     } else {
7236     $self->{set_nc}->($self);
7237     }
7238    
7239     redo A;
7240     } elsif ($self->{nc} == 0x0022) { # "
7241     $self->{ca}->{value} = '';
7242     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7243    
7244     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7245     $self->{line_prev} = $self->{line};
7246     $self->{column_prev} = $self->{column};
7247     $self->{column}++;
7248     $self->{nc}
7249     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7250     } else {
7251     $self->{set_nc}->($self);
7252     }
7253    
7254     redo A;
7255     } elsif ($self->{nc} == 0x0027) { # '
7256     $self->{ca}->{value} = '';
7257     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7258    
7259     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7260     $self->{line_prev} = $self->{line};
7261     $self->{column_prev} = $self->{column};
7262     $self->{column}++;
7263     $self->{nc}
7264     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7265     } else {
7266     $self->{set_nc}->($self);
7267     }
7268    
7269     redo A;
7270     } elsif ($self->{nc} == 0x003E) { # >
7271     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7272     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7273    
7274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7275     $self->{line_prev} = $self->{line};
7276     $self->{column_prev} = $self->{column};
7277     $self->{column}++;
7278     $self->{nc}
7279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7280     } else {
7281     $self->{set_nc}->($self);
7282     }
7283    
7284     return ($self->{ct}); # ATTLIST
7285     redo A;
7286     } elsif ($self->{nc} == -1) {
7287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7288     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7289    
7290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7291     $self->{line_prev} = $self->{line};
7292     $self->{column_prev} = $self->{column};
7293     $self->{column}++;
7294     $self->{nc}
7295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7296     } else {
7297     $self->{set_nc}->($self);
7298     }
7299    
7300     return ($self->{ct});
7301     redo A;
7302     } else {
7303     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7304     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7305     ## Reconsume.
7306     redo A;
7307     }
7308     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
         ## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE: expects the
         ## first character of the default keyword of the attribute
         ## definition (ca).
         ##   - character listed in $is_space: report a parse error
         ##     (no default type) and reprocess it in BOGUS_MD_STATE.
         ##   - double or single quote: reset the value of ca to the empty
         ##     string and go to the matching quoted attribute value state.
         ##   - greater-than sign: report a parse error (no attr default),
         ##     go to DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md) and return ct the same way.
         ##   - anything else: start the default field of ca with this
         ##     character and go to DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE.
         ## ca is not pushed onto the attrdefs list of ct in the returning
         ## branches.  The redo A after each return is never reached.
7309     if ($is_space->{$self->{nc}}) {
7310     ## XML5: No parse error.
7311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7312 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7313 wakaba 1.15 ## Reconsume.
7314     redo A;
7315     } elsif ($self->{nc} == 0x0022) { # "
7316     ## XML5: Same as "anything else".
7317     $self->{ca}->{value} = '';
7318     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7319    
7320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7321     $self->{line_prev} = $self->{line};
7322     $self->{column_prev} = $self->{column};
7323     $self->{column}++;
7324     $self->{nc}
7325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7326     } else {
7327     $self->{set_nc}->($self);
7328     }
7329    
7330     redo A;
7331     } elsif ($self->{nc} == 0x0027) { # '
7332     ## XML5: Same as "anything else".
7333     $self->{ca}->{value} = '';
7334     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7335    
7336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7337     $self->{line_prev} = $self->{line};
7338     $self->{column_prev} = $self->{column};
7339     $self->{column}++;
7340     $self->{nc}
7341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7342     } else {
7343     $self->{set_nc}->($self);
7344     }
7345    
7346     redo A;
7347     } elsif ($self->{nc} == 0x003E) { # >
7348     ## XML5: Same as "anything else".
7349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7350     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7351    
7352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7353     $self->{line_prev} = $self->{line};
7354     $self->{column_prev} = $self->{column};
7355     $self->{column}++;
7356     $self->{nc}
7357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7358     } else {
7359     $self->{set_nc}->($self);
7360     }
7361    
7362     return ($self->{ct}); # ATTLIST
7363     redo A;
7364     } elsif ($self->{nc} == -1) {
7365     ## XML5: No parse error.
7366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7367     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7368    
7369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7370     $self->{line_prev} = $self->{line};
7371     $self->{column_prev} = $self->{column};
7372     $self->{column}++;
7373     $self->{nc}
7374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7375     } else {
7376     $self->{set_nc}->($self);
7377     }
7378    
7379     return ($self->{ct});
7380     redo A;
7381     } else {
7382     $self->{ca}->{default} = chr $self->{nc};
7383     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7384    
7385     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7386     $self->{line_prev} = $self->{line};
7387     $self->{column_prev} = $self->{column};
7388     $self->{column}++;
7389     $self->{nc}
7390     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7391     } else {
7392     $self->{set_nc}->($self);
7393     }
7394    
7395     redo A;
7396     }
7397     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
         ## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE: accumulates the
         ## default keyword of the attribute definition (ca) one character
         ## at a time in its default field.
         ##   - character listed in $is_space: the keyword is finished; go to
         ##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE.
         ##   - double or single quote: report a parse error
         ##     (no space before default value), reset the value of ca to the
         ##     empty string and go to the matching quoted value state.
         ##   - greater-than sign: push ca onto the attrdefs list of ct, go
         ##     to DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md), then push ca and return
         ##     ct the same way.
         ##   - anything else: append the character to the default field.
         ## The redo A after each return is never reached.
7398     if ($is_space->{$self->{nc}}) {
7399     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7400    
7401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402     $self->{line_prev} = $self->{line};
7403     $self->{column_prev} = $self->{column};
7404     $self->{column}++;
7405     $self->{nc}
7406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407     } else {
7408     $self->{set_nc}->($self);
7409     }
7410    
7411     redo A;
7412     } elsif ($self->{nc} == 0x0022) { # "
7413     ## XML5: Same as "anything else".
7414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7415     $self->{ca}->{value} = '';
7416     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7417    
7418     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7419     $self->{line_prev} = $self->{line};
7420     $self->{column_prev} = $self->{column};
7421     $self->{column}++;
7422     $self->{nc}
7423     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7424     } else {
7425     $self->{set_nc}->($self);
7426     }
7427    
7428     redo A;
7429     } elsif ($self->{nc} == 0x0027) { # '
7430     ## XML5: Same as "anything else".
7431     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7432     $self->{ca}->{value} = '';
7433     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7434    
7435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7436     $self->{line_prev} = $self->{line};
7437     $self->{column_prev} = $self->{column};
7438     $self->{column}++;
7439     $self->{nc}
7440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7441     } else {
7442     $self->{set_nc}->($self);
7443     }
7444    
7445     redo A;
7446     } elsif ($self->{nc} == 0x003E) { # >
7447     ## XML5: Same as "anything else".
7448     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7449     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7450    
7451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7452     $self->{line_prev} = $self->{line};
7453     $self->{column_prev} = $self->{column};
7454     $self->{column}++;
7455     $self->{nc}
7456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7457     } else {
7458     $self->{set_nc}->($self);
7459     }
7460    
7461     return ($self->{ct}); # ATTLIST
7462     redo A;
7463     } elsif ($self->{nc} == -1) {
7464     ## XML5: No parse error.
7465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7466     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7467     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7468    
7469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7470     $self->{line_prev} = $self->{line};
7471     $self->{column_prev} = $self->{column};
7472     $self->{column}++;
7473     $self->{nc}
7474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7475     } else {
7476     $self->{set_nc}->($self);
7477     }
7478    
7479     return ($self->{ct});
7480     redo A;
7481     } else {
7482     $self->{ca}->{default} .= chr $self->{nc};
7483     ## Stay in the state.
7484    
7485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7486     $self->{line_prev} = $self->{line};
7487     $self->{column_prev} = $self->{column};
7488     $self->{column}++;
7489     $self->{nc}
7490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7491     } else {
7492     $self->{set_nc}->($self);
7493     }
7494    
7495     redo A;
7496     }
7497     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
         ## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE: the default
         ## keyword of the attribute definition (ca) has been read and was
         ## followed by at least one space.
         ##   - character listed in $is_space: skip it, stay in this state.
         ##   - double or single quote: reset the value of ca to the empty
         ##     string and go to the matching quoted value state (no parse
         ##     error in this state).
         ##   - greater-than sign: push ca onto the attrdefs list of ct, go
         ##     to DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md), then push ca and return
         ##     ct the same way.
         ##   - anything else: when the default field of ca is exactly the
         ##     uppercase string FIXED (eq is case-sensitive), reprocess the
         ##     character in ATTRIBUTE_VALUE_UNQUOTED_STATE; otherwise push ca
         ##     and reprocess in DOCTYPE_ATTLIST_NAME_AFTER_STATE.
         ## NOTE(review): the FIXED path raises no parse error for the
         ## unquoted value and does not reset the value of ca here; confirm
         ## that ATTRIBUTE_VALUE_UNQUOTED_STATE covers both.
         ## The redo A after each return is never reached.
7498     if ($is_space->{$self->{nc}}) {
7499     ## Stay in the state.
7500    
7501     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7502     $self->{line_prev} = $self->{line};
7503     $self->{column_prev} = $self->{column};
7504     $self->{column}++;
7505     $self->{nc}
7506     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7507     } else {
7508     $self->{set_nc}->($self);
7509     }
7510    
7511     redo A;
7512     } elsif ($self->{nc} == 0x0022) { # "
7513     $self->{ca}->{value} = '';
7514     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7515    
7516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7517     $self->{line_prev} = $self->{line};
7518     $self->{column_prev} = $self->{column};
7519     $self->{column}++;
7520     $self->{nc}
7521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7522     } else {
7523     $self->{set_nc}->($self);
7524     }
7525    
7526     redo A;
7527     } elsif ($self->{nc} == 0x0027) { # '
7528     $self->{ca}->{value} = '';
7529     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7530    
7531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7532     $self->{line_prev} = $self->{line};
7533     $self->{column_prev} = $self->{column};
7534     $self->{column}++;
7535     $self->{nc}
7536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7537     } else {
7538     $self->{set_nc}->($self);
7539     }
7540    
7541     redo A;
7542     } elsif ($self->{nc} == 0x003E) { # >
7543     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7544     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545    
7546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547     $self->{line_prev} = $self->{line};
7548     $self->{column_prev} = $self->{column};
7549     $self->{column}++;
7550     $self->{nc}
7551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552     } else {
7553     $self->{set_nc}->($self);
7554     }
7555    
7556     return ($self->{ct}); # ATTLIST
7557     redo A;
7558     } elsif ($self->{nc} == -1) {
7559     ## XML5: No parse error.
7560     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7561     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7562     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7563    
7564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7565     $self->{line_prev} = $self->{line};
7566     $self->{column_prev} = $self->{column};
7567     $self->{column}++;
7568     $self->{nc}
7569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7570     } else {
7571     $self->{set_nc}->($self);
7572     }
7573    
7574     return ($self->{ct});
7575     redo A;
7576     } else {
7577     ## XML5: Not defined yet.
7578     if ($self->{ca}->{default} eq 'FIXED') {
7579     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7580     } else {
7581     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7582     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7583     }
7584     ## Reconsume.
7585     redo A;
7586     }
7587     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
         ## AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE: checks the character that
         ## follows a quoted attribute value.  Both branches hand the current
         ## character, unconsumed, to DOCTYPE_ATTLIST_NAME_AFTER_STATE; the
         ## only difference is that a character other than a space, a
         ## greater-than sign or -1 first triggers a parse error
         ## (no space before attr name).
7588     if ($is_space->{$self->{nc}} or
7589     $self->{nc} == -1 or
7590     $self->{nc} == 0x003E) { # >
7591     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7592     ## Reconsume.
7593     redo A;
7594     } else {
7595     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7596     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7597     ## Reconsume.
7598     redo A;
7599 wakaba 1.16 }
7600 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
         ## NDATA_STATE: matches the keyword NDATA one character per pass,
         ## using the length of the characters collected so far (kwd) as the
         ## index into two small tables (uppercase and lowercase D, A, T at
         ## indexes 1 to 3).  Index 0 is undef, so kwd presumably already
         ## holds the leading N when this state is entered; confirm at the
         ## place that enters this state.
         ##   - nc matches the table entry: append it to kwd, advance, stay.
         ##   - kwd has 4 characters and nc is A or a: the keyword is
         ##     complete.  Unless kwd is exactly NDAT and nc is uppercase A, a
         ##     parse error (lowercase keyword) is reported at line_prev and
         ##     column_prev - 4, presumably the start of the keyword.  Then
         ##     go to AFTER_NDATA_STATE and advance.
         ##   - anything else: report a parse error (string after literal)
         ##     positioned length-of-kwd minus one columns before column_prev
         ##     and reprocess the character in BOGUS_MD_STATE.
         ## NOTE(review): for a kwd length of 0, or of 4 and more, the table
         ## lookup yields undef, which compares numerically equal to 0, so
         ## an nc of 0 would pass the first test; confirm that nc can never
         ## be 0 in this state.
7601     ## ASCII case-insensitive
7602     if ($self->{nc} == [
7603     undef,
7604     0x0044, # D
7605     0x0041, # A
7606     0x0054, # T
7607     ]->[length $self->{kwd}] or
7608     $self->{nc} == [
7609     undef,
7610     0x0064, # d
7611     0x0061, # a
7612     0x0074, # t
7613     ]->[length $self->{kwd}]) {
7614    
7615     ## Stay in the state.
7616     $self->{kwd} .= chr $self->{nc};
7617    
7618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619     $self->{line_prev} = $self->{line};
7620     $self->{column_prev} = $self->{column};
7621     $self->{column}++;
7622     $self->{nc}
7623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624     } else {
7625     $self->{set_nc}->($self);
7626     }
7627    
7628     redo A;
7629     } elsif ((length $self->{kwd}) == 4 and
7630     ($self->{nc} == 0x0041 or # A
7631     $self->{nc} == 0x0061)) { # a
7632     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7633    
7634     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7635     text => 'NDATA',
7636     line => $self->{line_prev},
7637     column => $self->{column_prev} - 4);
7638     } else {
7639    
7640     }
7641     $self->{state} = AFTER_NDATA_STATE;
7642    
7643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7644     $self->{line_prev} = $self->{line};
7645     $self->{column_prev} = $self->{column};
7646     $self->{column}++;
7647     $self->{nc}
7648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7649     } else {
7650     $self->{set_nc}->($self);
7651     }
7652    
7653     redo A;
7654     } else {
7655     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7656     line => $self->{line_prev},
7657     column => $self->{column_prev} + 1
7658     - length $self->{kwd});
7659    
7660     $self->{state} = BOGUS_MD_STATE;
7661     ## Reconsume.
7662     redo A;
7663     }
7664     } elsif ($self->{state} == AFTER_NDATA_STATE) {
         ## AFTER_NDATA_STATE: the NDATA keyword has just been matched.
         ##   - character listed in $is_space: advance and go to
         ##     BEFORE_NOTATION_NAME_STATE.
         ##   - greater-than sign: report a parse error (no notation name),
         ##     go to DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md) and return ct the same way.
         ##   - anything else: report a parse error (string after literal)
         ##     positioned length-of-kwd minus one columns before column_prev
         ##     and reprocess the character in BOGUS_MD_STATE.
         ## The redo A after each return is never reached.
7665     if ($is_space->{$self->{nc}}) {
7666     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7667    
7668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7669     $self->{line_prev} = $self->{line};
7670     $self->{column_prev} = $self->{column};
7671     $self->{column}++;
7672     $self->{nc}
7673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7674     } else {
7675     $self->{set_nc}->($self);
7676     }
7677    
7678     redo A;
7679     } elsif ($self->{nc} == 0x003E) { # >
7680     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7681     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7682    
7683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7684     $self->{line_prev} = $self->{line};
7685     $self->{column_prev} = $self->{column};
7686     $self->{column}++;
7687     $self->{nc}
7688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7689     } else {
7690     $self->{set_nc}->($self);
7691     }
7692    
7693     return ($self->{ct}); # ENTITY
7694     redo A;
7695     } elsif ($self->{nc} == -1) {
7696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7697     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7698    
7699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7700     $self->{line_prev} = $self->{line};
7701     $self->{column_prev} = $self->{column};
7702     $self->{column}++;
7703     $self->{nc}
7704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7705     } else {
7706     $self->{set_nc}->($self);
7707     }
7708    
7709     return ($self->{ct}); # ENTITY
7710     redo A;
7711     } else {
7712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7713     line => $self->{line_prev},
7714     column => $self->{column_prev} + 1
7715     - length $self->{kwd});
7716     $self->{state} = BOGUS_MD_STATE;
7717     ## Reconsume.
7718     redo A;
7719     }
7720     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
         ## BEFORE_NOTATION_NAME_STATE: skips spaces until the notation name
         ## begins.
         ##   - character listed in $is_space: skip it, stay in this state.
         ##   - greater-than sign: report a parse error (no notation name),
         ##     go to DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md) and return ct the same way.
         ##   - anything else: start the notation field of ct with this
         ##     character, advance, and go to NOTATION_NAME_STATE.
         ## The redo A after each return is never reached.
7721     if ($is_space->{$self->{nc}}) {
7722     ## Stay in the state.
7723    
7724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725     $self->{line_prev} = $self->{line};
7726     $self->{column_prev} = $self->{column};
7727     $self->{column}++;
7728     $self->{nc}
7729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730     } else {
7731     $self->{set_nc}->($self);
7732     }
7733    
7734     redo A;
7735     } elsif ($self->{nc} == 0x003E) { # >
7736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7737     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7738    
7739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740     $self->{line_prev} = $self->{line};
7741     $self->{column_prev} = $self->{column};
7742     $self->{column}++;
7743     $self->{nc}
7744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745     } else {
7746     $self->{set_nc}->($self);
7747     }
7748    
7749     return ($self->{ct}); # ENTITY
7750     redo A;
7751     } elsif ($self->{nc} == -1) {
7752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7753     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7754    
7755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7756     $self->{line_prev} = $self->{line};
7757     $self->{column_prev} = $self->{column};
7758     $self->{column}++;
7759     $self->{nc}
7760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7761     } else {
7762     $self->{set_nc}->($self);
7763     }
7764    
7765     return ($self->{ct}); # ENTITY
7766     redo A;
7767     } else {
7768     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7769     $self->{state} = NOTATION_NAME_STATE;
7770    
7771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772     $self->{line_prev} = $self->{line};
7773     $self->{column_prev} = $self->{column};
7774     $self->{column}++;
7775     $self->{nc}
7776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777     } else {
7778     $self->{set_nc}->($self);
7779     }
7780    
7781     redo A;
7782     }
7783     } elsif ($self->{state} == NOTATION_NAME_STATE) {
         ## NOTATION_NAME_STATE: accumulates the notation name in the
         ## notation field of ct.
         ##   - character listed in $is_space: the name is finished; advance
         ##     and go to AFTER_MD_DEF_STATE.
         ##   - greater-than sign: go to DOCTYPE_INTERNAL_SUBSET_STATE,
         ##     advance, and return ct (no parse error).
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md) and return ct the same way.
         ##   - anything else: append the character to the notation field.
         ## The redo A after each return is never reached.
7784     if ($is_space->{$self->{nc}}) {
7785 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7786 wakaba 1.18 
7787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788     $self->{line_prev} = $self->{line};
7789     $self->{column_prev} = $self->{column};
7790     $self->{column}++;
7791     $self->{nc}
7792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793     } else {
7794     $self->{set_nc}->($self);
7795     }
7796    
7797     redo A;
7798     } elsif ($self->{nc} == 0x003E) { # >
7799     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7800    
7801     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7802     $self->{line_prev} = $self->{line};
7803     $self->{column_prev} = $self->{column};
7804     $self->{column}++;
7805     $self->{nc}
7806     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7807     } else {
7808     $self->{set_nc}->($self);
7809     }
7810    
7811     return ($self->{ct}); # ENTITY
7812     redo A;
7813     } elsif ($self->{nc} == -1) {
7814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7815     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7816    
7817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7818     $self->{line_prev} = $self->{line};
7819     $self->{column_prev} = $self->{column};
7820     $self->{column}++;
7821     $self->{nc}
7822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7823     } else {
7824     $self->{set_nc}->($self);
7825     }
7826    
7827     return ($self->{ct}); # ENTITY
7828     redo A;
7829     } else {
7830     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7831     ## Stay in the state.
7832    
7833     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7834     $self->{line_prev} = $self->{line};
7835     $self->{column_prev} = $self->{column};
7836     $self->{column}++;
7837     $self->{nc}
7838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7839     } else {
7840     $self->{set_nc}->($self);
7841     }
7842    
7843     redo A;
7844     }
7845 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
         ## DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE: collects the body of a
         ## double-quoted entity value into the value field of ct.
         ##   - double quote: the literal is closed; advance and go to
         ##     AFTER_MD_DEF_STATE.
         ##   - ampersand: remember this state in prev_state, record the
         ##     double quote code point in entity_add, advance, and go to
         ##     ENTITY_VALUE_ENTITY_STATE.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed entity value), go to
         ##     DOCTYPE_INTERNAL_SUBSET_STATE and return ct without advancing.
         ##   - anything else: append the character to the value field.
         ## The percent sign gets no special treatment yet (see the TODO
         ## below).  The redo A after the return is never reached.
7846     if ($self->{nc} == 0x0022) { # "
7847 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7848 wakaba 1.19 
7849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7850     $self->{line_prev} = $self->{line};
7851     $self->{column_prev} = $self->{column};
7852     $self->{column}++;
7853     $self->{nc}
7854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7855     } else {
7856     $self->{set_nc}->($self);
7857     }
7858    
7859     redo A;
7860     } elsif ($self->{nc} == 0x0026) { # &
7861     $self->{prev_state} = $self->{state};
7862     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7863     $self->{entity_add} = 0x0022; # "
7864    
7865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7866     $self->{line_prev} = $self->{line};
7867     $self->{column_prev} = $self->{column};
7868     $self->{column}++;
7869     $self->{nc}
7870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7871     } else {
7872     $self->{set_nc}->($self);
7873     }
7874    
7875     redo A;
7876     ## TODO: %
7877     } elsif ($self->{nc} == -1) {
7878     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7879     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7880     ## Reconsume.
7881     return ($self->{ct}); # ENTITY
7882     redo A;
7883     } else {
7884     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7885    
7886     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7887     $self->{line_prev} = $self->{line};
7888     $self->{column_prev} = $self->{column};
7889     $self->{column}++;
7890     $self->{nc}
7891     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7892     } else {
7893     $self->{set_nc}->($self);
7894     }
7895    
7896     redo A;
7897     }
7898     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
         ## DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE: collects the body of a
         ## single-quoted entity value into the value field of ct.
         ##   - single quote: the literal is closed; advance and go to
         ##     AFTER_MD_DEF_STATE.
         ##   - ampersand: remember this state in prev_state, record the
         ##     single quote code point in entity_add, advance, and go to
         ##     ENTITY_VALUE_ENTITY_STATE.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed entity value), go to
         ##     DOCTYPE_INTERNAL_SUBSET_STATE and return ct without advancing.
         ##   - anything else: append the character to the value field.
         ## The percent sign gets no special treatment yet (see the TODO
         ## below).  The redo A after the return is never reached.
7899     if ($self->{nc} == 0x0027) { # '
7900 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7901 wakaba 1.19 
7902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7903     $self->{line_prev} = $self->{line};
7904     $self->{column_prev} = $self->{column};
7905     $self->{column}++;
7906     $self->{nc}
7907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7908     } else {
7909     $self->{set_nc}->($self);
7910     }
7911    
7912     redo A;
7913     } elsif ($self->{nc} == 0x0026) { # &
7914     $self->{prev_state} = $self->{state};
7915     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7916     $self->{entity_add} = 0x0027; # '
7917    
7918     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7919     $self->{line_prev} = $self->{line};
7920     $self->{column_prev} = $self->{column};
7921     $self->{column}++;
7922     $self->{nc}
7923     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7924     } else {
7925     $self->{set_nc}->($self);
7926     }
7927    
7928     redo A;
7929     ## TODO: %
7930     } elsif ($self->{nc} == -1) {
7931     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7932     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7933     ## Reconsume.
7934     return ($self->{ct}); # ENTITY
7935     redo A;
7936     } else {
7937     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7938    
7939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7940     $self->{line_prev} = $self->{line};
7941     $self->{column_prev} = $self->{column};
7942     $self->{column}++;
7943     $self->{nc}
7944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7945     } else {
7946     $self->{set_nc}->($self);
7947     }
7948    
7949     redo A;
7950     }
7951     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
         ## ENTITY_VALUE_ENTITY_STATE: handles the character that follows an
         ## ampersand inside a quoted entity value.
         ##   - space, less-than sign, ampersand, -1, or the code point
         ##     stored in entity_add: report a parse error (bare ero) at the
         ##     previous position (one column further when nc is -1), then
         ##     fall through.
         ##   - number sign: alias ca to ct, set kwd to the number sign,
         ##     advance, and go to ENTITY_HASH_STATE.
         ##   - anything else: fall through without an error.
         ## The fall-through path appends a literal ampersand to the value
         ## field of ct, restores the state saved in prev_state and
         ## reprocesses the current character there.
7952     if ($is_space->{$self->{nc}} or
7953     {
7954     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7955     $self->{entity_add} => 1,
7956     }->{$self->{nc}}) {
7957 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7958     line => $self->{line_prev},
7959     column => $self->{column_prev}
7960     + ($self->{nc} == -1 ? 1 : 0));
7961 wakaba 1.19 ## Don't consume
7962     ## Return nothing.
7963     #
7964     } elsif ($self->{nc} == 0x0023) { # #
7965     $self->{ca} = $self->{ct};
7966     $self->{state} = ENTITY_HASH_STATE;
7967     $self->{kwd} = '#';
7968    
7969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7970     $self->{line_prev} = $self->{line};
7971     $self->{column_prev} = $self->{column};
7972     $self->{column}++;
7973     $self->{nc}
7974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7975     } else {
7976     $self->{set_nc}->($self);
7977     }
7978    
7979     redo A;
7980     } else {
7981     #
7982     }
7983    
7984     $self->{ct}->{value} .= '&';
7985     $self->{state} = $self->{prev_state};
7986     ## Reconsume.
7987     redo A;
7988 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
         ## AFTER_ELEMENT_NAME_STATE: decides how the content part of the
         ## declaration held in ct begins.
         ##   - character listed in $is_space: advance and go to
         ##     BEFORE_ELEMENT_CONTENT_STATE.
         ##   - left parenthesis: start the content list of ct with that
         ##     parenthesis, set group_depth to 1, advance, and go to
         ##     AFTER_CM_GROUP_OPEN_STATE.
         ##   - greater-than sign: report a parse error (no md def), go to
         ##     DOCTYPE_INTERNAL_SUBSET_STATE, advance, return ct.
         ##   - nc == -1 (presumably end of input; confirm against set_nc):
         ##     report a parse error (unclosed md) and return ct the same way.
         ##   - anything else: start the content list of ct with this
         ##     character, advance, and go to CONTENT_KEYWORD_STATE.
         ## The redo A after each return is never reached.
7989     if ($is_space->{$self->{nc}}) {
7990     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7991    
7992     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7993     $self->{line_prev} = $self->{line};
7994     $self->{column_prev} = $self->{column};
7995     $self->{column}++;
7996     $self->{nc}
7997     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7998     } else {
7999     $self->{set_nc}->($self);
8000     }
8001    
8002     redo A;
8003     } elsif ($self->{nc} == 0x0028) { # (
8004     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8005     $self->{ct}->{content} = ['('];
8006     $self->{group_depth} = 1;
8007    
8008     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009     $self->{line_prev} = $self->{line};
8010     $self->{column_prev} = $self->{column};
8011     $self->{column}++;
8012     $self->{nc}
8013     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014     } else {
8015     $self->{set_nc}->($self);
8016     }
8017    
8018     redo A;
8019     } elsif ($self->{nc} == 0x003E) { # >
8020     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8021     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8022    
8023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8024     $self->{line_prev} = $self->{line};
8025     $self->{column_prev} = $self->{column};
8026     $self->{column}++;
8027     $self->{nc}
8028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8029     } else {
8030     $self->{set_nc}->($self);
8031     }
8032    
8033     return ($self->{ct}); # ELEMENT
8034     redo A;
8035     } elsif ($self->{nc} == -1) {
8036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8037     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8038    
8039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8040     $self->{line_prev} = $self->{line};
8041     $self->{column_prev} = $self->{column};
8042     $self->{column}++;
8043     $self->{nc}
8044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8045     } else {
8046     $self->{set_nc}->($self);
8047     }
8048    
8049     return ($self->{ct}); # ELEMENT
8050     redo A;
8051     } else {
8052     $self->{ct}->{content} = [chr $self->{nc}];
8053     $self->{state} = CONTENT_KEYWORD_STATE;
8054    
8055     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8056     $self->{line_prev} = $self->{line};
8057     $self->{column_prev} = $self->{column};
8058     $self->{column}++;
8059     $self->{nc}
8060     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8061     } else {
8062     $self->{set_nc}->($self);
8063     }
8064    
8065     redo A;
8066     }
8067     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8068     if ($is_space->{$self->{nc}}) {
8069     $self->{state} = AFTER_MD_DEF_STATE;
8070    
8071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8072     $self->{line_prev} = $self->{line};
8073     $self->{column_prev} = $self->{column};
8074     $self->{column}++;
8075     $self->{nc}
8076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8077     } else {
8078     $self->{set_nc}->($self);
8079     }
8080    
8081     redo A;
8082     } elsif ($self->{nc} == 0x003E) { # >
8083     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8084    
8085     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8086     $self->{line_prev} = $self->{line};
8087     $self->{column_prev} = $self->{column};
8088     $self->{column}++;
8089     $self->{nc}
8090     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8091     } else {
8092     $self->{set_nc}->($self);
8093     }
8094    
8095     return ($self->{ct}); # ELEMENT
8096     redo A;
8097     } elsif ($self->{nc} == -1) {
8098     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8099     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8100    
8101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8102     $self->{line_prev} = $self->{line};
8103     $self->{column_prev} = $self->{column};
8104     $self->{column}++;
8105     $self->{nc}
8106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8107     } else {
8108     $self->{set_nc}->($self);
8109     }
8110    
8111     return ($self->{ct}); # ELEMENT
8112     redo A;
8113     } else {
8114     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8115     ## Stay in the state.
8116    
8117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8118     $self->{line_prev} = $self->{line};
8119     $self->{column_prev} = $self->{column};
8120     $self->{column}++;
8121     $self->{nc}
8122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8123     } else {
8124     $self->{set_nc}->($self);
8125     }
8126    
8127     redo A;
8128     }
8129     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8130     if ($is_space->{$self->{nc}}) {
8131     ## Stay in the state.
8132    
8133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8134     $self->{line_prev} = $self->{line};
8135     $self->{column_prev} = $self->{column};
8136     $self->{column}++;
8137     $self->{nc}
8138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8139     } else {
8140     $self->{set_nc}->($self);
8141     }
8142    
8143     redo A;
8144     } elsif ($self->{nc} == 0x0028) { # (
8145     $self->{group_depth}++;
8146     push @{$self->{ct}->{content}}, chr $self->{nc};
8147     ## Stay in the state.
8148    
8149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8150     $self->{line_prev} = $self->{line};
8151     $self->{column_prev} = $self->{column};
8152     $self->{column}++;
8153     $self->{nc}
8154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8155     } else {
8156     $self->{set_nc}->($self);
8157     }
8158    
8159     redo A;
8160     } elsif ($self->{nc} == 0x007C or # |
8161     $self->{nc} == 0x002C) { # ,
8162     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8163     ## Stay in the state.
8164    
8165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8166     $self->{line_prev} = $self->{line};
8167     $self->{column_prev} = $self->{column};
8168     $self->{column}++;
8169     $self->{nc}
8170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8171     } else {
8172     $self->{set_nc}->($self);
8173     }
8174    
8175     redo A;
8176     } elsif ($self->{nc} == 0x0029) { # )
8177     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8178     push @{$self->{ct}->{content}}, chr $self->{nc};
8179     $self->{group_depth}--;
8180     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8181    
8182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8183     $self->{line_prev} = $self->{line};
8184     $self->{column_prev} = $self->{column};
8185     $self->{column}++;
8186     $self->{nc}
8187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8188     } else {
8189     $self->{set_nc}->($self);
8190     }
8191    
8192     redo A;
8193     } elsif ($self->{nc} == 0x003E) { # >
8194     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8195     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8196     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8197    
8198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8199     $self->{line_prev} = $self->{line};
8200     $self->{column_prev} = $self->{column};
8201     $self->{column}++;
8202     $self->{nc}
8203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8204     } else {
8205     $self->{set_nc}->($self);
8206     }
8207    
8208     return ($self->{ct}); # ELEMENT
8209     redo A;
8210     } elsif ($self->{nc} == -1) {
8211     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8212     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8213     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8214    
8215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8216     $self->{line_prev} = $self->{line};
8217     $self->{column_prev} = $self->{column};
8218     $self->{column}++;
8219     $self->{nc}
8220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8221     } else {
8222     $self->{set_nc}->($self);
8223     }
8224    
8225     return ($self->{ct}); # ELEMENT
8226     redo A;
8227     } else {
8228     push @{$self->{ct}->{content}}, chr $self->{nc};
8229     $self->{state} = CM_ELEMENT_NAME_STATE;
8230    
8231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8232     $self->{line_prev} = $self->{line};
8233     $self->{column_prev} = $self->{column};
8234     $self->{column}++;
8235     $self->{nc}
8236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8237     } else {
8238     $self->{set_nc}->($self);
8239     }
8240    
8241     redo A;
8242     }
8243     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8244     if ($is_space->{$self->{nc}}) {
8245     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8246    
8247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8248     $self->{line_prev} = $self->{line};
8249     $self->{column_prev} = $self->{column};
8250     $self->{column}++;
8251     $self->{nc}
8252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8253     } else {
8254     $self->{set_nc}->($self);
8255     }
8256    
8257     redo A;
8258     } elsif ($self->{nc} == 0x002A or # *
8259     $self->{nc} == 0x002B or # +
8260     $self->{nc} == 0x003F) { # ?
8261     push @{$self->{ct}->{content}}, chr $self->{nc};
8262     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8263    
8264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8265     $self->{line_prev} = $self->{line};
8266     $self->{column_prev} = $self->{column};
8267     $self->{column}++;
8268     $self->{nc}
8269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8270     } else {
8271     $self->{set_nc}->($self);
8272     }
8273    
8274     redo A;
8275     } elsif ($self->{nc} == 0x007C or # |
8276     $self->{nc} == 0x002C) { # ,
8277     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8278     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8279    
8280     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8281     $self->{line_prev} = $self->{line};
8282     $self->{column_prev} = $self->{column};
8283     $self->{column}++;
8284     $self->{nc}
8285     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8286     } else {
8287     $self->{set_nc}->($self);
8288     }
8289    
8290     redo A;
8291     } elsif ($self->{nc} == 0x0029) { # )
8292     $self->{group_depth}--;
8293     push @{$self->{ct}->{content}}, chr $self->{nc};
8294     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8295    
8296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8297     $self->{line_prev} = $self->{line};
8298     $self->{column_prev} = $self->{column};
8299     $self->{column}++;
8300     $self->{nc}
8301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8302     } else {
8303     $self->{set_nc}->($self);
8304     }
8305    
8306     redo A;
8307     } elsif ($self->{nc} == 0x003E) { # >
8308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8309     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8310     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8311    
8312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8313     $self->{line_prev} = $self->{line};
8314     $self->{column_prev} = $self->{column};
8315     $self->{column}++;
8316     $self->{nc}
8317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8318     } else {
8319     $self->{set_nc}->($self);
8320     }
8321    
8322     return ($self->{ct}); # ELEMENT
8323     redo A;
8324     } elsif ($self->{nc} == -1) {
8325     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8326     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8327     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8328    
8329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8330     $self->{line_prev} = $self->{line};
8331     $self->{column_prev} = $self->{column};
8332     $self->{column}++;
8333     $self->{nc}
8334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8335     } else {
8336     $self->{set_nc}->($self);
8337     }
8338    
8339     return ($self->{ct}); # ELEMENT
8340     redo A;
8341     } else {
8342     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8343     ## Stay in the state.
8344    
8345     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8346     $self->{line_prev} = $self->{line};
8347     $self->{column_prev} = $self->{column};
8348     $self->{column}++;
8349     $self->{nc}
8350     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8351     } else {
8352     $self->{set_nc}->($self);
8353     }
8354    
8355     redo A;
8356     }
8357     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8358     if ($is_space->{$self->{nc}}) {
8359     ## Stay in the state.
8360    
8361     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8362     $self->{line_prev} = $self->{line};
8363     $self->{column_prev} = $self->{column};
8364     $self->{column}++;
8365     $self->{nc}
8366     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8367     } else {
8368     $self->{set_nc}->($self);
8369     }
8370    
8371     redo A;
8372     } elsif ($self->{nc} == 0x007C or # |
8373     $self->{nc} == 0x002C) { # ,
8374     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8375     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8376    
8377     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8378     $self->{line_prev} = $self->{line};
8379     $self->{column_prev} = $self->{column};
8380     $self->{column}++;
8381     $self->{nc}
8382     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8383     } else {
8384     $self->{set_nc}->($self);
8385     }
8386    
8387     redo A;
8388     } elsif ($self->{nc} == 0x0029) { # )
8389     $self->{group_depth}--;
8390     push @{$self->{ct}->{content}}, chr $self->{nc};
8391     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8392    
8393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8394     $self->{line_prev} = $self->{line};
8395     $self->{column_prev} = $self->{column};
8396     $self->{column}++;
8397     $self->{nc}
8398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8399     } else {
8400     $self->{set_nc}->($self);
8401     }
8402    
8403     redo A;
8404     } elsif ($self->{nc} == 0x003E) { # >
8405     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8406     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8407     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8408    
8409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8410     $self->{line_prev} = $self->{line};
8411     $self->{column_prev} = $self->{column};
8412     $self->{column}++;
8413     $self->{nc}
8414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8415     } else {
8416     $self->{set_nc}->($self);
8417     }
8418    
8419     return ($self->{ct}); # ELEMENT
8420     redo A;
8421     } elsif ($self->{nc} == -1) {
8422     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8423     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8424     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8425    
8426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8427     $self->{line_prev} = $self->{line};
8428     $self->{column_prev} = $self->{column};
8429     $self->{column}++;
8430     $self->{nc}
8431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8432     } else {
8433     $self->{set_nc}->($self);
8434     }
8435    
8436     return ($self->{ct}); # ELEMENT
8437     redo A;
8438     } else {
8439     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8440     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8441     $self->{state} = BOGUS_MD_STATE;
8442    
8443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8444     $self->{line_prev} = $self->{line};
8445     $self->{column_prev} = $self->{column};
8446     $self->{column}++;
8447     $self->{nc}
8448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8449     } else {
8450     $self->{set_nc}->($self);
8451     }
8452    
8453     redo A;
8454     }
8455     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8456     if ($is_space->{$self->{nc}}) {
8457     if ($self->{group_depth}) {
8458     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8459     } else {
8460     $self->{state} = AFTER_MD_DEF_STATE;
8461     }
8462    
8463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8464     $self->{line_prev} = $self->{line};
8465     $self->{column_prev} = $self->{column};
8466     $self->{column}++;
8467     $self->{nc}
8468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8469     } else {
8470     $self->{set_nc}->($self);
8471     }
8472    
8473     redo A;
8474     } elsif ($self->{nc} == 0x002A or # *
8475     $self->{nc} == 0x002B or # +
8476     $self->{nc} == 0x003F) { # ?
8477     push @{$self->{ct}->{content}}, chr $self->{nc};
8478     if ($self->{group_depth}) {
8479     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8480     } else {
8481     $self->{state} = AFTER_MD_DEF_STATE;
8482     }
8483    
8484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8485     $self->{line_prev} = $self->{line};
8486     $self->{column_prev} = $self->{column};
8487     $self->{column}++;
8488     $self->{nc}
8489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8490     } else {
8491     $self->{set_nc}->($self);
8492     }
8493    
8494     redo A;
8495     } elsif ($self->{nc} == 0x0029) { # )
8496     if ($self->{group_depth}) {
8497     $self->{group_depth}--;
8498     push @{$self->{ct}->{content}}, chr $self->{nc};
8499     ## Stay in the state.
8500    
8501     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8502     $self->{line_prev} = $self->{line};
8503     $self->{column_prev} = $self->{column};
8504     $self->{column}++;
8505     $self->{nc}
8506     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8507     } else {
8508     $self->{set_nc}->($self);
8509     }
8510    
8511     redo A;
8512     } else {
8513     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8514     $self->{state} = BOGUS_MD_STATE;
8515     ## Reconsume.
8516     redo A;
8517     }
8518     } elsif ($self->{nc} == 0x003E) { # >
8519     if ($self->{group_depth}) {
8520     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8521     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8522     }
8523     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8524    
8525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8526     $self->{line_prev} = $self->{line};
8527     $self->{column_prev} = $self->{column};
8528     $self->{column}++;
8529     $self->{nc}
8530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8531     } else {
8532     $self->{set_nc}->($self);
8533     }
8534    
8535     return ($self->{ct}); # ELEMENT
8536     redo A;
8537     } elsif ($self->{nc} == -1) {
8538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8539     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8541    
8542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8543     $self->{line_prev} = $self->{line};
8544     $self->{column_prev} = $self->{column};
8545     $self->{column}++;
8546     $self->{nc}
8547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8548     } else {
8549     $self->{set_nc}->($self);
8550     }
8551    
8552     return ($self->{ct}); # ELEMENT
8553     redo A;
8554     } else {
8555     if ($self->{group_depth}) {
8556     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8557     } else {
8558     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8559     $self->{state} = BOGUS_MD_STATE;
8560     }
8561     ## Reconsume.
8562     redo A;
8563     }
8564     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8565 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8566     ## Stay in the state.
8567    
8568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8569     $self->{line_prev} = $self->{line};
8570     $self->{column_prev} = $self->{column};
8571     $self->{column}++;
8572     $self->{nc}
8573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8574     } else {
8575     $self->{set_nc}->($self);
8576     }
8577    
8578     redo A;
8579     } elsif ($self->{nc} == 0x003E) { # >
8580     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8581    
8582     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8583     $self->{line_prev} = $self->{line};
8584     $self->{column_prev} = $self->{column};
8585     $self->{column}++;
8586     $self->{nc}
8587     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8588     } else {
8589     $self->{set_nc}->($self);
8590     }
8591    
8592 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8593 wakaba 1.18 redo A;
8594     } elsif ($self->{nc} == -1) {
8595     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8597    
8598     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8599     $self->{line_prev} = $self->{line};
8600     $self->{column_prev} = $self->{column};
8601     $self->{column}++;
8602     $self->{nc}
8603     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8604     } else {
8605     $self->{set_nc}->($self);
8606     }
8607    
8608 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8609 wakaba 1.18 redo A;
8610     } else {
8611 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8612 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8613     ## Reconsume.
8614     redo A;
8615     }
8616 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8617     if ($self->{nc} == 0x003E) { # >
8618     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8619    
8620     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8621     $self->{line_prev} = $self->{line};
8622     $self->{column_prev} = $self->{column};
8623     $self->{column}++;
8624     $self->{nc}
8625     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8626     } else {
8627     $self->{set_nc}->($self);
8628     }
8629    
8630     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8631     redo A;
8632     } elsif ($self->{nc} == -1) {
8633     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8634     ## Reconsume.
8635     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8636     redo A;
8637     } else {
8638     ## Stay in the state.
8639    
8640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8641     $self->{line_prev} = $self->{line};
8642     $self->{column_prev} = $self->{column};
8643     $self->{column}++;
8644     $self->{nc}
8645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8646     } else {
8647     $self->{set_nc}->($self);
8648     }
8649    
8650     redo A;
8651     }
8652 wakaba 1.1 } else {
8653     die "$0: $self->{state}: Unknown state";
8654     }
8655     } # A
8656    
8657     die "$0: _get_next_token: unexpected case";
8658     } # _get_next_token
8659    
8660     1;
8661 wakaba 1.26 ## $Date: 2008/10/19 15:17:01 $
8662 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24