/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (hide annotations) (download)
Sun Oct 19 14:05:20 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.23: +10 -2 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 14:05:17 -0000
	* attlist-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
	notations-1.dat, pis-2.dat: Unexpanded parameter entity tests are
	added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 14:03:50 -0000
	* Tokenizer.pm.src: Set the "stop_processing" flag true when a
	parameter entity occurs in a standalone="no" document.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 14:04:25 -0000
	* Parser.pm.src: Don't process ATTLIST_TOKEN and ENTITY_TOKEN if
	the "stop_processing" flag is set.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
        ## This package declares the token-type constants and makes them
        ## exportable, either one by one or all together via the |:token| tag.
2     use strict;
        ## The version string is built from the digit runs found in the CVS
        ## |Revision| keyword: the first run is formatted with "%d." and
        ## every following run with "%02d".
3 wakaba 1.24 our $VERSION=do{my @r=(q$Revision: 1.23 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
        ## The export tables are filled in inside a BEGIN block so that they
        ## already exist at compile time; |Whatpm::HTML| (later in this file)
        ## calls |->import (':token')| from its own BEGIN block.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
        ## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
        ## The |:token| tag lists the same fourteen names as |@EXPORT_OK|
        ## above; the two lists are maintained by hand and must be kept
        ## in sync when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
        ## Each constant below is the numeric value stored in the |type|
        ## field of a token hash.  They are declared as subs with an empty
        ## prototype and a constant body, the form perl can inline at
        ## compile time.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with the $self->{self_closing}
67     ## flag set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
        ## Definitions that follow (until any later |package| statement)
        ## are installed into |Whatpm::HTML|, not into the tokenizer package.
74    
        ## Pull the token-type constants into this package at compile time.
        ## |import| is called directly; NOTE(review): presumably because
        ## |Whatpm::HTML::Tokenizer| is defined in this same file rather
        ## than loaded with |use| -- confirm.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
        ## The three |CM_*| values are single-bit flags.  The tokenizer tests
        ## them with bitwise AND, e.g. |$self->{content_model} & CM_ENTITY|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
        ## The four content models are combinations of the flags above;
        ## PLAINTEXT has no flag set.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
        ## These are the values held in |$self->{state}|; the tokenizer
        ## selects the current state with numeric |==| comparisons.
        ## Numbers 1 and 12 are unused: the two states that had them are
        ## commented out, and their work is done by the six entity states
        ## (44..49) described in the NOTE further down.
89    
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
        ## States 36..43 and 50 have no state of their own in the spec; the
        ## trailing comment on each line names the spec state it is part of.
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
        ## Further values for |$self->{state}|, numbered 51..101 so that
        ## they do not collide with the states 0..50 declared before them.
        ## 51..56 cover processing instructions; the rest cover the DOCTYPE
        ## internal subset and the markup declarations inside it.
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
        ## NOTE(review): both literals below evaluate to the same number
        ## (2048, i.e. only bit 11 set).  Presumably they belong to two
        ## different bit-field namespaces, so the equal value is harmless
        ## -- confirm against the definitions in Whatpm::HTML.
200    
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
        ## |$charref_map| maps a code point (key) to the code point that
        ## replaces it (value).  NOTE(review): presumably consulted when a
        ## numeric character reference is resolved; the lookup itself is
        ## not in this part of the file -- confirm.
205    
206     my $charref_map = {
        ## CARRIAGE RETURN is replaced by LINE FEED.
207     0x0D => 0x000A,
        ## 0x80..0x9F: the replacements correspond to the Windows-1252
        ## reading of these values; the five positions that Windows-1252
        ## leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) become U+FFFD.
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
        ## Every code point in the list below is additionally mapped to
        ## U+FFFD (REPLACEMENT CHARACTER): control characters other than
        ## HT, LF, FF and CR, DELETE, the surrogate range, and the
        ## noncharacters listed.
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run through U+FDEF; 0xFDE0..0xFDEF are not mapped here)
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec
251    
## Reset the tokenizer to its initial state and read the first input
## character.
##
## Invoked as a method; the only argument is the parser object |$self|.
##
## Fields that must already have been set by the |new| constructor:
##   $self->{level}
##   $self->{set_nc}       - code reference, called here as ->($self)
##   $self->{parse_error}
##   $self->{is_xml}       (if XML)
##
## Fields that are deliberately left alone because they are initialized
## when first used: {kwd}, {entity__value}, {entity__match}, {prev_state},
## {next_nc} and {escape}.
##
## The value of the last statement (the fresh, empty |{token}| array
## reference) is what the sub returns, as before.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The two trailing "MUST" / "be" comments read together: these two
  ## initial values are mandatory.
  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character

  ## Read the first input character.  The character buffer was emptied
  ## and its position reset to 0 just above, so there can never be a
  ## buffered character to take here; the "consume from |char_buffer|"
  ## branch that other call sites need was unreachable in this sub and
  ## has been dropped.  The input callback is always asked instead.
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens waiting to be returned
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## A token's own |->{self_closing}| is used to save the value of
312     ## |$self->{self_closing}| while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
325     my $is_space = {
        ## Lookup table keyed by code point; a true value marks a space
        ## character.  It is consulted as |$is_space->{$self->{nc}}|.
        ## VT and CR are commented out and therefore NOT treated as spaces
        ## by this table.  NOTE(review): CR is presumably normalized before
        ## it reaches the tokenizer -- confirm.
326     0x0009 => 1, # CHARACTER TABULATION (HT)
327     0x000A => 1, # LINE FEED (LF)
328     #0x000B => 0, # LINE TABULATION (VT)
329 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
331     0x0020 => 1, # SPACE (SP)
332     };
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not explicitly
926     ## state that the next input character is to be reconsumed,
927     ## it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
1281 wakaba 1.1 my $before_leave = sub {
     ## Commits the attribute currently being built ($self->{ca}) to
     ## the current tag token ($self->{ct}).  The branches of this
     ## state that switch to another state or emit the token (white
     ## space, "=", ">", "/" and EOF) call this closure first; the
     ## branches that stay in this state do not.
     ##
     ## Takes no arguments and returns nothing useful; it works purely
     ## through $self, which it closes over.
     ##
     ## NOTE: Testing |exists| on the nested key autovivifies
     ## $self->{ct}->{attributes} as an empty hash if it was not set
     ## yet.  Either way, once this closure has run, |attributes|
     ## contains an entry for the name of $self->{ca}.
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
     ## An attribute with the same name has already been stored on
     ## this token: report the error at the position recorded when
     ## $self->{ca} was created, and keep the earlier attribute.
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
     ## (Nothing is stored here, so the duplicate is simply not
     ## added to the token.)
1287     } else {
1288    
     ## First occurrence of this name: register the attribute on the
     ## token, keyed by its name.
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
     ## Record the order of acceptance.  |last_index| is
     ## pre-incremented, so the first stored attribute gets index 1
     ## if |last_index| starts out unset or 0 (its initialisation is
     ## not visible here -- TODO confirm).
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743     if ($self->{nc} == 0x003D) { # =
1744    
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819     } elsif ($self->{nc} == -1) {
1820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1821     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822    
1823     $self->{last_stag_name} = $self->{ct}->{tag_name};
1824 wakaba 1.15
1825     $self->{state} = DATA_STATE;
1826     $self->{s_kwd} = '';
1827     ## reconsume
1828     return ($self->{ct}); # start tag
1829     redo A;
1830 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832     if ($self->{ct}->{attributes}) {
1833    
1834     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1835     } else {
1836     ## NOTE: This state should never be reached.
1837    
1838     }
1839 wakaba 1.15
1840     $self->{state} = DATA_STATE;
1841     $self->{s_kwd} = '';
1842     ## reconsume
1843     return ($self->{ct}); # end tag
1844     redo A;
1845     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846     ## XML5: No parse error above; not defined yet.
1847     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849     ## Reconsume.
1850     return ($self->{ct}); # ATTLIST
1851     redo A;
1852 wakaba 1.1 } else {
1853     die "$0: $self->{ct}->{type}: Unknown token type";
1854     }
1855     } else {
1856 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1857 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858    
1859     ## XML5: Not a parse error.
1860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861     } else {
1862    
1863     }
1864 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1865     $self->{read_until}->($self->{ca}->{value},
1866 wakaba 1.11 q["&<],
1867 wakaba 1.1 length $self->{ca}->{value});
1868    
1869     ## Stay in the state
1870    
1871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1872     $self->{line_prev} = $self->{line};
1873     $self->{column_prev} = $self->{column};
1874     $self->{column}++;
1875     $self->{nc}
1876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1877     } else {
1878     $self->{set_nc}->($self);
1879     }
1880    
1881     redo A;
1882     }
1883     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885     ## ATTLIST attribute value single quoted state".
1886 wakaba 1.11
1887 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1888 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889    
1890     ## XML5: "DOCTYPE ATTLIST name after state".
1891     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893     } else {
1894    
1895     ## XML5: "Before attribute name state" (sic).
1896     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897     }
1898 wakaba 1.1
1899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900     $self->{line_prev} = $self->{line};
1901     $self->{column_prev} = $self->{column};
1902     $self->{column}++;
1903     $self->{nc}
1904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1905     } else {
1906     $self->{set_nc}->($self);
1907     }
1908    
1909     redo A;
1910     } elsif ($self->{nc} == 0x0026) { # &
1911    
1912 wakaba 1.11 ## XML5: Not defined yet.
1913    
1914 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1915     ## "entity in attribute value state". In this implementation, the
1916     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1917     ## implementation of the "consume a character reference" algorithm.
1918     $self->{entity_add} = 0x0027; # '
1919     $self->{prev_state} = $self->{state};
1920     $self->{state} = ENTITY_STATE;
1921    
1922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923     $self->{line_prev} = $self->{line};
1924     $self->{column_prev} = $self->{column};
1925     $self->{column}++;
1926     $self->{nc}
1927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928     } else {
1929     $self->{set_nc}->($self);
1930     }
1931    
1932     redo A;
1933     } elsif ($self->{nc} == -1) {
1934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1935     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936    
1937     $self->{last_stag_name} = $self->{ct}->{tag_name};
1938 wakaba 1.15
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     ## reconsume
1942     return ($self->{ct}); # start tag
1943     redo A;
1944 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946     if ($self->{ct}->{attributes}) {
1947    
1948     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1949     } else {
1950     ## NOTE: This state should never be reached.
1951    
1952     }
1953 wakaba 1.15
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     ## reconsume
1957     return ($self->{ct}); # end tag
1958     redo A;
1959     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960     ## XML5: No parse error above; not defined yet.
1961     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963     ## Reconsume.
1964     return ($self->{ct}); # ATTLIST
1965     redo A;
1966 wakaba 1.1 } else {
1967     die "$0: $self->{ct}->{type}: Unknown token type";
1968     }
1969     } else {
1970 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1971 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972    
1973     ## XML5: Not a parse error.
1974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975     } else {
1976    
1977     }
1978 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1979     $self->{read_until}->($self->{ca}->{value},
1980 wakaba 1.11 q['&<],
1981 wakaba 1.1 length $self->{ca}->{value});
1982    
1983     ## Stay in the state
1984    
1985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986     $self->{line_prev} = $self->{line};
1987     $self->{column_prev} = $self->{column};
1988     $self->{column}++;
1989     $self->{nc}
1990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991     } else {
1992     $self->{set_nc}->($self);
1993     }
1994    
1995     redo A;
1996     }
1997     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1999    
2000 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2001 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002    
2003     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005     } else {
2006    
2007     ## XML5: "Tag attribute name before state".
2008     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009     }
2010 wakaba 1.1
2011     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012     $self->{line_prev} = $self->{line};
2013     $self->{column_prev} = $self->{column};
2014     $self->{column}++;
2015     $self->{nc}
2016     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017     } else {
2018     $self->{set_nc}->($self);
2019     }
2020    
2021     redo A;
2022     } elsif ($self->{nc} == 0x0026) { # &
2023    
2024 wakaba 1.11
2025     ## XML5: Not defined yet.
2026    
2027 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2028     ## "entity in attribute value state". In this implementation, the
2029     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2030     ## implementation of the "consume a character reference" algorithm.
2031     $self->{entity_add} = -1;
2032     $self->{prev_state} = $self->{state};
2033     $self->{state} = ENTITY_STATE;
2034    
2035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036     $self->{line_prev} = $self->{line};
2037     $self->{column_prev} = $self->{column};
2038     $self->{column}++;
2039     $self->{nc}
2040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041     } else {
2042     $self->{set_nc}->($self);
2043     }
2044    
2045     redo A;
2046     } elsif ($self->{nc} == 0x003E) { # >
2047     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048    
2049     $self->{last_stag_name} = $self->{ct}->{tag_name};
2050 wakaba 1.15
2051     $self->{state} = DATA_STATE;
2052     $self->{s_kwd} = '';
2053    
2054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055     $self->{line_prev} = $self->{line};
2056     $self->{column_prev} = $self->{column};
2057     $self->{column}++;
2058     $self->{nc}
2059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060     } else {
2061     $self->{set_nc}->($self);
2062     }
2063    
2064     return ($self->{ct}); # start tag
2065     redo A;
2066 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068     if ($self->{ct}->{attributes}) {
2069    
2070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2071     } else {
2072     ## NOTE: This state should never be reached.
2073    
2074     }
2075 wakaba 1.15
2076     $self->{state} = DATA_STATE;
2077     $self->{s_kwd} = '';
2078    
2079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080     $self->{line_prev} = $self->{line};
2081     $self->{column_prev} = $self->{column};
2082     $self->{column}++;
2083     $self->{nc}
2084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085     } else {
2086     $self->{set_nc}->($self);
2087     }
2088    
2089     return ($self->{ct}); # end tag
2090     redo A;
2091     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094    
2095 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096     $self->{line_prev} = $self->{line};
2097     $self->{column_prev} = $self->{column};
2098     $self->{column}++;
2099     $self->{nc}
2100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101     } else {
2102     $self->{set_nc}->($self);
2103     }
2104    
2105 wakaba 1.15 return ($self->{ct}); # ATTLIST
2106     redo A;
2107     } else {
2108     die "$0: $self->{ct}->{type}: Unknown token type";
2109     }
2110 wakaba 1.1 } elsif ($self->{nc} == -1) {
2111     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112    
2113 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2115 wakaba 1.15
2116     $self->{state} = DATA_STATE;
2117     $self->{s_kwd} = '';
2118     ## reconsume
2119     return ($self->{ct}); # start tag
2120     redo A;
2121 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124     if ($self->{ct}->{attributes}) {
2125    
2126     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2127     } else {
2128     ## NOTE: This state should never be reached.
2129    
2130     }
2131 wakaba 1.15
2132     $self->{state} = DATA_STATE;
2133     $self->{s_kwd} = '';
2134     ## reconsume
2135     return ($self->{ct}); # end tag
2136     redo A;
2137     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141     ## Reconsume.
2142     return ($self->{ct}); # ATTLIST
2143     redo A;
2144 wakaba 1.1 } else {
2145     die "$0: $self->{ct}->{type}: Unknown token type";
2146     }
2147     } else {
2148     if ({
2149     0x0022 => 1, # "
2150     0x0027 => 1, # '
2151     0x003D => 1, # =
2152     }->{$self->{nc}}) {
2153    
2154 wakaba 1.11 ## XML5: Not a parse error.
2155 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156     } else {
2157    
2158     }
2159     $self->{ca}->{value} .= chr ($self->{nc});
2160     $self->{read_until}->($self->{ca}->{value},
2161     q["'=& >],
2162     length $self->{ca}->{value});
2163    
2164     ## Stay in the state
2165    
2166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167     $self->{line_prev} = $self->{line};
2168     $self->{column_prev} = $self->{column};
2169     $self->{column}++;
2170     $self->{nc}
2171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172     } else {
2173     $self->{set_nc}->($self);
2174     }
2175    
2176     redo A;
2177     }
2178     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2179     if ($is_space->{$self->{nc}}) {
2180    
2181     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182    
2183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2184     $self->{line_prev} = $self->{line};
2185     $self->{column_prev} = $self->{column};
2186     $self->{column}++;
2187     $self->{nc}
2188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2189     } else {
2190     $self->{set_nc}->($self);
2191     }
2192    
2193     redo A;
2194     } elsif ($self->{nc} == 0x003E) { # >
2195     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2196    
2197     $self->{last_stag_name} = $self->{ct}->{tag_name};
2198     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2199     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2200     if ($self->{ct}->{attributes}) {
2201    
2202     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2203     } else {
2204     ## NOTE: This state should never be reached.
2205    
2206     }
2207     } else {
2208     die "$0: $self->{ct}->{type}: Unknown token type";
2209     }
2210     $self->{state} = DATA_STATE;
2211 wakaba 1.5 $self->{s_kwd} = '';
2212 wakaba 1.1
2213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214     $self->{line_prev} = $self->{line};
2215     $self->{column_prev} = $self->{column};
2216     $self->{column}++;
2217     $self->{nc}
2218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2219     } else {
2220     $self->{set_nc}->($self);
2221     }
2222    
2223    
2224     return ($self->{ct}); # start tag or end tag
2225    
2226     redo A;
2227     } elsif ($self->{nc} == 0x002F) { # /
2228    
2229     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2230    
2231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2232     $self->{line_prev} = $self->{line};
2233     $self->{column_prev} = $self->{column};
2234     $self->{column}++;
2235     $self->{nc}
2236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2237     } else {
2238     $self->{set_nc}->($self);
2239     }
2240    
2241     redo A;
2242     } elsif ($self->{nc} == -1) {
2243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2244     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2245    
2246     $self->{last_stag_name} = $self->{ct}->{tag_name};
2247     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2248     if ($self->{ct}->{attributes}) {
2249    
2250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2251     } else {
2252     ## NOTE: This state should never be reached.
2253    
2254     }
2255     } else {
2256     die "$0: $self->{ct}->{type}: Unknown token type";
2257     }
2258     $self->{state} = DATA_STATE;
2259 wakaba 1.5 $self->{s_kwd} = '';
2260 wakaba 1.1 ## Reconsume.
2261     return ($self->{ct}); # start tag or end tag
2262     redo A;
2263     } else {
2264    
2265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2266     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2267     ## reconsume
2268     redo A;
2269     }
2270     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271 wakaba 1.11 ## XML5: "Empty tag state".
2272    
2273 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2274     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275    
2276     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2277     ## TODO: Different type than slash in start tag
2278     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2279     if ($self->{ct}->{attributes}) {
2280    
2281     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2282     } else {
2283    
2284     }
2285     ## TODO: Test |<title></title/>|
2286     } else {
2287    
2288     $self->{self_closing} = 1;
2289     }
2290    
2291     $self->{state} = DATA_STATE;
2292 wakaba 1.5 $self->{s_kwd} = '';
2293 wakaba 1.1
2294     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295     $self->{line_prev} = $self->{line};
2296     $self->{column_prev} = $self->{column};
2297     $self->{column}++;
2298     $self->{nc}
2299     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300     } else {
2301     $self->{set_nc}->($self);
2302     }
2303    
2304    
2305     return ($self->{ct}); # start tag or end tag
2306    
2307     redo A;
2308     } elsif ($self->{nc} == -1) {
2309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2310     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2311    
2312     $self->{last_stag_name} = $self->{ct}->{tag_name};
2313     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318     ## NOTE: This state should never be reached.
2319    
2320     }
2321     } else {
2322     die "$0: $self->{ct}->{type}: Unknown token type";
2323     }
2324 wakaba 1.11 ## XML5: "Tag attribute name before state".
2325 wakaba 1.1 $self->{state} = DATA_STATE;
2326 wakaba 1.5 $self->{s_kwd} = '';
2327 wakaba 1.1 ## Reconsume.
2328     return ($self->{ct}); # start tag or end tag
2329     redo A;
2330     } else {
2331    
2332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2333     ## TODO: This error type is wrong.
2334     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2335     ## Reconsume.
2336     redo A;
2337     }
2338     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340    
2341 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2342     ## consumes characters on a one-by-one basis.
2343    
2344     if ($self->{nc} == 0x003E) { # >
2345 wakaba 1.13 if ($self->{in_subset}) {
2346    
2347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348     } else {
2349    
2350     $self->{state} = DATA_STATE;
2351     $self->{s_kwd} = '';
2352     }
2353 wakaba 1.1
2354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355     $self->{line_prev} = $self->{line};
2356     $self->{column_prev} = $self->{column};
2357     $self->{column}++;
2358     $self->{nc}
2359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2360     } else {
2361     $self->{set_nc}->($self);
2362     }
2363    
2364    
2365     return ($self->{ct}); # comment
2366     redo A;
2367     } elsif ($self->{nc} == -1) {
2368 wakaba 1.13 if ($self->{in_subset}) {
2369    
2370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371     } else {
2372    
2373     $self->{state} = DATA_STATE;
2374     $self->{s_kwd} = '';
2375     }
2376 wakaba 1.1 ## reconsume
2377    
2378     return ($self->{ct}); # comment
2379     redo A;
2380     } else {
2381    
2382     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2383     $self->{read_until}->($self->{ct}->{data},
2384     q[>],
2385     length $self->{ct}->{data});
2386    
2387     ## Stay in the state.
2388    
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399     redo A;
2400     }
2401     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402 wakaba 1.14 ## XML5: "Markup declaration state".
2403 wakaba 1.1
2404     if ($self->{nc} == 0x002D) { # -
2405    
2406     $self->{state} = MD_HYPHEN_STATE;
2407    
2408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409     $self->{line_prev} = $self->{line};
2410     $self->{column_prev} = $self->{column};
2411     $self->{column}++;
2412     $self->{nc}
2413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414     } else {
2415     $self->{set_nc}->($self);
2416     }
2417    
2418     redo A;
2419     } elsif ($self->{nc} == 0x0044 or # D
2420     $self->{nc} == 0x0064) { # d
2421     ## ASCII case-insensitive.
2422    
2423     $self->{state} = MD_DOCTYPE_STATE;
2424 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2425 wakaba 1.1
2426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427     $self->{line_prev} = $self->{line};
2428     $self->{column_prev} = $self->{column};
2429     $self->{column}++;
2430     $self->{nc}
2431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432     } else {
2433     $self->{set_nc}->($self);
2434     }
2435    
2436     redo A;
2437 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2438     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2439     $self->{is_xml}) and
2440 wakaba 1.1 $self->{nc} == 0x005B) { # [
2441    
2442     $self->{state} = MD_CDATA_STATE;
2443 wakaba 1.12 $self->{kwd} = '[';
2444 wakaba 1.1
2445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446     $self->{line_prev} = $self->{line};
2447     $self->{column_prev} = $self->{column};
2448     $self->{column}++;
2449     $self->{nc}
2450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2451     } else {
2452     $self->{set_nc}->($self);
2453     }
2454    
2455     redo A;
2456     } else {
2457    
2458     }
2459    
2460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461     line => $self->{line_prev},
2462     column => $self->{column_prev} - 1);
2463     ## Reconsume.
2464     $self->{state} = BOGUS_COMMENT_STATE;
2465     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2466     line => $self->{line_prev},
2467     column => $self->{column_prev} - 1,
2468     };
2469     redo A;
2470     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2471     if ($self->{nc} == 0x002D) { # -
2472    
2473     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2474     line => $self->{line_prev},
2475     column => $self->{column_prev} - 2,
2476     };
2477 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478 wakaba 1.1
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     } else {
2491    
2492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2493     line => $self->{line_prev},
2494     column => $self->{column_prev} - 2);
2495     $self->{state} = BOGUS_COMMENT_STATE;
2496     ## Reconsume.
2497     $self->{ct} = {type => COMMENT_TOKEN,
2498     data => '-',
2499     line => $self->{line_prev},
2500     column => $self->{column_prev} - 2,
2501     };
2502     redo A;
2503     }
2504     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2505     ## ASCII case-insensitive.
2506     if ($self->{nc} == [
2507     undef,
2508     0x004F, # O
2509     0x0043, # C
2510     0x0054, # T
2511     0x0059, # Y
2512     0x0050, # P
2513 wakaba 1.12 ]->[length $self->{kwd}] or
2514 wakaba 1.1 $self->{nc} == [
2515     undef,
2516     0x006F, # o
2517     0x0063, # c
2518     0x0074, # t
2519     0x0079, # y
2520     0x0070, # p
2521 wakaba 1.12 ]->[length $self->{kwd}]) {
2522 wakaba 1.1
2523     ## Stay in the state.
2524 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2525 wakaba 1.1
2526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527     $self->{line_prev} = $self->{line};
2528     $self->{column_prev} = $self->{column};
2529     $self->{column}++;
2530     $self->{nc}
2531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532     } else {
2533     $self->{set_nc}->($self);
2534     }
2535    
2536     redo A;
2537 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2538 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2539     $self->{nc} == 0x0065)) { # e
2540 wakaba 1.12 if ($self->{is_xml} and
2541     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542 wakaba 1.10
2543     ## XML5: case-sensitive.
2544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545     text => 'DOCTYPE',
2546     line => $self->{line_prev},
2547     column => $self->{column_prev} - 5);
2548     } else {
2549    
2550     }
2551 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2552     $self->{ct} = {type => DOCTYPE_TOKEN,
2553     quirks => 1,
2554     line => $self->{line_prev},
2555     column => $self->{column_prev} - 7,
2556     };
2557    
2558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559     $self->{line_prev} = $self->{line};
2560     $self->{column_prev} = $self->{column};
2561     $self->{column}++;
2562     $self->{nc}
2563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564     } else {
2565     $self->{set_nc}->($self);
2566     }
2567    
2568     redo A;
2569     } else {
2570    
2571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572     line => $self->{line_prev},
2573 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2574 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2575     ## Reconsume.
2576     $self->{ct} = {type => COMMENT_TOKEN,
2577 wakaba 1.12 data => $self->{kwd},
2578 wakaba 1.1 line => $self->{line_prev},
2579 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2580 wakaba 1.1 };
2581     redo A;
2582     }
2583     } elsif ($self->{state} == MD_CDATA_STATE) {
2584     if ($self->{nc} == {
2585     '[' => 0x0043, # C
2586     '[C' => 0x0044, # D
2587     '[CD' => 0x0041, # A
2588     '[CDA' => 0x0054, # T
2589     '[CDAT' => 0x0041, # A
2590 wakaba 1.12 }->{$self->{kwd}}) {
2591 wakaba 1.1
2592     ## Stay in the state.
2593 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2594 wakaba 1.1
2595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596     $self->{line_prev} = $self->{line};
2597     $self->{column_prev} = $self->{column};
2598     $self->{column}++;
2599     $self->{nc}
2600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601     } else {
2602     $self->{set_nc}->($self);
2603     }
2604    
2605     redo A;
2606 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2607 wakaba 1.1 $self->{nc} == 0x005B) { # [
2608 wakaba 1.6 if ($self->{is_xml} and
2609     not $self->{tainted} and
2610     @{$self->{open_elements} or []} == 0) {
2611 wakaba 1.8
2612 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613     line => $self->{line_prev},
2614     column => $self->{column_prev} - 7);
2615     $self->{tainted} = 1;
2616 wakaba 1.8 } else {
2617    
2618 wakaba 1.6 }
2619    
2620 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2621     data => '',
2622     line => $self->{line_prev},
2623     column => $self->{column_prev} - 7};
2624     $self->{state} = CDATA_SECTION_STATE;
2625    
2626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2627     $self->{line_prev} = $self->{line};
2628     $self->{column_prev} = $self->{column};
2629     $self->{column}++;
2630     $self->{nc}
2631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2632     } else {
2633     $self->{set_nc}->($self);
2634     }
2635    
2636     redo A;
2637     } else {
2638    
2639     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640     line => $self->{line_prev},
2641 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2642 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2643     ## Reconsume.
2644     $self->{ct} = {type => COMMENT_TOKEN,
2645 wakaba 1.12 data => $self->{kwd},
2646 wakaba 1.1 line => $self->{line_prev},
2647 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2648 wakaba 1.1 };
2649     redo A;
2650     }
2651     } elsif ($self->{state} == COMMENT_START_STATE) {
2652     if ($self->{nc} == 0x002D) { # -
2653    
2654     $self->{state} = COMMENT_START_DASH_STATE;
2655    
2656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657     $self->{line_prev} = $self->{line};
2658     $self->{column_prev} = $self->{column};
2659     $self->{column}++;
2660     $self->{nc}
2661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662     } else {
2663     $self->{set_nc}->($self);
2664     }
2665    
2666     redo A;
2667     } elsif ($self->{nc} == 0x003E) { # >
2668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669 wakaba 1.13 if ($self->{in_subset}) {
2670    
2671     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672     } else {
2673    
2674     $self->{state} = DATA_STATE;
2675     $self->{s_kwd} = '';
2676     }
2677 wakaba 1.1
2678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679     $self->{line_prev} = $self->{line};
2680     $self->{column_prev} = $self->{column};
2681     $self->{column}++;
2682     $self->{nc}
2683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684     } else {
2685     $self->{set_nc}->($self);
2686     }
2687    
2688    
2689     return ($self->{ct}); # comment
2690    
2691     redo A;
2692     } elsif ($self->{nc} == -1) {
2693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694 wakaba 1.13 if ($self->{in_subset}) {
2695    
2696     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697     } else {
2698    
2699     $self->{state} = DATA_STATE;
2700     $self->{s_kwd} = '';
2701     }
2702 wakaba 1.1 ## reconsume
2703    
2704     return ($self->{ct}); # comment
2705    
2706     redo A;
2707     } else {
2708    
2709     $self->{ct}->{data} # comment
2710     .= chr ($self->{nc});
2711     $self->{state} = COMMENT_STATE;
2712    
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723     redo A;
2724     }
2725     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726     if ($self->{nc} == 0x002D) { # -
2727    
2728     $self->{state} = COMMENT_END_STATE;
2729    
2730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731     $self->{line_prev} = $self->{line};
2732     $self->{column_prev} = $self->{column};
2733     $self->{column}++;
2734     $self->{nc}
2735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736     } else {
2737     $self->{set_nc}->($self);
2738     }
2739    
2740     redo A;
2741     } elsif ($self->{nc} == 0x003E) { # >
2742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 wakaba 1.13 if ($self->{in_subset}) {
2744    
2745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746     } else {
2747    
2748     $self->{state} = DATA_STATE;
2749     $self->{s_kwd} = '';
2750     }
2751 wakaba 1.1
2752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753     $self->{line_prev} = $self->{line};
2754     $self->{column_prev} = $self->{column};
2755     $self->{column}++;
2756     $self->{nc}
2757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758     } else {
2759     $self->{set_nc}->($self);
2760     }
2761    
2762    
2763     return ($self->{ct}); # comment
2764    
2765     redo A;
2766     } elsif ($self->{nc} == -1) {
2767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 wakaba 1.13 if ($self->{in_subset}) {
2769    
2770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771     } else {
2772    
2773     $self->{state} = DATA_STATE;
2774     $self->{s_kwd} = '';
2775     }
2776 wakaba 1.1 ## reconsume
2777    
2778     return ($self->{ct}); # comment
2779    
2780     redo A;
2781     } else {
2782    
2783     $self->{ct}->{data} # comment
2784     .= '-' . chr ($self->{nc});
2785     $self->{state} = COMMENT_STATE;
2786    
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797     redo A;
2798     }
2799     } elsif ($self->{state} == COMMENT_STATE) {
2800 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2801    
2802 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2803    
2804     $self->{state} = COMMENT_END_DASH_STATE;
2805    
2806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807     $self->{line_prev} = $self->{line};
2808     $self->{column_prev} = $self->{column};
2809     $self->{column}++;
2810     $self->{nc}
2811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812     } else {
2813     $self->{set_nc}->($self);
2814     }
2815    
2816     redo A;
2817     } elsif ($self->{nc} == -1) {
2818     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 wakaba 1.13 if ($self->{in_subset}) {
2820    
2821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822     } else {
2823    
2824     $self->{state} = DATA_STATE;
2825     $self->{s_kwd} = '';
2826     }
2827 wakaba 1.1 ## reconsume
2828    
2829     return ($self->{ct}); # comment
2830    
2831     redo A;
2832     } else {
2833    
2834     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835     $self->{read_until}->($self->{ct}->{data},
2836     q[-],
2837     length $self->{ct}->{data});
2838    
2839     ## Stay in the state
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     }
2853     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855 wakaba 1.10
2856 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2857    
2858     $self->{state} = COMMENT_END_STATE;
2859    
2860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861     $self->{line_prev} = $self->{line};
2862     $self->{column_prev} = $self->{column};
2863     $self->{column}++;
2864     $self->{nc}
2865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866     } else {
2867     $self->{set_nc}->($self);
2868     }
2869    
2870     redo A;
2871     } elsif ($self->{nc} == -1) {
2872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 wakaba 1.13 if ($self->{in_subset}) {
2874    
2875     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876     } else {
2877    
2878     $self->{state} = DATA_STATE;
2879     $self->{s_kwd} = '';
2880     }
2881 wakaba 1.1 ## reconsume
2882    
2883     return ($self->{ct}); # comment
2884    
2885     redo A;
2886     } else {
2887    
2888     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889     $self->{state} = COMMENT_STATE;
2890    
2891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892     $self->{line_prev} = $self->{line};
2893     $self->{column_prev} = $self->{column};
2894     $self->{column}++;
2895     $self->{nc}
2896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897     } else {
2898     $self->{set_nc}->($self);
2899     }
2900    
2901     redo A;
2902     }
2903     } elsif ($self->{state} == COMMENT_END_STATE) {
2904 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905    
2906 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2907 wakaba 1.13 if ($self->{in_subset}) {
2908    
2909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910     } else {
2911    
2912     $self->{state} = DATA_STATE;
2913     $self->{s_kwd} = '';
2914     }
2915 wakaba 1.1
2916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917     $self->{line_prev} = $self->{line};
2918     $self->{column_prev} = $self->{column};
2919     $self->{column}++;
2920     $self->{nc}
2921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922     } else {
2923     $self->{set_nc}->($self);
2924     }
2925    
2926    
2927     return ($self->{ct}); # comment
2928    
2929     redo A;
2930     } elsif ($self->{nc} == 0x002D) { # -
2931    
2932 wakaba 1.10 ## XML5: Not a parse error.
2933 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934     line => $self->{line_prev},
2935     column => $self->{column_prev});
2936     $self->{ct}->{data} .= '-'; # comment
2937     ## Stay in the state
2938    
2939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940     $self->{line_prev} = $self->{line};
2941     $self->{column_prev} = $self->{column};
2942     $self->{column}++;
2943     $self->{nc}
2944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945     } else {
2946     $self->{set_nc}->($self);
2947     }
2948    
2949     redo A;
2950     } elsif ($self->{nc} == -1) {
2951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 wakaba 1.13 if ($self->{in_subset}) {
2953    
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     } else {
2956    
2957     $self->{state} = DATA_STATE;
2958     $self->{s_kwd} = '';
2959     }
2960 wakaba 1.1 ## reconsume
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } else {
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972     $self->{state} = COMMENT_STATE;
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     }
2986     } elsif ($self->{state} == DOCTYPE_STATE) {
2987     if ($is_space->{$self->{nc}}) {
2988    
2989     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990    
2991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992     $self->{line_prev} = $self->{line};
2993     $self->{column_prev} = $self->{column};
2994     $self->{column}++;
2995     $self->{nc}
2996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997     } else {
2998     $self->{set_nc}->($self);
2999     }
3000    
3001     redo A;
3002     } else {
3003    
3004 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3005 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007     ## reconsume
3008     redo A;
3009     }
3010     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3012    
3013 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3014    
3015     ## Stay in the state
3016    
3017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018     $self->{line_prev} = $self->{line};
3019     $self->{column_prev} = $self->{column};
3020     $self->{column}++;
3021     $self->{nc}
3022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023     } else {
3024     $self->{set_nc}->($self);
3025     }
3026    
3027     redo A;
3028     } elsif ($self->{nc} == 0x003E) { # >
3029    
3030 wakaba 1.12 ## XML5: No parse error.
3031 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032     $self->{state} = DATA_STATE;
3033 wakaba 1.5 $self->{s_kwd} = '';
3034 wakaba 1.1
3035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036     $self->{line_prev} = $self->{line};
3037     $self->{column_prev} = $self->{column};
3038     $self->{column}++;
3039     $self->{nc}
3040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041     } else {
3042     $self->{set_nc}->($self);
3043     }
3044    
3045    
3046     return ($self->{ct}); # DOCTYPE (quirks)
3047    
3048     redo A;
3049     } elsif ($self->{nc} == -1) {
3050    
3051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052     $self->{state} = DATA_STATE;
3053 wakaba 1.5 $self->{s_kwd} = '';
3054 wakaba 1.1 ## reconsume
3055    
3056     return ($self->{ct}); # DOCTYPE (quirks)
3057    
3058     redo A;
3059 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060    
3061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064     $self->{in_subset} = 1;
3065 wakaba 1.12
3066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067     $self->{line_prev} = $self->{line};
3068     $self->{column_prev} = $self->{column};
3069     $self->{column}++;
3070     $self->{nc}
3071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072     } else {
3073     $self->{set_nc}->($self);
3074     }
3075    
3076 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3077 wakaba 1.12 redo A;
3078 wakaba 1.1 } else {
3079    
3080     $self->{ct}->{name} = chr $self->{nc};
3081     delete $self->{ct}->{quirks};
3082     $self->{state} = DOCTYPE_NAME_STATE;
3083    
3084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085     $self->{line_prev} = $self->{line};
3086     $self->{column_prev} = $self->{column};
3087     $self->{column}++;
3088     $self->{nc}
3089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090     } else {
3091     $self->{set_nc}->($self);
3092     }
3093    
3094     redo A;
3095     }
3096     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3098    
3099     ## ISSUE: Redundant "First," in the spec.
3100    
3101 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3102    
3103     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == 0x003E) { # >
3117    
3118     $self->{state} = DATA_STATE;
3119 wakaba 1.5 $self->{s_kwd} = '';
3120 wakaba 1.1
3121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122     $self->{line_prev} = $self->{line};
3123     $self->{column_prev} = $self->{column};
3124     $self->{column}++;
3125     $self->{nc}
3126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127     } else {
3128     $self->{set_nc}->($self);
3129     }
3130    
3131    
3132     return ($self->{ct}); # DOCTYPE
3133    
3134     redo A;
3135     } elsif ($self->{nc} == -1) {
3136    
3137     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138     $self->{state} = DATA_STATE;
3139 wakaba 1.5 $self->{s_kwd} = '';
3140 wakaba 1.1 ## reconsume
3141    
3142     $self->{ct}->{quirks} = 1;
3143     return ($self->{ct}); # DOCTYPE
3144    
3145     redo A;
3146 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147    
3148     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150     $self->{in_subset} = 1;
3151 wakaba 1.12
3152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153     $self->{line_prev} = $self->{line};
3154     $self->{column_prev} = $self->{column};
3155     $self->{column}++;
3156     $self->{nc}
3157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158     } else {
3159     $self->{set_nc}->($self);
3160     }
3161    
3162 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3163 wakaba 1.12 redo A;
3164 wakaba 1.1 } else {
3165    
3166     $self->{ct}->{name}
3167     .= chr ($self->{nc}); # DOCTYPE
3168     ## Stay in the state
3169    
3170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171     $self->{line_prev} = $self->{line};
3172     $self->{column_prev} = $self->{column};
3173     $self->{column}++;
3174     $self->{nc}
3175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176     } else {
3177     $self->{set_nc}->($self);
3178     }
3179    
3180     redo A;
3181     }
3182     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184     ## state", but implemented differently.
3185    
3186 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3187    
3188     ## Stay in the state
3189    
3190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191     $self->{line_prev} = $self->{line};
3192     $self->{column_prev} = $self->{column};
3193     $self->{column}++;
3194     $self->{nc}
3195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196     } else {
3197     $self->{set_nc}->($self);
3198     }
3199    
3200     redo A;
3201     } elsif ($self->{nc} == 0x003E) { # >
3202 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203    
3204     $self->{state} = DATA_STATE;
3205     $self->{s_kwd} = '';
3206     } else {
3207    
3208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210     }
3211 wakaba 1.1
3212    
3213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214     $self->{line_prev} = $self->{line};
3215     $self->{column_prev} = $self->{column};
3216     $self->{column}++;
3217     $self->{nc}
3218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219     } else {
3220     $self->{set_nc}->($self);
3221     }
3222    
3223 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 wakaba 1.1 redo A;
3225     } elsif ($self->{nc} == -1) {
3226 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227    
3228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229     $self->{state} = DATA_STATE;
3230     $self->{s_kwd} = '';
3231     $self->{ct}->{quirks} = 1;
3232     } else {
3233    
3234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236     }
3237 wakaba 1.1
3238 wakaba 1.16 ## Reconsume.
3239     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 wakaba 1.1 redo A;
3241     } elsif ($self->{nc} == 0x0050 or # P
3242     $self->{nc} == 0x0070) { # p
3243 wakaba 1.12
3244 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3245 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3246 wakaba 1.1
3247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248     $self->{line_prev} = $self->{line};
3249     $self->{column_prev} = $self->{column};
3250     $self->{column}++;
3251     $self->{nc}
3252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253     } else {
3254     $self->{set_nc}->($self);
3255     }
3256    
3257     redo A;
3258     } elsif ($self->{nc} == 0x0053 or # S
3259     $self->{nc} == 0x0073) { # s
3260 wakaba 1.12
3261 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3262 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3263    
3264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265     $self->{line_prev} = $self->{line};
3266     $self->{column_prev} = $self->{column};
3267     $self->{column}++;
3268     $self->{nc}
3269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270     } else {
3271     $self->{set_nc}->($self);
3272     }
3273    
3274     redo A;
3275 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3276     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278    
3279     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280     $self->{ct}->{value} = ''; # ENTITY
3281    
3282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283     $self->{line_prev} = $self->{line};
3284     $self->{column_prev} = $self->{column};
3285     $self->{column}++;
3286     $self->{nc}
3287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288     } else {
3289     $self->{set_nc}->($self);
3290     }
3291    
3292     redo A;
3293     } elsif ($self->{nc} == 0x0027 and # '
3294     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296    
3297     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298     $self->{ct}->{value} = ''; # ENTITY
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310     redo A;
3311 wakaba 1.16 } elsif ($self->{is_xml} and
3312     $self->{ct}->{type} == DOCTYPE_TOKEN and
3313     $self->{nc} == 0x005B) { # [
3314 wakaba 1.12
3315     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 wakaba 1.13 $self->{in_subset} = 1;
3318 wakaba 1.1
3319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320     $self->{line_prev} = $self->{line};
3321     $self->{column_prev} = $self->{column};
3322     $self->{column}++;
3323     $self->{nc}
3324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325     } else {
3326     $self->{set_nc}->($self);
3327     }
3328    
3329 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3330 wakaba 1.1 redo A;
3331     } else {
3332 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333    
3334     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335    
3336     $self->{ct}->{quirks} = 1;
3337     $self->{state} = BOGUS_DOCTYPE_STATE;
3338     } else {
3339    
3340     $self->{state} = BOGUS_MD_STATE;
3341     }
3342 wakaba 1.1
3343    
3344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345     $self->{line_prev} = $self->{line};
3346     $self->{column_prev} = $self->{column};
3347     $self->{column}++;
3348     $self->{nc}
3349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350     } else {
3351     $self->{set_nc}->($self);
3352     }
3353    
3354     redo A;
3355     }
3356     } elsif ($self->{state} == PUBLIC_STATE) {
3357     ## ASCII case-insensitive
3358     if ($self->{nc} == [
3359     undef,
3360     0x0055, # U
3361     0x0042, # B
3362     0x004C, # L
3363     0x0049, # I
3364 wakaba 1.12 ]->[length $self->{kwd}] or
3365 wakaba 1.1 $self->{nc} == [
3366     undef,
3367     0x0075, # u
3368     0x0062, # b
3369     0x006C, # l
3370     0x0069, # i
3371 wakaba 1.12 ]->[length $self->{kwd}]) {
3372 wakaba 1.1
3373     ## Stay in the state.
3374 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3375 wakaba 1.1
3376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377     $self->{line_prev} = $self->{line};
3378     $self->{column_prev} = $self->{column};
3379     $self->{column}++;
3380     $self->{nc}
3381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382     } else {
3383     $self->{set_nc}->($self);
3384     }
3385    
3386     redo A;
3387 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3388 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3389     $self->{nc} == 0x0063)) { # c
3390 wakaba 1.12 if ($self->{is_xml} and
3391     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392    
3393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394     text => 'PUBLIC',
3395     line => $self->{line_prev},
3396     column => $self->{column_prev} - 4);
3397     } else {
3398    
3399     }
3400 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401    
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412     redo A;
3413     } else {
3414 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 wakaba 1.1 line => $self->{line_prev},
3416 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418    
3419     $self->{ct}->{quirks} = 1;
3420     $self->{state} = BOGUS_DOCTYPE_STATE;
3421     } else {
3422    
3423     $self->{state} = BOGUS_MD_STATE;
3424     }
3425 wakaba 1.1 ## Reconsume.
3426     redo A;
3427     }
3428     } elsif ($self->{state} == SYSTEM_STATE) {
3429     ## ASCII case-insensitive
3430     if ($self->{nc} == [
3431     undef,
3432     0x0059, # Y
3433     0x0053, # S
3434     0x0054, # T
3435     0x0045, # E
3436 wakaba 1.12 ]->[length $self->{kwd}] or
3437 wakaba 1.1 $self->{nc} == [
3438     undef,
3439     0x0079, # y
3440     0x0073, # s
3441     0x0074, # t
3442     0x0065, # e
3443 wakaba 1.12 ]->[length $self->{kwd}]) {
3444 wakaba 1.1
3445     ## Stay in the state.
3446 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3447 wakaba 1.1
3448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449     $self->{line_prev} = $self->{line};
3450     $self->{column_prev} = $self->{column};
3451     $self->{column}++;
3452     $self->{nc}
3453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454     } else {
3455     $self->{set_nc}->($self);
3456     }
3457    
3458     redo A;
3459 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3460 wakaba 1.1 ($self->{nc} == 0x004D or # M
3461     $self->{nc} == 0x006D)) { # m
3462 wakaba 1.12 if ($self->{is_xml} and
3463     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464    
3465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466     text => 'SYSTEM',
3467     line => $self->{line_prev},
3468     column => $self->{column_prev} - 4);
3469     } else {
3470    
3471     }
3472 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473    
3474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475     $self->{line_prev} = $self->{line};
3476     $self->{column_prev} = $self->{column};
3477     $self->{column}++;
3478     $self->{nc}
3479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480     } else {
3481     $self->{set_nc}->($self);
3482     }
3483    
3484     redo A;
3485     } else {
3486 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 wakaba 1.1 line => $self->{line_prev},
3488 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490    
3491     $self->{ct}->{quirks} = 1;
3492     $self->{state} = BOGUS_DOCTYPE_STATE;
3493     } else {
3494    
3495     $self->{state} = BOGUS_MD_STATE;
3496     }
3497 wakaba 1.1 ## Reconsume.
3498     redo A;
3499     }
3500     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501     if ($is_space->{$self->{nc}}) {
3502    
3503     ## Stay in the state
3504    
3505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506     $self->{line_prev} = $self->{line};
3507     $self->{column_prev} = $self->{column};
3508     $self->{column}++;
3509     $self->{nc}
3510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511     } else {
3512     $self->{set_nc}->($self);
3513     }
3514    
3515     redo A;
3516     } elsif ($self->{nc} eq 0x0022) { # "
3517    
3518     $self->{ct}->{pubid} = ''; # DOCTYPE
3519     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520    
3521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522     $self->{line_prev} = $self->{line};
3523     $self->{column_prev} = $self->{column};
3524     $self->{column}++;
3525     $self->{nc}
3526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527     } else {
3528     $self->{set_nc}->($self);
3529     }
3530    
3531     redo A;
3532     } elsif ($self->{nc} eq 0x0027) { # '
3533    
3534     $self->{ct}->{pubid} = ''; # DOCTYPE
3535     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536    
3537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538     $self->{line_prev} = $self->{line};
3539     $self->{column_prev} = $self->{column};
3540     $self->{column}++;
3541     $self->{nc}
3542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543     } else {
3544     $self->{set_nc}->($self);
3545     }
3546    
3547     redo A;
3548     } elsif ($self->{nc} eq 0x003E) { # >
3549 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550    
3551     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552    
3553     $self->{state} = DATA_STATE;
3554     $self->{s_kwd} = '';
3555     $self->{ct}->{quirks} = 1;
3556     } else {
3557    
3558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559     }
3560 wakaba 1.1
3561    
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 wakaba 1.1 redo A;
3574     } elsif ($self->{nc} == -1) {
3575 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576    
3577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578     $self->{state} = DATA_STATE;
3579     $self->{s_kwd} = '';
3580     $self->{ct}->{quirks} = 1;
3581     } else {
3582    
3583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585     }
3586 wakaba 1.1
3587     ## reconsume
3588     return ($self->{ct}); # DOCTYPE
3589     redo A;
3590 wakaba 1.16 } elsif ($self->{is_xml} and
3591     $self->{ct}->{type} == DOCTYPE_TOKEN and
3592     $self->{nc} == 0x005B) { # [
3593 wakaba 1.12
3594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 wakaba 1.13 $self->{in_subset} = 1;
3598 wakaba 1.12
3599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600     $self->{line_prev} = $self->{line};
3601     $self->{column_prev} = $self->{column};
3602     $self->{column}++;
3603     $self->{nc}
3604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605     } else {
3606     $self->{set_nc}->($self);
3607     }
3608    
3609 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3610 wakaba 1.12 redo A;
3611 wakaba 1.1 } else {
3612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613    
3614 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615    
3616     $self->{ct}->{quirks} = 1;
3617     $self->{state} = BOGUS_DOCTYPE_STATE;
3618     } else {
3619    
3620     $self->{state} = BOGUS_MD_STATE;
3621     }
3622    
3623 wakaba 1.1
3624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625     $self->{line_prev} = $self->{line};
3626     $self->{column_prev} = $self->{column};
3627     $self->{column}++;
3628     $self->{nc}
3629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630     } else {
3631     $self->{set_nc}->($self);
3632     }
3633    
3634     redo A;
3635     }
3636     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637     if ($self->{nc} == 0x0022) { # "
3638    
3639     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640    
3641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642     $self->{line_prev} = $self->{line};
3643     $self->{column_prev} = $self->{column};
3644     $self->{column}++;
3645     $self->{nc}
3646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647     } else {
3648     $self->{set_nc}->($self);
3649     }
3650    
3651     redo A;
3652     } elsif ($self->{nc} == 0x003E) { # >
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654    
3655 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656    
3657     $self->{state} = DATA_STATE;
3658     $self->{s_kwd} = '';
3659     $self->{ct}->{quirks} = 1;
3660     } else {
3661    
3662     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663     }
3664    
3665 wakaba 1.1
3666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667     $self->{line_prev} = $self->{line};
3668     $self->{column_prev} = $self->{column};
3669     $self->{column}++;
3670     $self->{nc}
3671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672     } else {
3673     $self->{set_nc}->($self);
3674     }
3675    
3676 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 wakaba 1.1 redo A;
3678     } elsif ($self->{nc} == -1) {
3679     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680    
3681 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682    
3683     $self->{state} = DATA_STATE;
3684     $self->{s_kwd} = '';
3685     $self->{ct}->{quirks} = 1;
3686     } else {
3687    
3688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689     }
3690    
3691     ## Reconsume.
3692 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3693     redo A;
3694     } else {
3695    
3696 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698     length $self->{ct}->{pubid});
3699    
3700     ## Stay in the state
3701    
3702     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703     $self->{line_prev} = $self->{line};
3704     $self->{column_prev} = $self->{column};
3705     $self->{column}++;
3706     $self->{nc}
3707     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708     } else {
3709     $self->{set_nc}->($self);
3710     }
3711    
3712     redo A;
3713     }
3714     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715     if ($self->{nc} == 0x0027) { # '
3716    
3717     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718    
3719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720     $self->{line_prev} = $self->{line};
3721     $self->{column_prev} = $self->{column};
3722     $self->{column}++;
3723     $self->{nc}
3724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725     } else {
3726     $self->{set_nc}->($self);
3727     }
3728    
3729     redo A;
3730     } elsif ($self->{nc} == 0x003E) { # >
3731     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732    
3733 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734    
3735     $self->{state} = DATA_STATE;
3736     $self->{s_kwd} = '';
3737     $self->{ct}->{quirks} = 1;
3738     } else {
3739    
3740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741     }
3742    
3743 wakaba 1.1
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 wakaba 1.1 redo A;
3756     } elsif ($self->{nc} == -1) {
3757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758    
3759 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760    
3761     $self->{state} = DATA_STATE;
3762     $self->{s_kwd} = '';
3763     $self->{ct}->{quirks} = 1;
3764     } else {
3765    
3766     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767     }
3768    
3769 wakaba 1.1 ## reconsume
3770 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 wakaba 1.1 redo A;
3772     } else {
3773    
3774 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776     length $self->{ct}->{pubid});
3777    
3778     ## Stay in the state
3779    
3780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781     $self->{line_prev} = $self->{line};
3782     $self->{column_prev} = $self->{column};
3783     $self->{column}++;
3784     $self->{nc}
3785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786     } else {
3787     $self->{set_nc}->($self);
3788     }
3789    
3790     redo A;
3791     }
3792     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793     if ($is_space->{$self->{nc}}) {
3794    
3795     ## Stay in the state
3796    
3797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798     $self->{line_prev} = $self->{line};
3799     $self->{column_prev} = $self->{column};
3800     $self->{column}++;
3801     $self->{nc}
3802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803     } else {
3804     $self->{set_nc}->($self);
3805     }
3806    
3807     redo A;
3808     } elsif ($self->{nc} == 0x0022) { # "
3809    
3810 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812    
3813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814     $self->{line_prev} = $self->{line};
3815     $self->{column_prev} = $self->{column};
3816     $self->{column}++;
3817     $self->{nc}
3818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819     } else {
3820     $self->{set_nc}->($self);
3821     }
3822    
3823     redo A;
3824     } elsif ($self->{nc} == 0x0027) { # '
3825    
3826 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828    
3829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830     $self->{line_prev} = $self->{line};
3831     $self->{column_prev} = $self->{column};
3832     $self->{column}++;
3833     $self->{nc}
3834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835     } else {
3836     $self->{set_nc}->($self);
3837     }
3838    
3839     redo A;
3840     } elsif ($self->{nc} == 0x003E) { # >
3841 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842     if ($self->{is_xml}) {
3843    
3844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845     } else {
3846    
3847     }
3848     $self->{state} = DATA_STATE;
3849     $self->{s_kwd} = '';
3850 wakaba 1.12 } else {
3851 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852    
3853     } else {
3854    
3855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856     }
3857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 wakaba 1.12 }
3859 wakaba 1.16
3860 wakaba 1.1
3861     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862     $self->{line_prev} = $self->{line};
3863     $self->{column_prev} = $self->{column};
3864     $self->{column}++;
3865     $self->{nc}
3866     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867     } else {
3868     $self->{set_nc}->($self);
3869     }
3870    
3871 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 wakaba 1.1 redo A;
3873     } elsif ($self->{nc} == -1) {
3874 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875    
3876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877    
3878     $self->{state} = DATA_STATE;
3879     $self->{s_kwd} = '';
3880     $self->{ct}->{quirks} = 1;
3881     } else {
3882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884     }
3885 wakaba 1.1
3886     ## reconsume
3887 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 wakaba 1.1 redo A;
3889 wakaba 1.16 } elsif ($self->{is_xml} and
3890     $self->{ct}->{type} == DOCTYPE_TOKEN and
3891     $self->{nc} == 0x005B) { # [
3892 wakaba 1.12
3893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 wakaba 1.13 $self->{in_subset} = 1;
3897 wakaba 1.12
3898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899     $self->{line_prev} = $self->{line};
3900     $self->{column_prev} = $self->{column};
3901     $self->{column}++;
3902     $self->{nc}
3903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904     } else {
3905     $self->{set_nc}->($self);
3906     }
3907    
3908 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3909 wakaba 1.12 redo A;
3910 wakaba 1.1 } else {
3911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912    
3913 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914    
3915     $self->{ct}->{quirks} = 1;
3916     $self->{state} = BOGUS_DOCTYPE_STATE;
3917     } else {
3918    
3919     $self->{state} = BOGUS_MD_STATE;
3920     }
3921    
3922 wakaba 1.1
3923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924     $self->{line_prev} = $self->{line};
3925     $self->{column_prev} = $self->{column};
3926     $self->{column}++;
3927     $self->{nc}
3928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929     } else {
3930     $self->{set_nc}->($self);
3931     }
3932    
3933     redo A;
3934     }
3935     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936     if ($is_space->{$self->{nc}}) {
3937    
3938     ## Stay in the state
3939    
3940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941     $self->{line_prev} = $self->{line};
3942     $self->{column_prev} = $self->{column};
3943     $self->{column}++;
3944     $self->{nc}
3945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946     } else {
3947     $self->{set_nc}->($self);
3948     }
3949    
3950     redo A;
3951     } elsif ($self->{nc} == 0x0022) { # "
3952    
3953     $self->{ct}->{sysid} = ''; # DOCTYPE
3954     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955    
3956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957     $self->{line_prev} = $self->{line};
3958     $self->{column_prev} = $self->{column};
3959     $self->{column}++;
3960     $self->{nc}
3961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962     } else {
3963     $self->{set_nc}->($self);
3964     }
3965    
3966     redo A;
3967     } elsif ($self->{nc} == 0x0027) { # '
3968    
3969     $self->{ct}->{sysid} = ''; # DOCTYPE
3970     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971    
3972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973     $self->{line_prev} = $self->{line};
3974     $self->{column_prev} = $self->{column};
3975     $self->{column}++;
3976     $self->{nc}
3977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978     } else {
3979     $self->{set_nc}->($self);
3980     }
3981    
3982     redo A;
3983     } elsif ($self->{nc} == 0x003E) { # >
3984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
3986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987     $self->{line_prev} = $self->{line};
3988     $self->{column_prev} = $self->{column};
3989     $self->{column}++;
3990     $self->{nc}
3991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992     } else {
3993     $self->{set_nc}->($self);
3994     }
3995    
3996    
3997 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998    
3999     $self->{state} = DATA_STATE;
4000     $self->{s_kwd} = '';
4001     $self->{ct}->{quirks} = 1;
4002     } else {
4003    
4004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005     }
4006 wakaba 1.1
4007 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 wakaba 1.1 redo A;
4009     } elsif ($self->{nc} == -1) {
4010 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011    
4012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013     $self->{state} = DATA_STATE;
4014     $self->{s_kwd} = '';
4015     $self->{ct}->{quirks} = 1;
4016     } else {
4017    
4018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020     }
4021 wakaba 1.1
4022     ## reconsume
4023 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 wakaba 1.1 redo A;
4025 wakaba 1.16 } elsif ($self->{is_xml} and
4026     $self->{ct}->{type} == DOCTYPE_TOKEN and
4027     $self->{nc} == 0x005B) { # [
4028 wakaba 1.12
4029     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030    
4031     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 wakaba 1.13 $self->{in_subset} = 1;
4034 wakaba 1.12
4035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036     $self->{line_prev} = $self->{line};
4037     $self->{column_prev} = $self->{column};
4038     $self->{column}++;
4039     $self->{nc}
4040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041     } else {
4042     $self->{set_nc}->($self);
4043     }
4044    
4045 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4046 wakaba 1.12 redo A;
4047 wakaba 1.1 } else {
4048     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049    
4050 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051    
4052     $self->{ct}->{quirks} = 1;
4053     $self->{state} = BOGUS_DOCTYPE_STATE;
4054     } else {
4055    
4056     $self->{state} = BOGUS_MD_STATE;
4057     }
4058    
4059 wakaba 1.1
4060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061     $self->{line_prev} = $self->{line};
4062     $self->{column_prev} = $self->{column};
4063     $self->{column}++;
4064     $self->{nc}
4065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066     } else {
4067     $self->{set_nc}->($self);
4068     }
4069    
4070     redo A;
4071     }
4072     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073     if ($self->{nc} == 0x0022) { # "
4074    
4075     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076    
4077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078     $self->{line_prev} = $self->{line};
4079     $self->{column_prev} = $self->{column};
4080     $self->{column}++;
4081     $self->{nc}
4082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083     } else {
4084     $self->{set_nc}->($self);
4085     }
4086    
4087     redo A;
4088 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090    
4091 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092    
4093     $self->{state} = DATA_STATE;
4094     $self->{s_kwd} = '';
4095     $self->{ct}->{quirks} = 1;
4096     } else {
4097    
4098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099     }
4100    
4101 wakaba 1.1
4102     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103     $self->{line_prev} = $self->{line};
4104     $self->{column_prev} = $self->{column};
4105     $self->{column}++;
4106     $self->{nc}
4107     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108     } else {
4109     $self->{set_nc}->($self);
4110     }
4111    
4112 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 wakaba 1.1 redo A;
4114     } elsif ($self->{nc} == -1) {
4115     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116    
4117 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118    
4119     $self->{state} = DATA_STATE;
4120     $self->{s_kwd} = '';
4121     $self->{ct}->{quirks} = 1;
4122     } else {
4123    
4124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125     }
4126    
4127 wakaba 1.1 ## reconsume
4128 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 wakaba 1.1 redo A;
4130     } else {
4131    
4132 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134     length $self->{ct}->{sysid});
4135    
4136     ## Stay in the state
4137    
4138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139     $self->{line_prev} = $self->{line};
4140     $self->{column_prev} = $self->{column};
4141     $self->{column}++;
4142     $self->{nc}
4143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144     } else {
4145     $self->{set_nc}->($self);
4146     }
4147    
4148     redo A;
4149     }
4150     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151     if ($self->{nc} == 0x0027) { # '
4152    
4153     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154    
4155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156     $self->{line_prev} = $self->{line};
4157     $self->{column_prev} = $self->{column};
4158     $self->{column}++;
4159     $self->{nc}
4160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161     } else {
4162     $self->{set_nc}->($self);
4163     }
4164    
4165     redo A;
4166 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167 wakaba 1.1
4168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169    
4170     $self->{state} = DATA_STATE;
4171 wakaba 1.5 $self->{s_kwd} = '';
4172 wakaba 1.1
4173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174     $self->{line_prev} = $self->{line};
4175     $self->{column_prev} = $self->{column};
4176     $self->{column}++;
4177     $self->{nc}
4178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179     } else {
4180     $self->{set_nc}->($self);
4181     }
4182    
4183    
4184     $self->{ct}->{quirks} = 1;
4185     return ($self->{ct}); # DOCTYPE
4186    
4187     redo A;
4188     } elsif ($self->{nc} == -1) {
4189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190    
4191 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192    
4193     $self->{state} = DATA_STATE;
4194     $self->{s_kwd} = '';
4195     $self->{ct}->{quirks} = 1;
4196     } else {
4197    
4198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199     }
4200    
4201 wakaba 1.1 ## reconsume
4202 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 wakaba 1.1 redo A;
4204     } else {
4205    
4206 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208     length $self->{ct}->{sysid});
4209    
4210     ## Stay in the state
4211    
4212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213     $self->{line_prev} = $self->{line};
4214     $self->{column_prev} = $self->{column};
4215     $self->{column}++;
4216     $self->{nc}
4217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218     } else {
4219     $self->{set_nc}->($self);
4220     }
4221    
4222     redo A;
4223     }
4224     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225     if ($is_space->{$self->{nc}}) {
4226 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227    
4228     $self->{state} = BEFORE_NDATA_STATE;
4229     } else {
4230    
4231     ## Stay in the state
4232     }
4233 wakaba 1.1
4234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235     $self->{line_prev} = $self->{line};
4236     $self->{column_prev} = $self->{column};
4237     $self->{column}++;
4238     $self->{nc}
4239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240     } else {
4241     $self->{set_nc}->($self);
4242     }
4243    
4244     redo A;
4245     } elsif ($self->{nc} == 0x003E) { # >
4246 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247    
4248     $self->{state} = DATA_STATE;
4249     $self->{s_kwd} = '';
4250     } else {
4251    
4252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253     }
4254    
4255 wakaba 1.1
4256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257     $self->{line_prev} = $self->{line};
4258     $self->{column_prev} = $self->{column};
4259     $self->{column}++;
4260     $self->{nc}
4261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262     } else {
4263     $self->{set_nc}->($self);
4264     }
4265    
4266 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 wakaba 1.1 redo A;
4268 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269     ($self->{nc} == 0x004E or # N
4270     $self->{nc} == 0x006E)) { # n
4271    
4272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273     $self->{state} = NDATA_STATE;
4274     $self->{kwd} = chr $self->{nc};
4275    
4276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277     $self->{line_prev} = $self->{line};
4278     $self->{column_prev} = $self->{column};
4279     $self->{column}++;
4280     $self->{nc}
4281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282     } else {
4283     $self->{set_nc}->($self);
4284     }
4285    
4286     redo A;
4287 wakaba 1.1 } elsif ($self->{nc} == -1) {
4288 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289    
4290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291     $self->{state} = DATA_STATE;
4292     $self->{s_kwd} = '';
4293     $self->{ct}->{quirks} = 1;
4294     } else {
4295    
4296     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298     }
4299    
4300 wakaba 1.1 ## reconsume
4301 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 wakaba 1.1 redo A;
4303 wakaba 1.16 } elsif ($self->{is_xml} and
4304     $self->{ct}->{type} == DOCTYPE_TOKEN and
4305     $self->{nc} == 0x005B) { # [
4306 wakaba 1.12
4307     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 wakaba 1.13 $self->{in_subset} = 1;
4310 wakaba 1.12
4311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312     $self->{line_prev} = $self->{line};
4313     $self->{column_prev} = $self->{column};
4314     $self->{column}++;
4315     $self->{nc}
4316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317     } else {
4318     $self->{set_nc}->($self);
4319     }
4320    
4321 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4322 wakaba 1.12 redo A;
4323 wakaba 1.1 } else {
4324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325    
4326 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327    
4328     #$self->{ct}->{quirks} = 1;
4329     $self->{state} = BOGUS_DOCTYPE_STATE;
4330     } else {
4331    
4332     $self->{state} = BOGUS_MD_STATE;
4333     }
4334    
4335 wakaba 1.1
4336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337     $self->{line_prev} = $self->{line};
4338     $self->{column_prev} = $self->{column};
4339     $self->{column}++;
4340     $self->{nc}
4341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342     } else {
4343     $self->{set_nc}->($self);
4344     }
4345    
4346     redo A;
4347     }
4348 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349     if ($is_space->{$self->{nc}}) {
4350    
4351     ## Stay in the state.
4352    
4353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354     $self->{line_prev} = $self->{line};
4355     $self->{column_prev} = $self->{column};
4356     $self->{column}++;
4357     $self->{nc}
4358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359     } else {
4360     $self->{set_nc}->($self);
4361     }
4362    
4363     redo A;
4364     } elsif ($self->{nc} == 0x003E) { # >
4365    
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367    
4368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369     $self->{line_prev} = $self->{line};
4370     $self->{column_prev} = $self->{column};
4371     $self->{column}++;
4372     $self->{nc}
4373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374     } else {
4375     $self->{set_nc}->($self);
4376     }
4377    
4378     return ($self->{ct}); # ENTITY
4379     redo A;
4380     } elsif ($self->{nc} == 0x004E or # N
4381     $self->{nc} == 0x006E) { # n
4382    
4383     $self->{state} = NDATA_STATE;
4384     $self->{kwd} = chr $self->{nc};
4385    
4386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387     $self->{line_prev} = $self->{line};
4388     $self->{column_prev} = $self->{column};
4389     $self->{column}++;
4390     $self->{nc}
4391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392     } else {
4393     $self->{set_nc}->($self);
4394     }
4395    
4396     redo A;
4397     } elsif ($self->{nc} == -1) {
4398    
4399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401     ## reconsume
4402     return ($self->{ct}); # ENTITY
4403     redo A;
4404     } else {
4405    
4406     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407     $self->{state} = BOGUS_MD_STATE;
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     redo A;
4420     }
4421 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422     if ($self->{nc} == 0x003E) { # >
4423    
4424     $self->{state} = DATA_STATE;
4425 wakaba 1.5 $self->{s_kwd} = '';
4426 wakaba 1.1
4427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428     $self->{line_prev} = $self->{line};
4429     $self->{column_prev} = $self->{column};
4430     $self->{column}++;
4431     $self->{nc}
4432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433     } else {
4434     $self->{set_nc}->($self);
4435     }
4436    
4437    
4438     return ($self->{ct}); # DOCTYPE
4439    
4440     redo A;
4441 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442 wakaba 1.13
4443     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445     $self->{in_subset} = 1;
4446    
4447 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448     $self->{line_prev} = $self->{line};
4449     $self->{column_prev} = $self->{column};
4450     $self->{column}++;
4451     $self->{nc}
4452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453     } else {
4454     $self->{set_nc}->($self);
4455     }
4456    
4457 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4458     redo A;
4459 wakaba 1.1 } elsif ($self->{nc} == -1) {
4460    
4461     $self->{state} = DATA_STATE;
4462 wakaba 1.5 $self->{s_kwd} = '';
4463 wakaba 1.1 ## reconsume
4464    
4465     return ($self->{ct}); # DOCTYPE
4466    
4467     redo A;
4468     } else {
4469    
4470     my $s = '';
4471 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4472 wakaba 1.1
4473     ## Stay in the state
4474    
4475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476     $self->{line_prev} = $self->{line};
4477     $self->{column_prev} = $self->{column};
4478     $self->{column}++;
4479     $self->{nc}
4480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481     } else {
4482     $self->{set_nc}->($self);
4483     }
4484    
4485     redo A;
4486     }
4487     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488     ## NOTE: "CDATA section state" in the spec is jointly implemented
4489     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490     ## and |CDATA_SECTION_MSE2_STATE|.
4491 wakaba 1.10
4492     ## XML5: "CDATA state".
4493 wakaba 1.1
4494     if ($self->{nc} == 0x005D) { # ]
4495    
4496     $self->{state} = CDATA_SECTION_MSE1_STATE;
4497    
4498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499     $self->{line_prev} = $self->{line};
4500     $self->{column_prev} = $self->{column};
4501     $self->{column}++;
4502     $self->{nc}
4503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504     } else {
4505     $self->{set_nc}->($self);
4506     }
4507    
4508     redo A;
4509     } elsif ($self->{nc} == -1) {
4510 wakaba 1.6 if ($self->{is_xml}) {
4511 wakaba 1.8
4512 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 wakaba 1.8 } else {
4514    
4515 wakaba 1.6 }
4516    
4517 wakaba 1.1 $self->{state} = DATA_STATE;
4518 wakaba 1.5 $self->{s_kwd} = '';
4519 wakaba 1.10 ## Reconsume.
4520 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4521    
4522     return ($self->{ct}); # character
4523     } else {
4524    
4525     ## No token to emit. $self->{ct} is discarded.
4526     }
4527     redo A;
4528     } else {
4529    
4530     $self->{ct}->{data} .= chr $self->{nc};
4531     $self->{read_until}->($self->{ct}->{data},
4532     q<]>,
4533     length $self->{ct}->{data});
4534    
4535     ## Stay in the state.
4536    
4537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538     $self->{line_prev} = $self->{line};
4539     $self->{column_prev} = $self->{column};
4540     $self->{column}++;
4541     $self->{nc}
4542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543     } else {
4544     $self->{set_nc}->($self);
4545     }
4546    
4547     redo A;
4548     }
4549    
4550     ## ISSUE: "text tokens" in spec.
4551     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 wakaba 1.10 ## XML5: "CDATA bracket state".
4553    
4554 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4555    
4556     $self->{state} = CDATA_SECTION_MSE2_STATE;
4557    
4558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559     $self->{line_prev} = $self->{line};
4560     $self->{column_prev} = $self->{column};
4561     $self->{column}++;
4562     $self->{nc}
4563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564     } else {
4565     $self->{set_nc}->($self);
4566     }
4567    
4568     redo A;
4569     } else {
4570    
4571 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4572 wakaba 1.1 $self->{ct}->{data} .= ']';
4573 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 wakaba 1.1 ## Reconsume.
4575     redo A;
4576     }
4577     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 wakaba 1.10 ## XML5: "CDATA end state".
4579    
4580 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4581     $self->{state} = DATA_STATE;
4582 wakaba 1.5 $self->{s_kwd} = '';
4583 wakaba 1.1
4584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585     $self->{line_prev} = $self->{line};
4586     $self->{column_prev} = $self->{column};
4587     $self->{column}++;
4588     $self->{nc}
4589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590     } else {
4591     $self->{set_nc}->($self);
4592     }
4593    
4594     if (length $self->{ct}->{data}) { # character
4595    
4596     return ($self->{ct}); # character
4597     } else {
4598    
4599     ## No token to emit. $self->{ct} is discarded.
4600     }
4601     redo A;
4602     } elsif ($self->{nc} == 0x005D) { # ]
4603     # character
4604     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605     ## Stay in the state.
4606    
4607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608     $self->{line_prev} = $self->{line};
4609     $self->{column_prev} = $self->{column};
4610     $self->{column}++;
4611     $self->{nc}
4612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613     } else {
4614     $self->{set_nc}->($self);
4615     }
4616    
4617     redo A;
4618     } else {
4619    
4620     $self->{ct}->{data} .= ']]'; # character
4621     $self->{state} = CDATA_SECTION_STATE;
4622 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4623 wakaba 1.1 redo A;
4624     }
4625     } elsif ($self->{state} == ENTITY_STATE) {
4626     if ($is_space->{$self->{nc}} or
4627     {
4628     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4629     $self->{entity_add} => 1,
4630     }->{$self->{nc}}) {
4631 wakaba 1.22 if ($self->{is_xml}) {
4632    
4633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4634     line => $self->{line_prev},
4635     column => $self->{column_prev}
4636     + ($self->{nc} == -1 ? 1 : 0));
4637     } else {
4638    
4639     ## No error
4640     }
4641 wakaba 1.1 ## Don't consume
4642     ## Return nothing.
4643     #
4644     } elsif ($self->{nc} == 0x0023) { # #
4645    
4646     $self->{state} = ENTITY_HASH_STATE;
4647 wakaba 1.12 $self->{kwd} = '#';
4648 wakaba 1.1
4649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650     $self->{line_prev} = $self->{line};
4651     $self->{column_prev} = $self->{column};
4652     $self->{column}++;
4653     $self->{nc}
4654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655     } else {
4656     $self->{set_nc}->($self);
4657     }
4658    
4659     redo A;
4660 wakaba 1.22 } elsif ($self->{is_xml} or
4661     (0x0041 <= $self->{nc} and
4662 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4663     (0x0061 <= $self->{nc} and
4664     $self->{nc} <= 0x007A)) { # a..z
4665    
4666     require Whatpm::_NamedEntityList;
4667     $self->{state} = ENTITY_NAME_STATE;
4668 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4669     $self->{entity__value} = $self->{kwd};
4670 wakaba 1.1 $self->{entity__match} = 0;
4671    
4672     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673     $self->{line_prev} = $self->{line};
4674     $self->{column_prev} = $self->{column};
4675     $self->{column}++;
4676     $self->{nc}
4677     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678     } else {
4679     $self->{set_nc}->($self);
4680     }
4681    
4682     redo A;
4683     } else {
4684    
4685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4686     ## Return nothing.
4687     #
4688     }
4689    
4690     ## NOTE: No character is consumed by the "consume a character
4691     ## reference" algorithm. In other words, there is an "&" character
4692     ## that does not introduce a character reference, which would be
4693     ## appended to the parent element or the attribute value in later
4694     ## process of the tokenizer.
4695    
4696     if ($self->{prev_state} == DATA_STATE) {
4697    
4698     $self->{state} = $self->{prev_state};
4699 wakaba 1.5 $self->{s_kwd} = '';
4700 wakaba 1.1 ## Reconsume.
4701     return ({type => CHARACTER_TOKEN, data => '&',
4702     line => $self->{line_prev},
4703     column => $self->{column_prev},
4704     });
4705     redo A;
4706     } else {
4707    
4708     $self->{ca}->{value} .= '&';
4709     $self->{state} = $self->{prev_state};
4710 wakaba 1.5 $self->{s_kwd} = '';
4711 wakaba 1.1 ## Reconsume.
4712     redo A;
4713     }
4714     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4715 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4716 wakaba 1.1
4717     $self->{state} = HEXREF_X_STATE;
4718 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4719 wakaba 1.1
4720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4721     $self->{line_prev} = $self->{line};
4722     $self->{column_prev} = $self->{column};
4723     $self->{column}++;
4724     $self->{nc}
4725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4726     } else {
4727     $self->{set_nc}->($self);
4728     }
4729    
4730     redo A;
4731 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4732    
4733     if ($self->{is_xml}) {
4734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4735     }
4736     $self->{state} = HEXREF_X_STATE;
4737     $self->{kwd} .= chr $self->{nc};
4738    
4739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4740     $self->{line_prev} = $self->{line};
4741     $self->{column_prev} = $self->{column};
4742     $self->{column}++;
4743     $self->{nc}
4744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4745     } else {
4746     $self->{set_nc}->($self);
4747     }
4748    
4749     redo A;
4750 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4751     $self->{nc} <= 0x0039) { # 0..9
4752    
4753     $self->{state} = NCR_NUM_STATE;
4754 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4755 wakaba 1.1
4756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4757     $self->{line_prev} = $self->{line};
4758     $self->{column_prev} = $self->{column};
4759     $self->{column}++;
4760     $self->{nc}
4761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4762     } else {
4763     $self->{set_nc}->($self);
4764     }
4765    
4766     redo A;
4767     } else {
4768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4769     line => $self->{line_prev},
4770     column => $self->{column_prev} - 1);
4771    
4772     ## NOTE: According to the spec algorithm, nothing is returned,
4773     ## and then "&#" is appended to the parent element or the attribute
4774     ## value in the later processing.
4775    
4776     if ($self->{prev_state} == DATA_STATE) {
4777    
4778     $self->{state} = $self->{prev_state};
4779 wakaba 1.5 $self->{s_kwd} = '';
4780 wakaba 1.1 ## Reconsume.
4781     return ({type => CHARACTER_TOKEN,
4782     data => '&#',
4783     line => $self->{line_prev},
4784     column => $self->{column_prev} - 1,
4785     });
4786     redo A;
4787     } else {
4788    
4789     $self->{ca}->{value} .= '&#';
4790     $self->{state} = $self->{prev_state};
4791 wakaba 1.5 $self->{s_kwd} = '';
4792 wakaba 1.1 ## Reconsume.
4793     redo A;
4794     }
4795     }
4796     } elsif ($self->{state} == NCR_NUM_STATE) {
4797     if (0x0030 <= $self->{nc} and
4798     $self->{nc} <= 0x0039) { # 0..9
4799    
4800 wakaba 1.12 $self->{kwd} *= 10;
4801     $self->{kwd} += $self->{nc} - 0x0030;
4802 wakaba 1.1
4803     ## Stay in the state.
4804    
4805     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806     $self->{line_prev} = $self->{line};
4807     $self->{column_prev} = $self->{column};
4808     $self->{column}++;
4809     $self->{nc}
4810     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811     } else {
4812     $self->{set_nc}->($self);
4813     }
4814    
4815     redo A;
4816     } elsif ($self->{nc} == 0x003B) { # ;
4817    
4818    
4819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4820     $self->{line_prev} = $self->{line};
4821     $self->{column_prev} = $self->{column};
4822     $self->{column}++;
4823     $self->{nc}
4824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4825     } else {
4826     $self->{set_nc}->($self);
4827     }
4828    
4829     #
4830     } else {
4831    
4832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4833     ## Reconsume.
4834     #
4835     }
4836    
4837 wakaba 1.12 my $code = $self->{kwd};
4838 wakaba 1.1 my $l = $self->{line_prev};
4839     my $c = $self->{column_prev};
4840     if ($charref_map->{$code}) {
4841    
4842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4843     text => (sprintf 'U+%04X', $code),
4844     line => $l, column => $c);
4845     $code = $charref_map->{$code};
4846     } elsif ($code > 0x10FFFF) {
4847    
4848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4849     text => (sprintf 'U-%08X', $code),
4850     line => $l, column => $c);
4851     $code = 0xFFFD;
4852     }
4853    
4854     if ($self->{prev_state} == DATA_STATE) {
4855    
4856     $self->{state} = $self->{prev_state};
4857 wakaba 1.5 $self->{s_kwd} = '';
4858 wakaba 1.1 ## Reconsume.
4859     return ({type => CHARACTER_TOKEN, data => chr $code,
4860 wakaba 1.7 has_reference => 1,
4861 wakaba 1.1 line => $l, column => $c,
4862     });
4863     redo A;
4864     } else {
4865    
4866     $self->{ca}->{value} .= chr $code;
4867     $self->{ca}->{has_reference} = 1;
4868     $self->{state} = $self->{prev_state};
4869 wakaba 1.5 $self->{s_kwd} = '';
4870 wakaba 1.1 ## Reconsume.
4871     redo A;
4872     }
4873     } elsif ($self->{state} == HEXREF_X_STATE) {
4874     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4875     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4876     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4877     # 0..9, A..F, a..f
4878    
4879     $self->{state} = HEXREF_HEX_STATE;
4880 wakaba 1.12 $self->{kwd} = 0;
4881 wakaba 1.1 ## Reconsume.
4882     redo A;
4883     } else {
4884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4885     line => $self->{line_prev},
4886     column => $self->{column_prev} - 2);
4887    
4888     ## NOTE: According to the spec algorithm, nothing is returned,
4889     ## and then "&#" followed by "X" or "x" is appended to the parent
4890     ## element or the attribute value in the later processing.
4891    
4892     if ($self->{prev_state} == DATA_STATE) {
4893    
4894     $self->{state} = $self->{prev_state};
4895 wakaba 1.5 $self->{s_kwd} = '';
4896 wakaba 1.1 ## Reconsume.
4897     return ({type => CHARACTER_TOKEN,
4898 wakaba 1.12 data => '&' . $self->{kwd},
4899 wakaba 1.1 line => $self->{line_prev},
4900 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4901 wakaba 1.1 });
4902     redo A;
4903     } else {
4904    
4905 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4906 wakaba 1.1 $self->{state} = $self->{prev_state};
4907 wakaba 1.5 $self->{s_kwd} = '';
4908 wakaba 1.1 ## Reconsume.
4909     redo A;
4910     }
4911     }
4912     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4913     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4914     # 0..9
4915    
4916 wakaba 1.12 $self->{kwd} *= 0x10;
4917     $self->{kwd} += $self->{nc} - 0x0030;
4918 wakaba 1.1 ## Stay in the state.
4919    
4920     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4921     $self->{line_prev} = $self->{line};
4922     $self->{column_prev} = $self->{column};
4923     $self->{column}++;
4924     $self->{nc}
4925     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4926     } else {
4927     $self->{set_nc}->($self);
4928     }
4929    
4930     redo A;
4931     } elsif (0x0061 <= $self->{nc} and
4932     $self->{nc} <= 0x0066) { # a..f
4933    
4934 wakaba 1.12 $self->{kwd} *= 0x10;
4935     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4936 wakaba 1.1 ## Stay in the state.
4937    
4938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4939     $self->{line_prev} = $self->{line};
4940     $self->{column_prev} = $self->{column};
4941     $self->{column}++;
4942     $self->{nc}
4943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4944     } else {
4945     $self->{set_nc}->($self);
4946     }
4947    
4948     redo A;
4949     } elsif (0x0041 <= $self->{nc} and
4950     $self->{nc} <= 0x0046) { # A..F
4951    
4952 wakaba 1.12 $self->{kwd} *= 0x10;
4953     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4954 wakaba 1.1 ## Stay in the state.
4955    
4956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957     $self->{line_prev} = $self->{line};
4958     $self->{column_prev} = $self->{column};
4959     $self->{column}++;
4960     $self->{nc}
4961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962     } else {
4963     $self->{set_nc}->($self);
4964     }
4965    
4966     redo A;
4967     } elsif ($self->{nc} == 0x003B) { # ;
4968    
4969    
4970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4971     $self->{line_prev} = $self->{line};
4972     $self->{column_prev} = $self->{column};
4973     $self->{column}++;
4974     $self->{nc}
4975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4976     } else {
4977     $self->{set_nc}->($self);
4978     }
4979    
4980     #
4981     } else {
4982    
4983     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4984     line => $self->{line},
4985     column => $self->{column});
4986     ## Reconsume.
4987     #
4988     }
4989    
4990 wakaba 1.12 my $code = $self->{kwd};
4991 wakaba 1.1 my $l = $self->{line_prev};
4992     my $c = $self->{column_prev};
4993     if ($charref_map->{$code}) {
4994    
4995     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4996     text => (sprintf 'U+%04X', $code),
4997     line => $l, column => $c);
4998     $code = $charref_map->{$code};
4999     } elsif ($code > 0x10FFFF) {
5000    
5001     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5002     text => (sprintf 'U-%08X', $code),
5003     line => $l, column => $c);
5004     $code = 0xFFFD;
5005     }
5006    
5007     if ($self->{prev_state} == DATA_STATE) {
5008    
5009     $self->{state} = $self->{prev_state};
5010 wakaba 1.5 $self->{s_kwd} = '';
5011 wakaba 1.1 ## Reconsume.
5012     return ({type => CHARACTER_TOKEN, data => chr $code,
5013 wakaba 1.7 has_reference => 1,
5014 wakaba 1.1 line => $l, column => $c,
5015     });
5016     redo A;
5017     } else {
5018    
5019     $self->{ca}->{value} .= chr $code;
5020     $self->{ca}->{has_reference} = 1;
5021     $self->{state} = $self->{prev_state};
5022 wakaba 1.5 $self->{s_kwd} = '';
5023 wakaba 1.1 ## Reconsume.
5024     redo A;
5025     }
5026     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5027 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5028     $self->{nc} <= 0x005A) or # Z
5029     (0x0061 <= $self->{nc} and # a
5030     $self->{nc} <= 0x007A) or # z
5031     (0x0030 <= $self->{nc} and # 0
5032     $self->{nc} <= 0x0039) or # 9
5033 wakaba 1.22 $self->{nc} == 0x003B or # ;
5034     ($self->{is_xml} and
5035     not ($is_space->{$self->{nc}} or
5036     {
5037     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5038     $self->{entity_add} => 1,
5039     }->{$self->{nc}}))) {
5040 wakaba 1.1 our $EntityChar;
5041 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5042 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5043     $self->{ge}->{$self->{kwd}}) {
5044 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5045 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5046     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5047    
5048     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5049     } else {
5050     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5051    
5052     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5053     value => $self->{kwd});
5054     } else {
5055    
5056     }
5057     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5058     }
5059     } else {
5060     if ($self->{is_xml}) {
5061    
5062     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5063     value => $self->{kwd},
5064     level => {
5065     'amp;' => $self->{level}->{warn},
5066     'quot;' => $self->{level}->{warn},
5067     'lt;' => $self->{level}->{warn},
5068     'gt;' => $self->{level}->{warn},
5069     'apos;' => $self->{level}->{warn},
5070     }->{$self->{kwd}} ||
5071     $self->{level}->{must});
5072     } else {
5073    
5074     }
5075     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5076     }
5077 wakaba 1.1 $self->{entity__match} = 1;
5078    
5079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5080     $self->{line_prev} = $self->{line};
5081     $self->{column_prev} = $self->{column};
5082     $self->{column}++;
5083     $self->{nc}
5084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5085     } else {
5086     $self->{set_nc}->($self);
5087     }
5088    
5089     #
5090     } else {
5091    
5092 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5093 wakaba 1.1 $self->{entity__match} = -1;
5094     ## Stay in the state.
5095    
5096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5097     $self->{line_prev} = $self->{line};
5098     $self->{column_prev} = $self->{column};
5099     $self->{column}++;
5100     $self->{nc}
5101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5102     } else {
5103     $self->{set_nc}->($self);
5104     }
5105    
5106     redo A;
5107     }
5108     } else {
5109    
5110     $self->{entity__value} .= chr $self->{nc};
5111     $self->{entity__match} *= 2;
5112     ## Stay in the state.
5113    
5114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5115     $self->{line_prev} = $self->{line};
5116     $self->{column_prev} = $self->{column};
5117     $self->{column}++;
5118     $self->{nc}
5119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5120     } else {
5121     $self->{set_nc}->($self);
5122     }
5123    
5124     redo A;
5125     }
5126     }
5127    
5128     my $data;
5129     my $has_ref;
5130     if ($self->{entity__match} > 0) {
5131    
5132     $data = $self->{entity__value};
5133     $has_ref = 1;
5134     #
5135     } elsif ($self->{entity__match} < 0) {
5136     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5137     if ($self->{prev_state} != DATA_STATE and # in attribute
5138     $self->{entity__match} < -1) {
5139    
5140 wakaba 1.12 $data = '&' . $self->{kwd};
5141 wakaba 1.1 #
5142     } else {
5143    
5144     $data = $self->{entity__value};
5145     $has_ref = 1;
5146     #
5147     }
5148     } else {
5149    
5150     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5151     line => $self->{line_prev},
5152 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5153     $data = '&' . $self->{kwd};
5154 wakaba 1.1 #
5155     }
5156    
5157     ## NOTE: In these cases, when a character reference is found,
5158     ## it is consumed and a character token is returned, or, otherwise,
5159     ## nothing is consumed and returned, according to the spec algorithm.
5160     ## In this implementation, anything that has been examined by the
5161     ## tokenizer is appended to the parent element or the attribute value
5162     ## as string, either literal string when no character reference or
5163     ## entity-replaced string otherwise, in this stage, since any characters
5164     ## that would not be consumed are appended in the data state or in an
5165     ## appropriate attribute value state anyway.
5166    
5167     if ($self->{prev_state} == DATA_STATE) {
5168    
5169     $self->{state} = $self->{prev_state};
5170 wakaba 1.5 $self->{s_kwd} = '';
5171 wakaba 1.1 ## Reconsume.
5172     return ({type => CHARACTER_TOKEN,
5173     data => $data,
5174 wakaba 1.7 has_reference => $has_ref,
5175 wakaba 1.1 line => $self->{line_prev},
5176 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5177 wakaba 1.1 });
5178     redo A;
5179     } else {
5180    
5181     $self->{ca}->{value} .= $data;
5182     $self->{ca}->{has_reference} = 1 if $has_ref;
5183     $self->{state} = $self->{prev_state};
5184 wakaba 1.5 $self->{s_kwd} = '';
5185 wakaba 1.1 ## Reconsume.
5186     redo A;
5187     }
5188 wakaba 1.8
5189     ## XML-only states
5190    
5191     } elsif ($self->{state} == PI_STATE) {
5192 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5193    
5194 wakaba 1.8 if ($is_space->{$self->{nc}} or
5195 wakaba 1.14 $self->{nc} == 0x003F or # ?
5196 wakaba 1.8 $self->{nc} == -1) {
5197 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5198     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5199     ## "DOCTYPE pi state": Parse error, switch to the "data
5200     ## state".
5201 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5202     line => $self->{line_prev},
5203     column => $self->{column_prev}
5204     - 1 * ($self->{nc} != -1));
5205     $self->{state} = BOGUS_COMMENT_STATE;
5206     ## Reconsume.
5207     $self->{ct} = {type => COMMENT_TOKEN,
5208     data => '?',
5209     line => $self->{line_prev},
5210     column => $self->{column_prev}
5211     - 1 * ($self->{nc} != -1),
5212     };
5213     redo A;
5214     } else {
5215 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5216 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5217     target => chr $self->{nc},
5218     data => '',
5219     line => $self->{line_prev},
5220     column => $self->{column_prev} - 1,
5221     };
5222     $self->{state} = PI_TARGET_STATE;
5223    
5224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5225     $self->{line_prev} = $self->{line};
5226     $self->{column_prev} = $self->{column};
5227     $self->{column}++;
5228     $self->{nc}
5229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5230     } else {
5231     $self->{set_nc}->($self);
5232     }
5233    
5234     redo A;
5235     }
5236     } elsif ($self->{state} == PI_TARGET_STATE) {
5237     if ($is_space->{$self->{nc}}) {
5238     $self->{state} = PI_TARGET_AFTER_STATE;
5239    
5240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5241     $self->{line_prev} = $self->{line};
5242     $self->{column_prev} = $self->{column};
5243     $self->{column}++;
5244     $self->{nc}
5245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5246     } else {
5247     $self->{set_nc}->($self);
5248     }
5249    
5250     redo A;
5251     } elsif ($self->{nc} == -1) {
5252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5253 wakaba 1.13 if ($self->{in_subset}) {
5254     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5255     } else {
5256     $self->{state} = DATA_STATE;
5257     $self->{s_kwd} = '';
5258     }
5259 wakaba 1.8 ## Reconsume.
5260     return ($self->{ct}); # pi
5261     redo A;
5262     } elsif ($self->{nc} == 0x003F) { # ?
5263     $self->{state} = PI_AFTER_STATE;
5264    
5265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5266     $self->{line_prev} = $self->{line};
5267     $self->{column_prev} = $self->{column};
5268     $self->{column}++;
5269     $self->{nc}
5270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5271     } else {
5272     $self->{set_nc}->($self);
5273     }
5274    
5275     redo A;
5276     } else {
5277     ## XML5: typo ("tag name" -> "target")
5278     $self->{ct}->{target} .= chr $self->{nc}; # pi
5279    
5280     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281     $self->{line_prev} = $self->{line};
5282     $self->{column_prev} = $self->{column};
5283     $self->{column}++;
5284     $self->{nc}
5285     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286     } else {
5287     $self->{set_nc}->($self);
5288     }
5289    
5290     redo A;
5291     }
5292     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5293     if ($is_space->{$self->{nc}}) {
5294     ## Stay in the state.
5295    
5296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5297     $self->{line_prev} = $self->{line};
5298     $self->{column_prev} = $self->{column};
5299     $self->{column}++;
5300     $self->{nc}
5301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5302     } else {
5303     $self->{set_nc}->($self);
5304     }
5305    
5306     redo A;
5307     } else {
5308     $self->{state} = PI_DATA_STATE;
5309     ## Reprocess.
5310     redo A;
5311     }
5312     } elsif ($self->{state} == PI_DATA_STATE) {
5313     if ($self->{nc} == 0x003F) { # ?
5314     $self->{state} = PI_DATA_AFTER_STATE;
5315    
5316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5317     $self->{line_prev} = $self->{line};
5318     $self->{column_prev} = $self->{column};
5319     $self->{column}++;
5320     $self->{nc}
5321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5322     } else {
5323     $self->{set_nc}->($self);
5324     }
5325    
5326     redo A;
5327     } elsif ($self->{nc} == -1) {
5328     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5329 wakaba 1.13 if ($self->{in_subset}) {
5330 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5331 wakaba 1.13 } else {
5332     $self->{state} = DATA_STATE;
5333     $self->{s_kwd} = '';
5334     }
5335 wakaba 1.8 ## Reprocess.
5336     return ($self->{ct}); # pi
5337     redo A;
5338     } else {
5339     $self->{ct}->{data} .= chr $self->{nc}; # pi
5340     $self->{read_until}->($self->{ct}->{data}, q[?],
5341     length $self->{ct}->{data});
5342     ## Stay in the state.
5343    
5344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5345     $self->{line_prev} = $self->{line};
5346     $self->{column_prev} = $self->{column};
5347     $self->{column}++;
5348     $self->{nc}
5349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5350     } else {
5351     $self->{set_nc}->($self);
5352     }
5353    
5354     ## Reprocess.
5355     redo A;
5356     }
5357     } elsif ($self->{state} == PI_AFTER_STATE) {
5358 wakaba 1.14 ## XML5: Part of "Pi after state".
5359    
5360 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5361 wakaba 1.13 if ($self->{in_subset}) {
5362     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5363     } else {
5364     $self->{state} = DATA_STATE;
5365     $self->{s_kwd} = '';
5366     }
5367 wakaba 1.8
5368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5369     $self->{line_prev} = $self->{line};
5370     $self->{column_prev} = $self->{column};
5371     $self->{column}++;
5372     $self->{nc}
5373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5374     } else {
5375     $self->{set_nc}->($self);
5376     }
5377    
5378     return ($self->{ct}); # pi
5379     redo A;
5380     } elsif ($self->{nc} == 0x003F) { # ?
5381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5382     line => $self->{line_prev},
5383     column => $self->{column_prev}); ## XML5: no error
5384     $self->{ct}->{data} .= '?';
5385     $self->{state} = PI_DATA_AFTER_STATE;
5386    
5387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5388     $self->{line_prev} = $self->{line};
5389     $self->{column_prev} = $self->{column};
5390     $self->{column}++;
5391     $self->{nc}
5392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5393     } else {
5394     $self->{set_nc}->($self);
5395     }
5396    
5397     redo A;
5398     } else {
5399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5400     line => $self->{line_prev},
5401     column => $self->{column_prev}
5402     + 1 * ($self->{nc} == -1)); ## XML5: no error
5403     $self->{ct}->{data} .= '?'; ## XML5: not appended
5404     $self->{state} = PI_DATA_STATE;
5405     ## Reprocess.
5406     redo A;
5407     }
5408     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5409 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5410    
5411 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5412 wakaba 1.13 if ($self->{in_subset}) {
5413     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5414     } else {
5415     $self->{state} = DATA_STATE;
5416     $self->{s_kwd} = '';
5417     }
5418 wakaba 1.8
5419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5420     $self->{line_prev} = $self->{line};
5421     $self->{column_prev} = $self->{column};
5422     $self->{column}++;
5423     $self->{nc}
5424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5425     } else {
5426     $self->{set_nc}->($self);
5427     }
5428    
5429     return ($self->{ct}); # pi
5430     redo A;
5431     } elsif ($self->{nc} == 0x003F) { # ?
5432     $self->{ct}->{data} .= '?';
5433     ## Stay in the state.
5434    
5435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436     $self->{line_prev} = $self->{line};
5437     $self->{column_prev} = $self->{column};
5438     $self->{column}++;
5439     $self->{nc}
5440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441     } else {
5442     $self->{set_nc}->($self);
5443     }
5444    
5445     redo A;
5446     } else {
5447     $self->{ct}->{data} .= '?'; ## XML5: not appended
5448     $self->{state} = PI_DATA_STATE;
5449     ## Reprocess.
5450     redo A;
5451     }
5452 wakaba 1.12
5453     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5454     if ($self->{nc} == 0x003C) { # <
5455 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5456 wakaba 1.12
5457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458     $self->{line_prev} = $self->{line};
5459     $self->{column_prev} = $self->{column};
5460     $self->{column}++;
5461     $self->{nc}
5462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463     } else {
5464     $self->{set_nc}->($self);
5465     }
5466    
5467     redo A;
5468     } elsif ($self->{nc} == 0x0025) { # %
5469     ## XML5: Not defined yet.
5470    
5471     ## TODO:
5472 wakaba 1.24
5473     if (not $self->{stop_processing} and
5474     not $self->{document}->xml_standalone) {
5475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5476     level => $self->{level}->{info});
5477     $self->{stop_processing} = 1;
5478     }
5479    
5480 wakaba 1.12
5481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5482     $self->{line_prev} = $self->{line};
5483     $self->{column_prev} = $self->{column};
5484     $self->{column}++;
5485     $self->{nc}
5486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5487     } else {
5488     $self->{set_nc}->($self);
5489     }
5490    
5491     redo A;
5492     } elsif ($self->{nc} == 0x005D) { # ]
5493 wakaba 1.13 delete $self->{in_subset};
5494 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5495    
5496     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5497     $self->{line_prev} = $self->{line};
5498     $self->{column_prev} = $self->{column};
5499     $self->{column}++;
5500     $self->{nc}
5501     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5502     } else {
5503     $self->{set_nc}->($self);
5504     }
5505    
5506     redo A;
5507     } elsif ($is_space->{$self->{nc}}) {
5508     ## Stay in the state.
5509    
5510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5511     $self->{line_prev} = $self->{line};
5512     $self->{column_prev} = $self->{column};
5513     $self->{column}++;
5514     $self->{nc}
5515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5516     } else {
5517     $self->{set_nc}->($self);
5518     }
5519    
5520     redo A;
5521     } elsif ($self->{nc} == -1) {
5522     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5523 wakaba 1.13 delete $self->{in_subset};
5524 wakaba 1.12 $self->{state} = DATA_STATE;
5525     $self->{s_kwd} = '';
5526     ## Reconsume.
5527 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5528 wakaba 1.12 redo A;
5529     } else {
5530     unless ($self->{internal_subset_tainted}) {
5531     ## XML5: No parse error.
5532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5533     $self->{internal_subset_tainted} = 1;
5534     }
5535     ## Stay in the state.
5536    
5537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5538     $self->{line_prev} = $self->{line};
5539     $self->{column_prev} = $self->{column};
5540     $self->{column}++;
5541     $self->{nc}
5542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5543     } else {
5544     $self->{set_nc}->($self);
5545     }
5546    
5547     redo A;
5548     }
5549     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5550     if ($self->{nc} == 0x003E) { # >
5551     $self->{state} = DATA_STATE;
5552     $self->{s_kwd} = '';
5553    
5554     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5555     $self->{line_prev} = $self->{line};
5556     $self->{column_prev} = $self->{column};
5557     $self->{column}++;
5558     $self->{nc}
5559     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5560     } else {
5561     $self->{set_nc}->($self);
5562     }
5563    
5564 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5565 wakaba 1.12 redo A;
5566     } elsif ($self->{nc} == -1) {
5567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5568     $self->{state} = DATA_STATE;
5569     $self->{s_kwd} = '';
5570     ## Reconsume.
5571 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5572 wakaba 1.12 redo A;
5573     } else {
5574     ## XML5: No parse error and stay in the state.
5575     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5576    
5577 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5578    
5579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5580     $self->{line_prev} = $self->{line};
5581     $self->{column_prev} = $self->{column};
5582     $self->{column}++;
5583     $self->{nc}
5584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5585     } else {
5586     $self->{set_nc}->($self);
5587     }
5588    
5589     redo A;
5590     }
5591     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5592     if ($self->{nc} == 0x003E) { # >
5593     $self->{state} = DATA_STATE;
5594     $self->{s_kwd} = '';
5595    
5596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5597     $self->{line_prev} = $self->{line};
5598     $self->{column_prev} = $self->{column};
5599     $self->{column}++;
5600     $self->{nc}
5601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5602     } else {
5603     $self->{set_nc}->($self);
5604     }
5605    
5606     return ({type => END_OF_DOCTYPE_TOKEN});
5607     redo A;
5608     } elsif ($self->{nc} == -1) {
5609     $self->{state} = DATA_STATE;
5610     $self->{s_kwd} = '';
5611     ## Reconsume.
5612     return ({type => END_OF_DOCTYPE_TOKEN});
5613     redo A;
5614     } else {
5615     ## Stay in the state.
5616    
5617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5618     $self->{line_prev} = $self->{line};
5619     $self->{column_prev} = $self->{column};
5620     $self->{column}++;
5621     $self->{nc}
5622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5623     } else {
5624     $self->{set_nc}->($self);
5625     }
5626    
5627     redo A;
5628     }
5629     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5630     if ($self->{nc} == 0x0021) { # !
5631 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5632 wakaba 1.13
5633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634     $self->{line_prev} = $self->{line};
5635     $self->{column_prev} = $self->{column};
5636     $self->{column}++;
5637     $self->{nc}
5638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639     } else {
5640     $self->{set_nc}->($self);
5641     }
5642    
5643     redo A;
5644     } elsif ($self->{nc} == 0x003F) { # ?
5645     $self->{state} = PI_STATE;
5646    
5647     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5648     $self->{line_prev} = $self->{line};
5649     $self->{column_prev} = $self->{column};
5650     $self->{column}++;
5651     $self->{nc}
5652     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5653     } else {
5654     $self->{set_nc}->($self);
5655     }
5656    
5657     redo A;
5658     } elsif ($self->{nc} == -1) {
5659     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5660     $self->{state} = DATA_STATE;
5661     $self->{s_kwd} = '';
5662     ## Reconsume.
5663     redo A;
5664     } else {
5665     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5666     line => $self->{line_prev},
5667     column => $self->{column_prev});
5668     $self->{state} = BOGUS_COMMENT_STATE;
5669     $self->{ct} = {type => COMMENT_TOKEN,
5670     data => '',
5671     }; ## NOTE: Will be discarded.
5672 wakaba 1.12
5673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5674     $self->{line_prev} = $self->{line};
5675     $self->{column_prev} = $self->{column};
5676     $self->{column}++;
5677     $self->{nc}
5678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5679     } else {
5680     $self->{set_nc}->($self);
5681     }
5682    
5683     redo A;
5684     }
5685 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5686     ## XML5: "DOCTYPE markup declaration state".
5687    
5688     if ($self->{nc} == 0x002D) { # -
5689     $self->{state} = MD_HYPHEN_STATE;
5690    
5691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5692     $self->{line_prev} = $self->{line};
5693     $self->{column_prev} = $self->{column};
5694     $self->{column}++;
5695     $self->{nc}
5696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5697     } else {
5698     $self->{set_nc}->($self);
5699     }
5700    
5701     redo A;
5702 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5703     $self->{nc} == 0x0065) { # e
5704 wakaba 1.14 $self->{state} = MD_E_STATE;
5705     $self->{kwd} = chr $self->{nc};
5706    
5707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5708     $self->{line_prev} = $self->{line};
5709     $self->{column_prev} = $self->{column};
5710     $self->{column}++;
5711     $self->{nc}
5712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5713     } else {
5714     $self->{set_nc}->($self);
5715     }
5716    
5717     redo A;
5718 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5719     $self->{nc} == 0x0061) { # a
5720 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5721     $self->{kwd} = chr $self->{nc};
5722    
5723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5724     $self->{line_prev} = $self->{line};
5725     $self->{column_prev} = $self->{column};
5726     $self->{column}++;
5727     $self->{nc}
5728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5729     } else {
5730     $self->{set_nc}->($self);
5731     }
5732    
5733     redo A;
5734 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5735     $self->{nc} == 0x006E) { # n
5736 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5737     $self->{kwd} = chr $self->{nc};
5738    
5739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5740     $self->{line_prev} = $self->{line};
5741     $self->{column_prev} = $self->{column};
5742     $self->{column}++;
5743     $self->{nc}
5744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5745     } else {
5746     $self->{set_nc}->($self);
5747     }
5748    
5749     redo A;
5750     } else {
5751     #
5752     }
5753    
5754     ## XML5: No parse error.
5755     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5756     line => $self->{line_prev},
5757     column => $self->{column_prev} - 1);
5758     ## Reconsume.
5759     $self->{state} = BOGUS_COMMENT_STATE;
5760     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5761     redo A;
5762     } elsif ($self->{state} == MD_E_STATE) {
5763 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5764     $self->{nc} == 0x006E) { # n
5765 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5766     $self->{kwd} .= chr $self->{nc};
5767    
5768     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769     $self->{line_prev} = $self->{line};
5770     $self->{column_prev} = $self->{column};
5771     $self->{column}++;
5772     $self->{nc}
5773     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774     } else {
5775     $self->{set_nc}->($self);
5776     }
5777    
5778     redo A;
5779 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5780     $self->{nc} == 0x006C) { # l
5781 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5782     $self->{state} = MD_ELEMENT_STATE;
5783     $self->{kwd} .= chr $self->{nc};
5784    
5785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5786     $self->{line_prev} = $self->{line};
5787     $self->{column_prev} = $self->{column};
5788     $self->{column}++;
5789     $self->{nc}
5790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5791     } else {
5792     $self->{set_nc}->($self);
5793     }
5794    
5795     redo A;
5796     } else {
5797     ## XML5: No parse error.
5798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5799     line => $self->{line_prev},
5800     column => $self->{column_prev} - 2
5801     + 1 * ($self->{nc} == -1));
5802     ## Reconsume.
5803     $self->{state} = BOGUS_COMMENT_STATE;
5804     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5805     redo A;
5806     }
5807     } elsif ($self->{state} == MD_ENTITY_STATE) {
5808 wakaba 1.17 if ($self->{nc} == [
5809     undef,
5810     undef,
5811     0x0054, # T
5812     0x0049, # I
5813     0x0054, # T
5814     ]->[length $self->{kwd}] or
5815     $self->{nc} == [
5816     undef,
5817     undef,
5818     0x0074, # t
5819     0x0069, # i
5820     0x0074, # t
5821     ]->[length $self->{kwd}]) {
5822 wakaba 1.14 ## Stay in the state.
5823     $self->{kwd} .= chr $self->{nc};
5824    
5825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5826     $self->{line_prev} = $self->{line};
5827     $self->{column_prev} = $self->{column};
5828     $self->{column}++;
5829     $self->{nc}
5830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5831     } else {
5832     $self->{set_nc}->($self);
5833     }
5834    
5835     redo A;
5836 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5837     ($self->{nc} == 0x0059 or # Y
5838     $self->{nc} == 0x0079)) { # y
5839     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5840     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5841     text => 'ENTITY',
5842     line => $self->{line_prev},
5843     column => $self->{column_prev} - 4);
5844     }
5845     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5846 wakaba 1.14 line => $self->{line_prev},
5847     column => $self->{column_prev} - 6};
5848     $self->{state} = DOCTYPE_MD_STATE;
5849    
5850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5851     $self->{line_prev} = $self->{line};
5852     $self->{column_prev} = $self->{column};
5853     $self->{column}++;
5854     $self->{nc}
5855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5856     } else {
5857     $self->{set_nc}->($self);
5858     }
5859    
5860     redo A;
5861     } else {
5862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5863     line => $self->{line_prev},
5864     column => $self->{column_prev} - 1
5865     - (length $self->{kwd})
5866     + 1 * ($self->{nc} == -1));
5867     $self->{state} = BOGUS_COMMENT_STATE;
5868     ## Reconsume.
5869     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5870     redo A;
5871     }
5872     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5873 wakaba 1.17 if ($self->{nc} == [
5874     undef,
5875     undef,
5876     0x0045, # E
5877     0x004D, # M
5878     0x0045, # E
5879     0x004E, # N
5880     ]->[length $self->{kwd}] or
5881     $self->{nc} == [
5882     undef,
5883     undef,
5884     0x0065, # e
5885     0x006D, # m
5886     0x0065, # e
5887     0x006E, # n
5888     ]->[length $self->{kwd}]) {
5889 wakaba 1.14 ## Stay in the state.
5890     $self->{kwd} .= chr $self->{nc};
5891    
5892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5893     $self->{line_prev} = $self->{line};
5894     $self->{column_prev} = $self->{column};
5895     $self->{column}++;
5896     $self->{nc}
5897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5898     } else {
5899     $self->{set_nc}->($self);
5900     }
5901    
5902     redo A;
5903 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5904     ($self->{nc} == 0x0054 or # T
5905     $self->{nc} == 0x0074)) { # t
5906     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5908     text => 'ELEMENT',
5909     line => $self->{line_prev},
5910     column => $self->{column_prev} - 5);
5911     }
5912 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5913     line => $self->{line_prev},
5914 wakaba 1.23 column => $self->{column_prev} - 7};
5915 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5916    
5917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5918     $self->{line_prev} = $self->{line};
5919     $self->{column_prev} = $self->{column};
5920     $self->{column}++;
5921     $self->{nc}
5922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5923     } else {
5924     $self->{set_nc}->($self);
5925     }
5926    
5927     redo A;
5928     } else {
5929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5930     line => $self->{line_prev},
5931     column => $self->{column_prev} - 1
5932     - (length $self->{kwd})
5933     + 1 * ($self->{nc} == -1));
5934     $self->{state} = BOGUS_COMMENT_STATE;
5935     ## Reconsume.
5936     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5937     redo A;
5938     }
5939     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5940 wakaba 1.17 if ($self->{nc} == [
5941     undef,
5942     0x0054, # T
5943     0x0054, # T
5944     0x004C, # L
5945     0x0049, # I
5946     0x0053, # S
5947     ]->[length $self->{kwd}] or
5948     $self->{nc} == [
5949     undef,
5950     0x0074, # t
5951     0x0074, # t
5952     0x006C, # l
5953     0x0069, # i
5954     0x0073, # s
5955     ]->[length $self->{kwd}]) {
5956 wakaba 1.14 ## Stay in the state.
5957     $self->{kwd} .= chr $self->{nc};
5958    
5959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5960     $self->{line_prev} = $self->{line};
5961     $self->{column_prev} = $self->{column};
5962     $self->{column}++;
5963     $self->{nc}
5964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5965     } else {
5966     $self->{set_nc}->($self);
5967     }
5968    
5969     redo A;
5970 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5971     ($self->{nc} == 0x0054 or # T
5972     $self->{nc} == 0x0074)) { # t
5973     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5975     text => 'ATTLIST',
5976     line => $self->{line_prev},
5977     column => $self->{column_prev} - 5);
5978     }
5979 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5980 wakaba 1.15 attrdefs => [],
5981 wakaba 1.14 line => $self->{line_prev},
5982 wakaba 1.23 column => $self->{column_prev} - 7};
5983 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5984    
5985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5986     $self->{line_prev} = $self->{line};
5987     $self->{column_prev} = $self->{column};
5988     $self->{column}++;
5989     $self->{nc}
5990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5991     } else {
5992     $self->{set_nc}->($self);
5993     }
5994    
5995     redo A;
5996     } else {
5997     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5998     line => $self->{line_prev},
5999     column => $self->{column_prev} - 1
6000     - (length $self->{kwd})
6001     + 1 * ($self->{nc} == -1));
6002     $self->{state} = BOGUS_COMMENT_STATE;
6003     ## Reconsume.
6004     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6005     redo A;
6006     }
6007     } elsif ($self->{state} == MD_NOTATION_STATE) {
6008 wakaba 1.17 if ($self->{nc} == [
6009     undef,
6010     0x004F, # O
6011     0x0054, # T
6012     0x0041, # A
6013     0x0054, # T
6014     0x0049, # I
6015     0x004F, # O
6016     ]->[length $self->{kwd}] or
6017     $self->{nc} == [
6018     undef,
6019     0x006F, # o
6020     0x0074, # t
6021     0x0061, # a
6022     0x0074, # t
6023     0x0069, # i
6024     0x006F, # o
6025     ]->[length $self->{kwd}]) {
6026 wakaba 1.14 ## Stay in the state.
6027     $self->{kwd} .= chr $self->{nc};
6028    
6029     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6030     $self->{line_prev} = $self->{line};
6031     $self->{column_prev} = $self->{column};
6032     $self->{column}++;
6033     $self->{nc}
6034     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6035     } else {
6036     $self->{set_nc}->($self);
6037     }
6038    
6039     redo A;
6040 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6041     ($self->{nc} == 0x004E or # N
6042     $self->{nc} == 0x006E)) { # n
6043     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6044     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6045     text => 'NOTATION',
6046     line => $self->{line_prev},
6047     column => $self->{column_prev} - 6);
6048     }
6049 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6050     line => $self->{line_prev},
6051 wakaba 1.23 column => $self->{column_prev} - 8};
6052 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6053    
6054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6055     $self->{line_prev} = $self->{line};
6056     $self->{column_prev} = $self->{column};
6057     $self->{column}++;
6058     $self->{nc}
6059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6060     } else {
6061     $self->{set_nc}->($self);
6062     }
6063    
6064     redo A;
6065     } else {
6066     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6067     line => $self->{line_prev},
6068     column => $self->{column_prev} - 1
6069     - (length $self->{kwd})
6070     + 1 * ($self->{nc} == -1));
6071     $self->{state} = BOGUS_COMMENT_STATE;
6072     ## Reconsume.
6073     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6074     redo A;
6075     }
6076     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6077     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6078     ## "DOCTYPE NOTATION state".
6079    
6080     if ($is_space->{$self->{nc}}) {
6081     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6082     $self->{state} = BEFORE_MD_NAME_STATE;
6083    
6084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6085     $self->{line_prev} = $self->{line};
6086     $self->{column_prev} = $self->{column};
6087     $self->{column}++;
6088     $self->{nc}
6089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6090     } else {
6091     $self->{set_nc}->($self);
6092     }
6093    
6094     redo A;
6095     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6096     $self->{nc} == 0x0025) { # %
6097     ## XML5: Switch to the "DOCTYPE bogus comment state".
6098     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6099     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6100    
6101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102     $self->{line_prev} = $self->{line};
6103     $self->{column_prev} = $self->{column};
6104     $self->{column}++;
6105     $self->{nc}
6106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107     } else {
6108     $self->{set_nc}->($self);
6109     }
6110    
6111     redo A;
6112     } elsif ($self->{nc} == -1) {
6113     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6115     ## Reconsume.
6116     redo A;
6117     } elsif ($self->{nc} == 0x003E) { # >
6118     ## XML5: Switch to the "DOCTYPE bogus comment state".
6119     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6120     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6121    
6122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6123     $self->{line_prev} = $self->{line};
6124     $self->{column_prev} = $self->{column};
6125     $self->{column}++;
6126     $self->{nc}
6127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6128     } else {
6129     $self->{set_nc}->($self);
6130     }
6131    
6132     redo A;
6133     } else {
6134     ## XML5: Switch to the "DOCTYPE bogus comment state".
6135     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6136     $self->{state} = BEFORE_MD_NAME_STATE;
6137     redo A;
6138     }
6139     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6140     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6141     ## before state", "DOCTYPE ATTLIST name before state".
6142    
6143     if ($is_space->{$self->{nc}}) {
6144     ## Stay in the state.
6145    
6146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6147     $self->{line_prev} = $self->{line};
6148     $self->{column_prev} = $self->{column};
6149     $self->{column}++;
6150     $self->{nc}
6151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6152     } else {
6153     $self->{set_nc}->($self);
6154     }
6155    
6156     redo A;
6157     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6158     $self->{nc} == 0x0025) { # %
6159     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6160    
6161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6162     $self->{line_prev} = $self->{line};
6163     $self->{column_prev} = $self->{column};
6164     $self->{column}++;
6165     $self->{nc}
6166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6167     } else {
6168     $self->{set_nc}->($self);
6169     }
6170    
6171     redo A;
6172     } elsif ($self->{nc} == 0x003E) { # >
6173     ## XML5: Same as "Anything else".
6174     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6175     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6176    
6177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6178     $self->{line_prev} = $self->{line};
6179     $self->{column_prev} = $self->{column};
6180     $self->{column}++;
6181     $self->{nc}
6182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6183     } else {
6184     $self->{set_nc}->($self);
6185     }
6186    
6187     redo A;
6188     } elsif ($self->{nc} == -1) {
6189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6190     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6191     ## Reconsume.
6192     redo A;
6193     } else {
6194     ## XML5: [ATTLIST] Not defined yet.
6195     $self->{ct}->{name} .= chr $self->{nc};
6196     $self->{state} = MD_NAME_STATE;
6197    
6198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6199     $self->{line_prev} = $self->{line};
6200     $self->{column_prev} = $self->{column};
6201     $self->{column}++;
6202     $self->{nc}
6203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6204     } else {
6205     $self->{set_nc}->($self);
6206     }
6207    
6208     redo A;
6209     }
6210     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6211     if ($is_space->{$self->{nc}}) {
6212     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6213     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6214     $self->{state} = BEFORE_MD_NAME_STATE;
6215 wakaba 1.8
6216 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6217     $self->{line_prev} = $self->{line};
6218     $self->{column_prev} = $self->{column};
6219     $self->{column}++;
6220     $self->{nc}
6221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6222     } else {
6223     $self->{set_nc}->($self);
6224     }
6225    
6226     redo A;
6227     } elsif ($self->{nc} == 0x003E) { # >
6228     ## XML5: Same as "Anything else".
6229     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6230     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6231    
6232     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233     $self->{line_prev} = $self->{line};
6234     $self->{column_prev} = $self->{column};
6235     $self->{column}++;
6236     $self->{nc}
6237     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238     } else {
6239     $self->{set_nc}->($self);
6240     }
6241    
6242     redo A;
6243     } elsif ($self->{nc} == -1) {
6244     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6246     ## Reconsume.
6247     redo A;
6248     } else {
6249     ## XML5: No parse error.
6250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6251     $self->{state} = BOGUS_COMMENT_STATE;
6252     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6253     ## Reconsume.
6254     redo A;
6255     }
6256     } elsif ($self->{state} == MD_NAME_STATE) {
6257     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6258    
6259     if ($is_space->{$self->{nc}}) {
6260 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6261     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6262     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6263 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6264 wakaba 1.16 } else { # ENTITY/NOTATION
6265     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6266     }
6267 wakaba 1.14
6268     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6269     $self->{line_prev} = $self->{line};
6270     $self->{column_prev} = $self->{column};
6271     $self->{column}++;
6272     $self->{nc}
6273     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6274     } else {
6275     $self->{set_nc}->($self);
6276     }
6277    
6278     redo A;
6279     } elsif ($self->{nc} == 0x003E) { # >
6280     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6281     #
6282     } else {
6283 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6284 wakaba 1.14 }
6285     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6286    
6287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6288     $self->{line_prev} = $self->{line};
6289     $self->{column_prev} = $self->{column};
6290     $self->{column}++;
6291     $self->{nc}
6292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6293     } else {
6294     $self->{set_nc}->($self);
6295     }
6296    
6297     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6298     redo A;
6299     } elsif ($self->{nc} == -1) {
6300     ## XML5: [ATTLIST] No parse error.
6301     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6302     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6303     ## Reconsume.
6304     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6305     redo A;
6306     } else {
6307     ## XML5: [ATTLIST] Not defined yet.
6308     $self->{ct}->{name} .= chr $self->{nc};
6309     ## Stay in the state.
6310    
6311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6312     $self->{line_prev} = $self->{line};
6313     $self->{column_prev} = $self->{column};
6314     $self->{column}++;
6315     $self->{nc}
6316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6317     } else {
6318     $self->{set_nc}->($self);
6319     }
6320    
6321     redo A;
6322     }
6323     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6324     if ($is_space->{$self->{nc}}) {
6325     ## Stay in the state.
6326    
6327     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6328     $self->{line_prev} = $self->{line};
6329     $self->{column_prev} = $self->{column};
6330     $self->{column}++;
6331     $self->{nc}
6332     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6333     } else {
6334     $self->{set_nc}->($self);
6335     }
6336    
6337     redo A;
6338     } elsif ($self->{nc} == 0x003E) { # >
6339     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6340    
6341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6342     $self->{line_prev} = $self->{line};
6343     $self->{column_prev} = $self->{column};
6344     $self->{column}++;
6345     $self->{nc}
6346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6347     } else {
6348     $self->{set_nc}->($self);
6349     }
6350    
6351     return ($self->{ct}); # ATTLIST
6352     redo A;
6353     } elsif ($self->{nc} == -1) {
6354     ## XML5: No parse error.
6355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6356     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6357 wakaba 1.15 return ($self->{ct});
6358 wakaba 1.14 redo A;
6359     } else {
6360     ## XML5: Not defined yet.
6361 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6362     tokens => [],
6363     line => $self->{line}, column => $self->{column}};
6364     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6365    
6366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367     $self->{line_prev} = $self->{line};
6368     $self->{column_prev} = $self->{column};
6369     $self->{column}++;
6370     $self->{nc}
6371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372     } else {
6373     $self->{set_nc}->($self);
6374     }
6375    
6376     redo A;
6377     }
6378     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6379     if ($is_space->{$self->{nc}}) {
6380     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6381    
6382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6383     $self->{line_prev} = $self->{line};
6384     $self->{column_prev} = $self->{column};
6385     $self->{column}++;
6386     $self->{nc}
6387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6388     } else {
6389     $self->{set_nc}->($self);
6390     }
6391    
6392     redo A;
6393     } elsif ($self->{nc} == 0x003E) { # >
6394     ## XML5: Same as "anything else".
6395     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6396     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6397    
6398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399     $self->{line_prev} = $self->{line};
6400     $self->{column_prev} = $self->{column};
6401     $self->{column}++;
6402     $self->{nc}
6403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404     } else {
6405     $self->{set_nc}->($self);
6406     }
6407    
6408     return ($self->{ct}); # ATTLIST
6409     redo A;
6410     } elsif ($self->{nc} == 0x0028) { # (
6411     ## XML5: Same as "anything else".
6412     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6413     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6414    
6415     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6416     $self->{line_prev} = $self->{line};
6417     $self->{column_prev} = $self->{column};
6418     $self->{column}++;
6419     $self->{nc}
6420     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6421     } else {
6422     $self->{set_nc}->($self);
6423     }
6424    
6425     redo A;
6426     } elsif ($self->{nc} == -1) {
6427     ## XML5: No parse error.
6428     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6429     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6430    
6431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6432     $self->{line_prev} = $self->{line};
6433     $self->{column_prev} = $self->{column};
6434     $self->{column}++;
6435     $self->{nc}
6436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6437     } else {
6438     $self->{set_nc}->($self);
6439     }
6440    
6441     return ($self->{ct}); # ATTLIST
6442     redo A;
6443     } else {
6444     ## XML5: Not defined yet.
6445     $self->{ca}->{name} .= chr $self->{nc};
6446     ## Stay in the state.
6447    
6448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6449     $self->{line_prev} = $self->{line};
6450     $self->{column_prev} = $self->{column};
6451     $self->{column}++;
6452     $self->{nc}
6453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6454     } else {
6455     $self->{set_nc}->($self);
6456     }
6457    
6458 wakaba 1.14 redo A;
6459     }
6460 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6461     if ($is_space->{$self->{nc}}) {
6462     ## Stay in the state.
6463    
6464     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6465     $self->{line_prev} = $self->{line};
6466     $self->{column_prev} = $self->{column};
6467     $self->{column}++;
6468     $self->{nc}
6469     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6470     } else {
6471     $self->{set_nc}->($self);
6472     }
6473    
6474     redo A;
6475     } elsif ($self->{nc} == 0x003E) { # >
6476     ## XML5: Same as "anything else".
6477     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6479    
6480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6481     $self->{line_prev} = $self->{line};
6482     $self->{column_prev} = $self->{column};
6483     $self->{column}++;
6484     $self->{nc}
6485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6486     } else {
6487     $self->{set_nc}->($self);
6488     }
6489    
6490     return ($self->{ct}); # ATTLIST
6491     redo A;
6492     } elsif ($self->{nc} == 0x0028) { # (
6493     ## XML5: Same as "anything else".
6494     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6495    
6496     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6497     $self->{line_prev} = $self->{line};
6498     $self->{column_prev} = $self->{column};
6499     $self->{column}++;
6500     $self->{nc}
6501     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6502     } else {
6503     $self->{set_nc}->($self);
6504     }
6505    
6506     redo A;
6507     } elsif ($self->{nc} == -1) {
6508     ## XML5: No parse error.
6509     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6510     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6511    
6512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6513     $self->{line_prev} = $self->{line};
6514     $self->{column_prev} = $self->{column};
6515     $self->{column}++;
6516     $self->{nc}
6517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6518     } else {
6519     $self->{set_nc}->($self);
6520     }
6521    
6522     return ($self->{ct});
6523     redo A;
6524     } else {
6525     ## XML5: Not defined yet.
6526     $self->{ca}->{type} = chr $self->{nc};
6527     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6528    
6529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6530     $self->{line_prev} = $self->{line};
6531     $self->{column_prev} = $self->{column};
6532     $self->{column}++;
6533     $self->{nc}
6534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6535     } else {
6536     $self->{set_nc}->($self);
6537     }
6538    
6539     redo A;
6540     }
6541     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6542     if ($is_space->{$self->{nc}}) {
6543     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6544    
6545     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6546     $self->{line_prev} = $self->{line};
6547     $self->{column_prev} = $self->{column};
6548     $self->{column}++;
6549     $self->{nc}
6550     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6551     } else {
6552     $self->{set_nc}->($self);
6553     }
6554    
6555     redo A;
6556     } elsif ($self->{nc} == 0x0023) { # #
6557     ## XML5: Same as "anything else".
6558     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6559     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6560    
6561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6562     $self->{line_prev} = $self->{line};
6563     $self->{column_prev} = $self->{column};
6564     $self->{column}++;
6565     $self->{nc}
6566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6567     } else {
6568     $self->{set_nc}->($self);
6569     }
6570    
6571     redo A;
6572     } elsif ($self->{nc} == 0x0022) { # "
6573     ## XML5: Same as "anything else".
6574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6575     $self->{ca}->{value} = '';
6576     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6577    
6578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6579     $self->{line_prev} = $self->{line};
6580     $self->{column_prev} = $self->{column};
6581     $self->{column}++;
6582     $self->{nc}
6583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6584     } else {
6585     $self->{set_nc}->($self);
6586     }
6587    
6588     redo A;
6589     } elsif ($self->{nc} == 0x0027) { # '
6590     ## XML5: Same as "anything else".
6591     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6592     $self->{ca}->{value} = '';
6593     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6594    
6595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6596     $self->{line_prev} = $self->{line};
6597     $self->{column_prev} = $self->{column};
6598     $self->{column}++;
6599     $self->{nc}
6600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6601     } else {
6602     $self->{set_nc}->($self);
6603     }
6604    
6605     redo A;
6606     } elsif ($self->{nc} == 0x003E) { # >
6607     ## XML5: Same as "anything else".
6608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6609     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6610    
6611     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6612     $self->{line_prev} = $self->{line};
6613     $self->{column_prev} = $self->{column};
6614     $self->{column}++;
6615     $self->{nc}
6616     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6617     } else {
6618     $self->{set_nc}->($self);
6619     }
6620    
6621     return ($self->{ct}); # ATTLIST
6622     redo A;
6623     } elsif ($self->{nc} == 0x0028) { # (
6624     ## XML5: Same as "anything else".
6625     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6626     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6627    
6628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629     $self->{line_prev} = $self->{line};
6630     $self->{column_prev} = $self->{column};
6631     $self->{column}++;
6632     $self->{nc}
6633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634     } else {
6635     $self->{set_nc}->($self);
6636     }
6637    
6638     redo A;
6639     } elsif ($self->{nc} == -1) {
6640     ## XML5: No parse error.
6641     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6642     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6643    
6644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6645     $self->{line_prev} = $self->{line};
6646     $self->{column_prev} = $self->{column};
6647     $self->{column}++;
6648     $self->{nc}
6649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6650     } else {
6651     $self->{set_nc}->($self);
6652     }
6653    
6654     return ($self->{ct});
6655     redo A;
6656     } else {
6657     ## XML5: Not defined yet.
6658     $self->{ca}->{type} .= chr $self->{nc};
6659     ## Stay in the state.
6660    
6661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662     $self->{line_prev} = $self->{line};
6663     $self->{column_prev} = $self->{column};
6664     $self->{column}++;
6665     $self->{nc}
6666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667     } else {
6668     $self->{set_nc}->($self);
6669     }
6670    
6671     redo A;
6672     }
6673     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6674     if ($is_space->{$self->{nc}}) {
6675     ## Stay in the state.
6676    
6677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678     $self->{line_prev} = $self->{line};
6679     $self->{column_prev} = $self->{column};
6680     $self->{column}++;
6681     $self->{nc}
6682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683     } else {
6684     $self->{set_nc}->($self);
6685     }
6686    
6687     redo A;
6688     } elsif ($self->{nc} == 0x0028) { # (
6689     ## XML5: Same as "anything else".
6690     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6691    
6692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6693     $self->{line_prev} = $self->{line};
6694     $self->{column_prev} = $self->{column};
6695     $self->{column}++;
6696     $self->{nc}
6697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6698     } else {
6699     $self->{set_nc}->($self);
6700     }
6701    
6702     redo A;
6703     } elsif ($self->{nc} == 0x0023) { # #
6704     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6705    
6706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6707     $self->{line_prev} = $self->{line};
6708     $self->{column_prev} = $self->{column};
6709     $self->{column}++;
6710     $self->{nc}
6711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6712     } else {
6713     $self->{set_nc}->($self);
6714     }
6715    
6716     redo A;
6717     } elsif ($self->{nc} == 0x0022) { # "
6718     ## XML5: Same as "anything else".
6719     $self->{ca}->{value} = '';
6720     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6721    
6722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6723     $self->{line_prev} = $self->{line};
6724     $self->{column_prev} = $self->{column};
6725     $self->{column}++;
6726     $self->{nc}
6727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6728     } else {
6729     $self->{set_nc}->($self);
6730     }
6731    
6732     redo A;
6733     } elsif ($self->{nc} == 0x0027) { # '
6734     ## XML5: Same as "anything else".
6735     $self->{ca}->{value} = '';
6736     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6737    
6738     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6739     $self->{line_prev} = $self->{line};
6740     $self->{column_prev} = $self->{column};
6741     $self->{column}++;
6742     $self->{nc}
6743     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6744     } else {
6745     $self->{set_nc}->($self);
6746     }
6747    
6748     redo A;
6749     } elsif ($self->{nc} == 0x003E) { # >
6750     ## XML5: Same as "anything else".
6751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6753    
6754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6755     $self->{line_prev} = $self->{line};
6756     $self->{column_prev} = $self->{column};
6757     $self->{column}++;
6758     $self->{nc}
6759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6760     } else {
6761     $self->{set_nc}->($self);
6762     }
6763    
6764     return ($self->{ct}); # ATTLIST
6765     redo A;
6766     } elsif ($self->{nc} == -1) {
6767     ## XML5: No parse error.
6768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6769     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6770    
6771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6772     $self->{line_prev} = $self->{line};
6773     $self->{column_prev} = $self->{column};
6774     $self->{column}++;
6775     $self->{nc}
6776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6777     } else {
6778     $self->{set_nc}->($self);
6779     }
6780    
6781     return ($self->{ct});
6782     redo A;
6783     } else {
6784     ## XML5: Switch to the "DOCTYPE bogus comment state".
6785     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6786     $self->{ca}->{value} = '';
6787     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6788     ## Reconsume.
6789     redo A;
6790     }
6791     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6792     if ($is_space->{$self->{nc}}) {
6793     ## Stay in the state.
6794    
6795     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6796     $self->{line_prev} = $self->{line};
6797     $self->{column_prev} = $self->{column};
6798     $self->{column}++;
6799     $self->{nc}
6800     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6801     } else {
6802     $self->{set_nc}->($self);
6803     }
6804    
6805     redo A;
6806     } elsif ($self->{nc} == 0x007C) { # |
6807     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6808     ## Stay in the state.
6809    
6810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6811     $self->{line_prev} = $self->{line};
6812     $self->{column_prev} = $self->{column};
6813     $self->{column}++;
6814     $self->{nc}
6815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6816     } else {
6817     $self->{set_nc}->($self);
6818     }
6819    
6820     redo A;
6821     } elsif ($self->{nc} == 0x0029) { # )
6822     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6823     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6824    
6825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826     $self->{line_prev} = $self->{line};
6827     $self->{column_prev} = $self->{column};
6828     $self->{column}++;
6829     $self->{nc}
6830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831     } else {
6832     $self->{set_nc}->($self);
6833     }
6834    
6835     redo A;
6836     } elsif ($self->{nc} == 0x003E) { # >
6837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6838     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6839    
6840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6841     $self->{line_prev} = $self->{line};
6842     $self->{column_prev} = $self->{column};
6843     $self->{column}++;
6844     $self->{nc}
6845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6846     } else {
6847     $self->{set_nc}->($self);
6848     }
6849    
6850     return ($self->{ct}); # ATTLIST
6851     redo A;
6852     } elsif ($self->{nc} == -1) {
6853     ## XML5: No parse error.
6854     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6855     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6856    
6857     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6858     $self->{line_prev} = $self->{line};
6859     $self->{column_prev} = $self->{column};
6860     $self->{column}++;
6861     $self->{nc}
6862     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6863     } else {
6864     $self->{set_nc}->($self);
6865     }
6866    
6867     return ($self->{ct});
6868     redo A;
6869     } else {
6870     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6871     $self->{state} = ALLOWED_TOKEN_STATE;
6872    
6873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6874     $self->{line_prev} = $self->{line};
6875     $self->{column_prev} = $self->{column};
6876     $self->{column}++;
6877     $self->{nc}
6878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6879     } else {
6880     $self->{set_nc}->($self);
6881     }
6882    
6883     redo A;
6884     }
6885     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6886     if ($is_space->{$self->{nc}}) {
6887     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6888    
6889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6890     $self->{line_prev} = $self->{line};
6891     $self->{column_prev} = $self->{column};
6892     $self->{column}++;
6893     $self->{nc}
6894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6895     } else {
6896     $self->{set_nc}->($self);
6897     }
6898    
6899     redo A;
6900     } elsif ($self->{nc} == 0x007C) { # |
6901     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6902    
6903     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6904     $self->{line_prev} = $self->{line};
6905     $self->{column_prev} = $self->{column};
6906     $self->{column}++;
6907     $self->{nc}
6908     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6909     } else {
6910     $self->{set_nc}->($self);
6911     }
6912    
6913     redo A;
6914     } elsif ($self->{nc} == 0x0029) { # )
6915     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6916    
6917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6918     $self->{line_prev} = $self->{line};
6919     $self->{column_prev} = $self->{column};
6920     $self->{column}++;
6921     $self->{nc}
6922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6923     } else {
6924     $self->{set_nc}->($self);
6925     }
6926    
6927     redo A;
6928     } elsif ($self->{nc} == 0x003E) { # >
6929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6930     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6931    
6932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6933     $self->{line_prev} = $self->{line};
6934     $self->{column_prev} = $self->{column};
6935     $self->{column}++;
6936     $self->{nc}
6937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6938     } else {
6939     $self->{set_nc}->($self);
6940     }
6941    
6942     return ($self->{ct}); # ATTLIST
6943     redo A;
6944     } elsif ($self->{nc} == -1) {
6945     ## XML5: No parse error.
6946     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6947     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6948    
6949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6950     $self->{line_prev} = $self->{line};
6951     $self->{column_prev} = $self->{column};
6952     $self->{column}++;
6953     $self->{nc}
6954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6955     } else {
6956     $self->{set_nc}->($self);
6957     }
6958    
6959     return ($self->{ct});
6960     redo A;
6961     } else {
6962     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6963     ## Stay in the state.
6964    
6965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966     $self->{line_prev} = $self->{line};
6967     $self->{column_prev} = $self->{column};
6968     $self->{column}++;
6969     $self->{nc}
6970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971     } else {
6972     $self->{set_nc}->($self);
6973     }
6974    
6975     redo A;
6976     }
6977     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6978     if ($is_space->{$self->{nc}}) {
6979     ## Stay in the state.
6980    
6981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6982     $self->{line_prev} = $self->{line};
6983     $self->{column_prev} = $self->{column};
6984     $self->{column}++;
6985     $self->{nc}
6986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6987     } else {
6988     $self->{set_nc}->($self);
6989     }
6990    
6991     redo A;
6992     } elsif ($self->{nc} == 0x007C) { # |
6993     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6994    
6995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6996     $self->{line_prev} = $self->{line};
6997     $self->{column_prev} = $self->{column};
6998     $self->{column}++;
6999     $self->{nc}
7000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7001     } else {
7002     $self->{set_nc}->($self);
7003     }
7004    
7005     redo A;
7006     } elsif ($self->{nc} == 0x0029) { # )
7007     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7008    
7009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010     $self->{line_prev} = $self->{line};
7011     $self->{column_prev} = $self->{column};
7012     $self->{column}++;
7013     $self->{nc}
7014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015     } else {
7016     $self->{set_nc}->($self);
7017     }
7018    
7019     redo A;
7020     } elsif ($self->{nc} == 0x003E) { # >
7021     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7022     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7023    
7024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7025     $self->{line_prev} = $self->{line};
7026     $self->{column_prev} = $self->{column};
7027     $self->{column}++;
7028     $self->{nc}
7029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7030     } else {
7031     $self->{set_nc}->($self);
7032     }
7033    
7034     return ($self->{ct}); # ATTLIST
7035     redo A;
7036     } elsif ($self->{nc} == -1) {
7037     ## XML5: No parse error.
7038     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7039     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7040    
7041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7042     $self->{line_prev} = $self->{line};
7043     $self->{column_prev} = $self->{column};
7044     $self->{column}++;
7045     $self->{nc}
7046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7047     } else {
7048     $self->{set_nc}->($self);
7049     }
7050    
7051     return ($self->{ct});
7052     redo A;
7053     } else {
7054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7055     line => $self->{line_prev},
7056     column => $self->{column_prev});
7057     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7058     $self->{state} = ALLOWED_TOKEN_STATE;
7059    
7060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7061     $self->{line_prev} = $self->{line};
7062     $self->{column_prev} = $self->{column};
7063     $self->{column}++;
7064     $self->{nc}
7065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7066     } else {
7067     $self->{set_nc}->($self);
7068     }
7069    
7070     redo A;
7071     }
7072     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7073     if ($is_space->{$self->{nc}}) {
7074     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7075    
7076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077     $self->{line_prev} = $self->{line};
7078     $self->{column_prev} = $self->{column};
7079     $self->{column}++;
7080     $self->{nc}
7081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082     } else {
7083     $self->{set_nc}->($self);
7084     }
7085    
7086     redo A;
7087     } elsif ($self->{nc} == 0x0023) { # #
7088     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7089     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7090    
7091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7092     $self->{line_prev} = $self->{line};
7093     $self->{column_prev} = $self->{column};
7094     $self->{column}++;
7095     $self->{nc}
7096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7097     } else {
7098     $self->{set_nc}->($self);
7099     }
7100    
7101     redo A;
7102     } elsif ($self->{nc} == 0x0022) { # "
7103     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7104     $self->{ca}->{value} = '';
7105     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7106    
7107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7108     $self->{line_prev} = $self->{line};
7109     $self->{column_prev} = $self->{column};
7110     $self->{column}++;
7111     $self->{nc}
7112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7113     } else {
7114     $self->{set_nc}->($self);
7115     }
7116    
7117     redo A;
7118     } elsif ($self->{nc} == 0x0027) { # '
7119     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7120     $self->{ca}->{value} = '';
7121     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7122    
7123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7124     $self->{line_prev} = $self->{line};
7125     $self->{column_prev} = $self->{column};
7126     $self->{column}++;
7127     $self->{nc}
7128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7129     } else {
7130     $self->{set_nc}->($self);
7131     }
7132    
7133     redo A;
7134     } elsif ($self->{nc} == 0x003E) { # >
7135     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7137    
7138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7139     $self->{line_prev} = $self->{line};
7140     $self->{column_prev} = $self->{column};
7141     $self->{column}++;
7142     $self->{nc}
7143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7144     } else {
7145     $self->{set_nc}->($self);
7146     }
7147    
7148     return ($self->{ct}); # ATTLIST
7149     redo A;
7150     } elsif ($self->{nc} == -1) {
7151     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7152     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7153    
7154     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7155     $self->{line_prev} = $self->{line};
7156     $self->{column_prev} = $self->{column};
7157     $self->{column}++;
7158     $self->{nc}
7159     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7160     } else {
7161     $self->{set_nc}->($self);
7162     }
7163    
7164     return ($self->{ct});
7165     redo A;
7166     } else {
7167     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7168     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7169     ## Reconsume.
7170     redo A;
7171     }
7172     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7173     if ($is_space->{$self->{nc}}) {
7174     ## Stay in the state.
7175    
7176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177     $self->{line_prev} = $self->{line};
7178     $self->{column_prev} = $self->{column};
7179     $self->{column}++;
7180     $self->{nc}
7181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182     } else {
7183     $self->{set_nc}->($self);
7184     }
7185    
7186     redo A;
7187     } elsif ($self->{nc} == 0x0023) { # #
7188     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7189    
7190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7191     $self->{line_prev} = $self->{line};
7192     $self->{column_prev} = $self->{column};
7193     $self->{column}++;
7194     $self->{nc}
7195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7196     } else {
7197     $self->{set_nc}->($self);
7198     }
7199    
7200     redo A;
7201     } elsif ($self->{nc} == 0x0022) { # "
7202     $self->{ca}->{value} = '';
7203     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7204    
7205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7206     $self->{line_prev} = $self->{line};
7207     $self->{column_prev} = $self->{column};
7208     $self->{column}++;
7209     $self->{nc}
7210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7211     } else {
7212     $self->{set_nc}->($self);
7213     }
7214    
7215     redo A;
7216     } elsif ($self->{nc} == 0x0027) { # '
7217     $self->{ca}->{value} = '';
7218     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7219    
7220     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7221     $self->{line_prev} = $self->{line};
7222     $self->{column_prev} = $self->{column};
7223     $self->{column}++;
7224     $self->{nc}
7225     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7226     } else {
7227     $self->{set_nc}->($self);
7228     }
7229    
7230     redo A;
7231     } elsif ($self->{nc} == 0x003E) { # >
7232     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7233     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7234    
7235     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7236     $self->{line_prev} = $self->{line};
7237     $self->{column_prev} = $self->{column};
7238     $self->{column}++;
7239     $self->{nc}
7240     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7241     } else {
7242     $self->{set_nc}->($self);
7243     }
7244    
7245     return ($self->{ct}); # ATTLIST
7246     redo A;
7247     } elsif ($self->{nc} == -1) {
7248     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7249     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7250    
7251     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7252     $self->{line_prev} = $self->{line};
7253     $self->{column_prev} = $self->{column};
7254     $self->{column}++;
7255     $self->{nc}
7256     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7257     } else {
7258     $self->{set_nc}->($self);
7259     }
7260    
7261     return ($self->{ct});
7262     redo A;
7263     } else {
7264     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7265     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7266     ## Reconsume.
7267     redo A;
7268     }
7269     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7270     if ($is_space->{$self->{nc}}) {
7271     ## XML5: No parse error.
7272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7273 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7274 wakaba 1.15 ## Reconsume.
7275     redo A;
7276     } elsif ($self->{nc} == 0x0022) { # "
7277     ## XML5: Same as "anything else".
7278     $self->{ca}->{value} = '';
7279     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7280    
7281     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7282     $self->{line_prev} = $self->{line};
7283     $self->{column_prev} = $self->{column};
7284     $self->{column}++;
7285     $self->{nc}
7286     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7287     } else {
7288     $self->{set_nc}->($self);
7289     }
7290    
7291     redo A;
7292     } elsif ($self->{nc} == 0x0027) { # '
7293     ## XML5: Same as "anything else".
7294     $self->{ca}->{value} = '';
7295     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7296    
7297     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7298     $self->{line_prev} = $self->{line};
7299     $self->{column_prev} = $self->{column};
7300     $self->{column}++;
7301     $self->{nc}
7302     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7303     } else {
7304     $self->{set_nc}->($self);
7305     }
7306    
7307     redo A;
7308     } elsif ($self->{nc} == 0x003E) { # >
7309     ## XML5: Same as "anything else".
7310     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7311     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7312    
7313     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7314     $self->{line_prev} = $self->{line};
7315     $self->{column_prev} = $self->{column};
7316     $self->{column}++;
7317     $self->{nc}
7318     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7319     } else {
7320     $self->{set_nc}->($self);
7321     }
7322    
7323     return ($self->{ct}); # ATTLIST
7324     redo A;
7325     } elsif ($self->{nc} == -1) {
7326     ## XML5: No parse error.
7327     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7328     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7329    
7330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7331     $self->{line_prev} = $self->{line};
7332     $self->{column_prev} = $self->{column};
7333     $self->{column}++;
7334     $self->{nc}
7335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7336     } else {
7337     $self->{set_nc}->($self);
7338     }
7339    
7340     return ($self->{ct});
7341     redo A;
7342     } else {
7343     $self->{ca}->{default} = chr $self->{nc};
7344     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7345    
7346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7347     $self->{line_prev} = $self->{line};
7348     $self->{column_prev} = $self->{column};
7349     $self->{column}++;
7350     $self->{nc}
7351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7352     } else {
7353     $self->{set_nc}->($self);
7354     }
7355    
7356     redo A;
7357     }
7358     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7359     if ($is_space->{$self->{nc}}) {
7360     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7361    
7362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7363     $self->{line_prev} = $self->{line};
7364     $self->{column_prev} = $self->{column};
7365     $self->{column}++;
7366     $self->{nc}
7367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7368     } else {
7369     $self->{set_nc}->($self);
7370     }
7371    
7372     redo A;
7373     } elsif ($self->{nc} == 0x0022) { # "
7374     ## XML5: Same as "anything else".
7375     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7376     $self->{ca}->{value} = '';
7377     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7378    
7379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7380     $self->{line_prev} = $self->{line};
7381     $self->{column_prev} = $self->{column};
7382     $self->{column}++;
7383     $self->{nc}
7384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7385     } else {
7386     $self->{set_nc}->($self);
7387     }
7388    
7389     redo A;
7390     } elsif ($self->{nc} == 0x0027) { # '
7391     ## XML5: Same as "anything else".
7392     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7393     $self->{ca}->{value} = '';
7394     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7395    
7396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7397     $self->{line_prev} = $self->{line};
7398     $self->{column_prev} = $self->{column};
7399     $self->{column}++;
7400     $self->{nc}
7401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7402     } else {
7403     $self->{set_nc}->($self);
7404     }
7405    
7406     redo A;
7407     } elsif ($self->{nc} == 0x003E) { # >
7408     ## XML5: Same as "anything else".
7409     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7410     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7411    
7412     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7413     $self->{line_prev} = $self->{line};
7414     $self->{column_prev} = $self->{column};
7415     $self->{column}++;
7416     $self->{nc}
7417     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7418     } else {
7419     $self->{set_nc}->($self);
7420     }
7421    
7422     return ($self->{ct}); # ATTLIST
7423     redo A;
7424     } elsif ($self->{nc} == -1) {
7425     ## XML5: No parse error.
7426     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7427     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7428     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7429    
7430     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7431     $self->{line_prev} = $self->{line};
7432     $self->{column_prev} = $self->{column};
7433     $self->{column}++;
7434     $self->{nc}
7435     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7436     } else {
7437     $self->{set_nc}->($self);
7438     }
7439    
7440     return ($self->{ct});
7441     redo A;
7442     } else {
7443     $self->{ca}->{default} .= chr $self->{nc};
7444     ## Stay in the state.
7445    
7446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447     $self->{line_prev} = $self->{line};
7448     $self->{column_prev} = $self->{column};
7449     $self->{column}++;
7450     $self->{nc}
7451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452     } else {
7453     $self->{set_nc}->($self);
7454     }
7455    
7456     redo A;
7457     }
7458     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7459     if ($is_space->{$self->{nc}}) {
7460     ## Stay in the state.
7461    
7462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463     $self->{line_prev} = $self->{line};
7464     $self->{column_prev} = $self->{column};
7465     $self->{column}++;
7466     $self->{nc}
7467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468     } else {
7469     $self->{set_nc}->($self);
7470     }
7471    
7472     redo A;
7473     } elsif ($self->{nc} == 0x0022) { # "
7474     $self->{ca}->{value} = '';
7475     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7476    
7477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7478     $self->{line_prev} = $self->{line};
7479     $self->{column_prev} = $self->{column};
7480     $self->{column}++;
7481     $self->{nc}
7482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7483     } else {
7484     $self->{set_nc}->($self);
7485     }
7486    
7487     redo A;
7488     } elsif ($self->{nc} == 0x0027) { # '
7489     $self->{ca}->{value} = '';
7490     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7491    
7492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493     $self->{line_prev} = $self->{line};
7494     $self->{column_prev} = $self->{column};
7495     $self->{column}++;
7496     $self->{nc}
7497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498     } else {
7499     $self->{set_nc}->($self);
7500     }
7501    
7502     redo A;
7503     } elsif ($self->{nc} == 0x003E) { # >
7504     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7505     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7506    
7507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7508     $self->{line_prev} = $self->{line};
7509     $self->{column_prev} = $self->{column};
7510     $self->{column}++;
7511     $self->{nc}
7512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7513     } else {
7514     $self->{set_nc}->($self);
7515     }
7516    
7517     return ($self->{ct}); # ATTLIST
7518     redo A;
7519     } elsif ($self->{nc} == -1) {
7520     ## XML5: No parse error.
7521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7522     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7523     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7524    
7525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7526     $self->{line_prev} = $self->{line};
7527     $self->{column_prev} = $self->{column};
7528     $self->{column}++;
7529     $self->{nc}
7530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7531     } else {
7532     $self->{set_nc}->($self);
7533     }
7534    
7535     return ($self->{ct});
7536     redo A;
7537     } else {
7538     ## XML5: Not defined yet.
7539     if ($self->{ca}->{default} eq 'FIXED') {
7540     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7541     } else {
7542     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7543     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7544     }
7545     ## Reconsume.
7546     redo A;
7547     }
7548     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7549     if ($is_space->{$self->{nc}} or
7550     $self->{nc} == -1 or
7551     $self->{nc} == 0x003E) { # >
7552     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7553     ## Reconsume.
7554     redo A;
7555     } else {
7556     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7557     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7558     ## Reconsume.
7559     redo A;
7560 wakaba 1.16 }
7561 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7562     ## ASCII case-insensitive
7563     if ($self->{nc} == [
7564     undef,
7565     0x0044, # D
7566     0x0041, # A
7567     0x0054, # T
7568     ]->[length $self->{kwd}] or
7569     $self->{nc} == [
7570     undef,
7571     0x0064, # d
7572     0x0061, # a
7573     0x0074, # t
7574     ]->[length $self->{kwd}]) {
7575    
7576     ## Stay in the state.
7577     $self->{kwd} .= chr $self->{nc};
7578    
7579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7580     $self->{line_prev} = $self->{line};
7581     $self->{column_prev} = $self->{column};
7582     $self->{column}++;
7583     $self->{nc}
7584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7585     } else {
7586     $self->{set_nc}->($self);
7587     }
7588    
7589     redo A;
7590     } elsif ((length $self->{kwd}) == 4 and
7591     ($self->{nc} == 0x0041 or # A
7592     $self->{nc} == 0x0061)) { # a
7593     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7594    
7595     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7596     text => 'NDATA',
7597     line => $self->{line_prev},
7598     column => $self->{column_prev} - 4);
7599     } else {
7600    
7601     }
7602     $self->{state} = AFTER_NDATA_STATE;
7603    
7604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7605     $self->{line_prev} = $self->{line};
7606     $self->{column_prev} = $self->{column};
7607     $self->{column}++;
7608     $self->{nc}
7609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7610     } else {
7611     $self->{set_nc}->($self);
7612     }
7613    
7614     redo A;
7615     } else {
7616     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7617     line => $self->{line_prev},
7618     column => $self->{column_prev} + 1
7619     - length $self->{kwd});
7620    
7621     $self->{state} = BOGUS_MD_STATE;
7622     ## Reconsume.
7623     redo A;
7624     }
7625     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7626     if ($is_space->{$self->{nc}}) {
7627     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7628    
7629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7630     $self->{line_prev} = $self->{line};
7631     $self->{column_prev} = $self->{column};
7632     $self->{column}++;
7633     $self->{nc}
7634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7635     } else {
7636     $self->{set_nc}->($self);
7637     }
7638    
7639     redo A;
7640     } elsif ($self->{nc} == 0x003E) { # >
7641     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7642     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7643    
7644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7645     $self->{line_prev} = $self->{line};
7646     $self->{column_prev} = $self->{column};
7647     $self->{column}++;
7648     $self->{nc}
7649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7650     } else {
7651     $self->{set_nc}->($self);
7652     }
7653    
7654     return ($self->{ct}); # ENTITY
7655     redo A;
7656     } elsif ($self->{nc} == -1) {
7657     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7658     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7659    
7660     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7661     $self->{line_prev} = $self->{line};
7662     $self->{column_prev} = $self->{column};
7663     $self->{column}++;
7664     $self->{nc}
7665     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7666     } else {
7667     $self->{set_nc}->($self);
7668     }
7669    
7670     return ($self->{ct}); # ENTITY
7671     redo A;
7672     } else {
7673     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7674     line => $self->{line_prev},
7675     column => $self->{column_prev} + 1
7676     - length $self->{kwd});
7677     $self->{state} = BOGUS_MD_STATE;
7678     ## Reconsume.
7679     redo A;
7680     }
7681     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7682     if ($is_space->{$self->{nc}}) {
7683     ## Stay in the state.
7684    
7685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7686     $self->{line_prev} = $self->{line};
7687     $self->{column_prev} = $self->{column};
7688     $self->{column}++;
7689     $self->{nc}
7690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7691     } else {
7692     $self->{set_nc}->($self);
7693     }
7694    
7695     redo A;
7696     } elsif ($self->{nc} == 0x003E) { # >
7697     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7698     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7699    
7700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7701     $self->{line_prev} = $self->{line};
7702     $self->{column_prev} = $self->{column};
7703     $self->{column}++;
7704     $self->{nc}
7705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7706     } else {
7707     $self->{set_nc}->($self);
7708     }
7709    
7710     return ($self->{ct}); # ENTITY
7711     redo A;
7712     } elsif ($self->{nc} == -1) {
7713     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7714     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7715    
7716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7717     $self->{line_prev} = $self->{line};
7718     $self->{column_prev} = $self->{column};
7719     $self->{column}++;
7720     $self->{nc}
7721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7722     } else {
7723     $self->{set_nc}->($self);
7724     }
7725    
7726     return ($self->{ct}); # ENTITY
7727     redo A;
7728     } else {
7729     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7730     $self->{state} = NOTATION_NAME_STATE;
7731    
7732     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7733     $self->{line_prev} = $self->{line};
7734     $self->{column_prev} = $self->{column};
7735     $self->{column}++;
7736     $self->{nc}
7737     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7738     } else {
7739     $self->{set_nc}->($self);
7740     }
7741    
7742     redo A;
7743     }
7744     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7745     if ($is_space->{$self->{nc}}) {
7746 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7747 wakaba 1.18
7748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7749     $self->{line_prev} = $self->{line};
7750     $self->{column_prev} = $self->{column};
7751     $self->{column}++;
7752     $self->{nc}
7753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7754     } else {
7755     $self->{set_nc}->($self);
7756     }
7757    
7758     redo A;
7759     } elsif ($self->{nc} == 0x003E) { # >
7760     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7761    
7762     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7763     $self->{line_prev} = $self->{line};
7764     $self->{column_prev} = $self->{column};
7765     $self->{column}++;
7766     $self->{nc}
7767     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7768     } else {
7769     $self->{set_nc}->($self);
7770     }
7771    
7772     return ($self->{ct}); # ENTITY
7773     redo A;
7774     } elsif ($self->{nc} == -1) {
7775     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7776     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7777    
7778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7779     $self->{line_prev} = $self->{line};
7780     $self->{column_prev} = $self->{column};
7781     $self->{column}++;
7782     $self->{nc}
7783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7784     } else {
7785     $self->{set_nc}->($self);
7786     }
7787    
7788     return ($self->{ct}); # ENTITY
7789     redo A;
7790     } else {
7791     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7792     ## Stay in the state.
7793    
7794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7795     $self->{line_prev} = $self->{line};
7796     $self->{column_prev} = $self->{column};
7797     $self->{column}++;
7798     $self->{nc}
7799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7800     } else {
7801     $self->{set_nc}->($self);
7802     }
7803    
7804     redo A;
7805     }
7806 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7807     if ($self->{nc} == 0x0022) { # "
7808 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7809 wakaba 1.19
7810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7811     $self->{line_prev} = $self->{line};
7812     $self->{column_prev} = $self->{column};
7813     $self->{column}++;
7814     $self->{nc}
7815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7816     } else {
7817     $self->{set_nc}->($self);
7818     }
7819    
7820     redo A;
7821     } elsif ($self->{nc} == 0x0026) { # &
7822     $self->{prev_state} = $self->{state};
7823     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7824     $self->{entity_add} = 0x0022; # "
7825    
7826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827     $self->{line_prev} = $self->{line};
7828     $self->{column_prev} = $self->{column};
7829     $self->{column}++;
7830     $self->{nc}
7831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832     } else {
7833     $self->{set_nc}->($self);
7834     }
7835    
7836     redo A;
7837     ## TODO: %
7838     } elsif ($self->{nc} == -1) {
7839     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7840     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7841     ## Reconsume.
7842     return ($self->{ct}); # ENTITY
7843     redo A;
7844     } else {
7845     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7846    
7847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7848     $self->{line_prev} = $self->{line};
7849     $self->{column_prev} = $self->{column};
7850     $self->{column}++;
7851     $self->{nc}
7852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7853     } else {
7854     $self->{set_nc}->($self);
7855     }
7856    
7857     redo A;
7858     }
7859     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7860     if ($self->{nc} == 0x0027) { # '
7861 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7862 wakaba 1.19
7863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7864     $self->{line_prev} = $self->{line};
7865     $self->{column_prev} = $self->{column};
7866     $self->{column}++;
7867     $self->{nc}
7868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7869     } else {
7870     $self->{set_nc}->($self);
7871     }
7872    
7873     redo A;
7874     } elsif ($self->{nc} == 0x0026) { # &
7875     $self->{prev_state} = $self->{state};
7876     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7877     $self->{entity_add} = 0x0027; # '
7878    
7879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7880     $self->{line_prev} = $self->{line};
7881     $self->{column_prev} = $self->{column};
7882     $self->{column}++;
7883     $self->{nc}
7884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7885     } else {
7886     $self->{set_nc}->($self);
7887     }
7888    
7889     redo A;
7890     ## TODO: %
7891     } elsif ($self->{nc} == -1) {
7892     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7893     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7894     ## Reconsume.
7895     return ($self->{ct}); # ENTITY
7896     redo A;
7897     } else {
7898     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7899    
7900     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7901     $self->{line_prev} = $self->{line};
7902     $self->{column_prev} = $self->{column};
7903     $self->{column}++;
7904     $self->{nc}
7905     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7906     } else {
7907     $self->{set_nc}->($self);
7908     }
7909    
7910     redo A;
7911     }
7912     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7913     if ($is_space->{$self->{nc}} or
7914     {
7915     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7916     $self->{entity_add} => 1,
7917     }->{$self->{nc}}) {
7918 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7919     line => $self->{line_prev},
7920     column => $self->{column_prev}
7921     + ($self->{nc} == -1 ? 1 : 0));
7922 wakaba 1.19 ## Don't consume
7923     ## Return nothing.
7924     #
7925     } elsif ($self->{nc} == 0x0023) { # #
7926     $self->{ca} = $self->{ct};
7927     $self->{state} = ENTITY_HASH_STATE;
7928     $self->{kwd} = '#';
7929    
7930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7931     $self->{line_prev} = $self->{line};
7932     $self->{column_prev} = $self->{column};
7933     $self->{column}++;
7934     $self->{nc}
7935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7936     } else {
7937     $self->{set_nc}->($self);
7938     }
7939    
7940     redo A;
7941     } else {
7942     #
7943     }
7944    
7945     $self->{ct}->{value} .= '&';
7946     $self->{state} = $self->{prev_state};
7947     ## Reconsume.
7948     redo A;
7949 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7950     if ($is_space->{$self->{nc}}) {
7951     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7952    
7953     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7954     $self->{line_prev} = $self->{line};
7955     $self->{column_prev} = $self->{column};
7956     $self->{column}++;
7957     $self->{nc}
7958     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7959     } else {
7960     $self->{set_nc}->($self);
7961     }
7962    
7963     redo A;
7964     } elsif ($self->{nc} == 0x0028) { # (
7965     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7966     $self->{ct}->{content} = ['('];
7967     $self->{group_depth} = 1;
7968    
7969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7970     $self->{line_prev} = $self->{line};
7971     $self->{column_prev} = $self->{column};
7972     $self->{column}++;
7973     $self->{nc}
7974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7975     } else {
7976     $self->{set_nc}->($self);
7977     }
7978    
7979     redo A;
7980     } elsif ($self->{nc} == 0x003E) { # >
7981     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7982     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7983    
7984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7985     $self->{line_prev} = $self->{line};
7986     $self->{column_prev} = $self->{column};
7987     $self->{column}++;
7988     $self->{nc}
7989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7990     } else {
7991     $self->{set_nc}->($self);
7992     }
7993    
7994     return ($self->{ct}); # ELEMENT
7995     redo A;
7996     } elsif ($self->{nc} == -1) {
7997     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7999    
8000     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8001     $self->{line_prev} = $self->{line};
8002     $self->{column_prev} = $self->{column};
8003     $self->{column}++;
8004     $self->{nc}
8005     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8006     } else {
8007     $self->{set_nc}->($self);
8008     }
8009    
8010     return ($self->{ct}); # ELEMENT
8011     redo A;
8012     } else {
8013     $self->{ct}->{content} = [chr $self->{nc}];
8014     $self->{state} = CONTENT_KEYWORD_STATE;
8015    
8016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8017     $self->{line_prev} = $self->{line};
8018     $self->{column_prev} = $self->{column};
8019     $self->{column}++;
8020     $self->{nc}
8021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8022     } else {
8023     $self->{set_nc}->($self);
8024     }
8025    
8026     redo A;
8027     }
8028     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8029     if ($is_space->{$self->{nc}}) {
8030     $self->{state} = AFTER_MD_DEF_STATE;
8031    
8032     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8033     $self->{line_prev} = $self->{line};
8034     $self->{column_prev} = $self->{column};
8035     $self->{column}++;
8036     $self->{nc}
8037     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8038     } else {
8039     $self->{set_nc}->($self);
8040     }
8041    
8042     redo A;
8043     } elsif ($self->{nc} == 0x003E) { # >
8044     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8045    
8046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8047     $self->{line_prev} = $self->{line};
8048     $self->{column_prev} = $self->{column};
8049     $self->{column}++;
8050     $self->{nc}
8051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8052     } else {
8053     $self->{set_nc}->($self);
8054     }
8055    
8056     return ($self->{ct}); # ELEMENT
8057     redo A;
8058     } elsif ($self->{nc} == -1) {
8059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8061    
8062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8063     $self->{line_prev} = $self->{line};
8064     $self->{column_prev} = $self->{column};
8065     $self->{column}++;
8066     $self->{nc}
8067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8068     } else {
8069     $self->{set_nc}->($self);
8070     }
8071    
8072     return ($self->{ct}); # ELEMENT
8073     redo A;
8074     } else {
8075     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8076     ## Stay in the state.
8077    
8078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8079     $self->{line_prev} = $self->{line};
8080     $self->{column_prev} = $self->{column};
8081     $self->{column}++;
8082     $self->{nc}
8083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8084     } else {
8085     $self->{set_nc}->($self);
8086     }
8087    
8088     redo A;
8089     }
8090     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8091     if ($is_space->{$self->{nc}}) {
8092     ## Stay in the state.
8093    
8094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095     $self->{line_prev} = $self->{line};
8096     $self->{column_prev} = $self->{column};
8097     $self->{column}++;
8098     $self->{nc}
8099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100     } else {
8101     $self->{set_nc}->($self);
8102     }
8103    
8104     redo A;
8105     } elsif ($self->{nc} == 0x0028) { # (
8106     $self->{group_depth}++;
8107     push @{$self->{ct}->{content}}, chr $self->{nc};
8108     ## Stay in the state.
8109    
8110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8111     $self->{line_prev} = $self->{line};
8112     $self->{column_prev} = $self->{column};
8113     $self->{column}++;
8114     $self->{nc}
8115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8116     } else {
8117     $self->{set_nc}->($self);
8118     }
8119    
8120     redo A;
8121     } elsif ($self->{nc} == 0x007C or # |
8122     $self->{nc} == 0x002C) { # ,
8123     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8124     ## Stay in the state.
8125    
8126     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127     $self->{line_prev} = $self->{line};
8128     $self->{column_prev} = $self->{column};
8129     $self->{column}++;
8130     $self->{nc}
8131     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132     } else {
8133     $self->{set_nc}->($self);
8134     }
8135    
8136     redo A;
8137     } elsif ($self->{nc} == 0x0029) { # )
8138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8139     push @{$self->{ct}->{content}}, chr $self->{nc};
8140     $self->{group_depth}--;
8141     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8142    
8143     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8144     $self->{line_prev} = $self->{line};
8145     $self->{column_prev} = $self->{column};
8146     $self->{column}++;
8147     $self->{nc}
8148     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8149     } else {
8150     $self->{set_nc}->($self);
8151     }
8152    
8153     redo A;
8154     } elsif ($self->{nc} == 0x003E) { # >
8155     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8156     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8157     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8158    
8159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8160     $self->{line_prev} = $self->{line};
8161     $self->{column_prev} = $self->{column};
8162     $self->{column}++;
8163     $self->{nc}
8164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8165     } else {
8166     $self->{set_nc}->($self);
8167     }
8168    
8169     return ($self->{ct}); # ELEMENT
8170     redo A;
8171     } elsif ($self->{nc} == -1) {
8172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8173     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8175    
8176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8177     $self->{line_prev} = $self->{line};
8178     $self->{column_prev} = $self->{column};
8179     $self->{column}++;
8180     $self->{nc}
8181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8182     } else {
8183     $self->{set_nc}->($self);
8184     }
8185    
8186     return ($self->{ct}); # ELEMENT
8187     redo A;
8188     } else {
8189     push @{$self->{ct}->{content}}, chr $self->{nc};
8190     $self->{state} = CM_ELEMENT_NAME_STATE;
8191    
8192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8193     $self->{line_prev} = $self->{line};
8194     $self->{column_prev} = $self->{column};
8195     $self->{column}++;
8196     $self->{nc}
8197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8198     } else {
8199     $self->{set_nc}->($self);
8200     }
8201    
8202     redo A;
8203     }
8204     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8205     if ($is_space->{$self->{nc}}) {
8206     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8207    
8208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8209     $self->{line_prev} = $self->{line};
8210     $self->{column_prev} = $self->{column};
8211     $self->{column}++;
8212     $self->{nc}
8213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8214     } else {
8215     $self->{set_nc}->($self);
8216     }
8217    
8218     redo A;
8219     } elsif ($self->{nc} == 0x002A or # *
8220     $self->{nc} == 0x002B or # +
8221     $self->{nc} == 0x003F) { # ?
8222     push @{$self->{ct}->{content}}, chr $self->{nc};
8223     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8224    
8225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8226     $self->{line_prev} = $self->{line};
8227     $self->{column_prev} = $self->{column};
8228     $self->{column}++;
8229     $self->{nc}
8230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8231     } else {
8232     $self->{set_nc}->($self);
8233     }
8234    
8235     redo A;
8236     } elsif ($self->{nc} == 0x007C or # |
8237     $self->{nc} == 0x002C) { # ,
8238     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8239     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8240    
8241     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8242     $self->{line_prev} = $self->{line};
8243     $self->{column_prev} = $self->{column};
8244     $self->{column}++;
8245     $self->{nc}
8246     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8247     } else {
8248     $self->{set_nc}->($self);
8249     }
8250    
8251     redo A;
8252     } elsif ($self->{nc} == 0x0029) { # )
8253     $self->{group_depth}--;
8254     push @{$self->{ct}->{content}}, chr $self->{nc};
8255     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8256    
8257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8258     $self->{line_prev} = $self->{line};
8259     $self->{column_prev} = $self->{column};
8260     $self->{column}++;
8261     $self->{nc}
8262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8263     } else {
8264     $self->{set_nc}->($self);
8265     }
8266    
8267     redo A;
8268     } elsif ($self->{nc} == 0x003E) { # >
8269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8270     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8271     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272    
8273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274     $self->{line_prev} = $self->{line};
8275     $self->{column_prev} = $self->{column};
8276     $self->{column}++;
8277     $self->{nc}
8278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279     } else {
8280     $self->{set_nc}->($self);
8281     }
8282    
8283     return ($self->{ct}); # ELEMENT
8284     redo A;
8285     } elsif ($self->{nc} == -1) {
8286     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8287     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8288     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8289    
8290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8291     $self->{line_prev} = $self->{line};
8292     $self->{column_prev} = $self->{column};
8293     $self->{column}++;
8294     $self->{nc}
8295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8296     } else {
8297     $self->{set_nc}->($self);
8298     }
8299    
8300     return ($self->{ct}); # ELEMENT
8301     redo A;
8302     } else {
8303     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8304     ## Stay in the state.
8305    
8306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8307     $self->{line_prev} = $self->{line};
8308     $self->{column_prev} = $self->{column};
8309     $self->{column}++;
8310     $self->{nc}
8311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8312     } else {
8313     $self->{set_nc}->($self);
8314     }
8315    
8316     redo A;
8317     }
8318     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8319     if ($is_space->{$self->{nc}}) {
8320     ## Stay in the state.
8321    
8322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8323     $self->{line_prev} = $self->{line};
8324     $self->{column_prev} = $self->{column};
8325     $self->{column}++;
8326     $self->{nc}
8327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8328     } else {
8329     $self->{set_nc}->($self);
8330     }
8331    
8332     redo A;
8333     } elsif ($self->{nc} == 0x007C or # |
8334     $self->{nc} == 0x002C) { # ,
8335     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8336     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8337    
8338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8339     $self->{line_prev} = $self->{line};
8340     $self->{column_prev} = $self->{column};
8341     $self->{column}++;
8342     $self->{nc}
8343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8344     } else {
8345     $self->{set_nc}->($self);
8346     }
8347    
8348     redo A;
8349     } elsif ($self->{nc} == 0x0029) { # )
8350     $self->{group_depth}--;
8351     push @{$self->{ct}->{content}}, chr $self->{nc};
8352     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8353    
8354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8355     $self->{line_prev} = $self->{line};
8356     $self->{column_prev} = $self->{column};
8357     $self->{column}++;
8358     $self->{nc}
8359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8360     } else {
8361     $self->{set_nc}->($self);
8362     }
8363    
8364     redo A;
8365     } elsif ($self->{nc} == 0x003E) { # >
8366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8367     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369    
8370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371     $self->{line_prev} = $self->{line};
8372     $self->{column_prev} = $self->{column};
8373     $self->{column}++;
8374     $self->{nc}
8375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376     } else {
8377     $self->{set_nc}->($self);
8378     }
8379    
8380     return ($self->{ct}); # ELEMENT
8381     redo A;
8382     } elsif ($self->{nc} == -1) {
8383     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8384     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8386    
8387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388     $self->{line_prev} = $self->{line};
8389     $self->{column_prev} = $self->{column};
8390     $self->{column}++;
8391     $self->{nc}
8392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393     } else {
8394     $self->{set_nc}->($self);
8395     }
8396    
8397     return ($self->{ct}); # ELEMENT
8398     redo A;
8399     } else {
8400     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8401     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8402     $self->{state} = BOGUS_MD_STATE;
8403    
8404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8405     $self->{line_prev} = $self->{line};
8406     $self->{column_prev} = $self->{column};
8407     $self->{column}++;
8408     $self->{nc}
8409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8410     } else {
8411     $self->{set_nc}->($self);
8412     }
8413    
8414     redo A;
8415     }
8416     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8417     if ($is_space->{$self->{nc}}) {
8418     if ($self->{group_depth}) {
8419     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8420     } else {
8421     $self->{state} = AFTER_MD_DEF_STATE;
8422     }
8423    
8424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8425     $self->{line_prev} = $self->{line};
8426     $self->{column_prev} = $self->{column};
8427     $self->{column}++;
8428     $self->{nc}
8429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8430     } else {
8431     $self->{set_nc}->($self);
8432     }
8433    
8434     redo A;
8435     } elsif ($self->{nc} == 0x002A or # *
8436     $self->{nc} == 0x002B or # +
8437     $self->{nc} == 0x003F) { # ?
8438     push @{$self->{ct}->{content}}, chr $self->{nc};
8439     if ($self->{group_depth}) {
8440     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8441     } else {
8442     $self->{state} = AFTER_MD_DEF_STATE;
8443     }
8444    
8445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8446     $self->{line_prev} = $self->{line};
8447     $self->{column_prev} = $self->{column};
8448     $self->{column}++;
8449     $self->{nc}
8450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8451     } else {
8452     $self->{set_nc}->($self);
8453     }
8454    
8455     redo A;
8456     } elsif ($self->{nc} == 0x0029) { # )
8457     if ($self->{group_depth}) {
8458     $self->{group_depth}--;
8459     push @{$self->{ct}->{content}}, chr $self->{nc};
8460     ## Stay in the state.
8461    
8462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8463     $self->{line_prev} = $self->{line};
8464     $self->{column_prev} = $self->{column};
8465     $self->{column}++;
8466     $self->{nc}
8467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8468     } else {
8469     $self->{set_nc}->($self);
8470     }
8471    
8472     redo A;
8473     } else {
8474     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8475     $self->{state} = BOGUS_MD_STATE;
8476     ## Reconsume.
8477     redo A;
8478     }
8479     } elsif ($self->{nc} == 0x003E) { # >
8480     if ($self->{group_depth}) {
8481     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8482     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8483     }
8484     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485    
8486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487     $self->{line_prev} = $self->{line};
8488     $self->{column_prev} = $self->{column};
8489     $self->{column}++;
8490     $self->{nc}
8491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492     } else {
8493     $self->{set_nc}->($self);
8494     }
8495    
8496     return ($self->{ct}); # ELEMENT
8497     redo A;
8498     } elsif ($self->{nc} == -1) {
8499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8500     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8501     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8502    
8503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504     $self->{line_prev} = $self->{line};
8505     $self->{column_prev} = $self->{column};
8506     $self->{column}++;
8507     $self->{nc}
8508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509     } else {
8510     $self->{set_nc}->($self);
8511     }
8512    
8513     return ($self->{ct}); # ELEMENT
8514     redo A;
8515     } else {
8516     if ($self->{group_depth}) {
8517     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8518     } else {
8519     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8520     $self->{state} = BOGUS_MD_STATE;
8521     }
8522     ## Reconsume.
8523     redo A;
8524     }
8525     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8526 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8527     ## Stay in the state.
8528    
8529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8530     $self->{line_prev} = $self->{line};
8531     $self->{column_prev} = $self->{column};
8532     $self->{column}++;
8533     $self->{nc}
8534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8535     } else {
8536     $self->{set_nc}->($self);
8537     }
8538    
8539     redo A;
8540     } elsif ($self->{nc} == 0x003E) { # >
8541     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8542    
8543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8544     $self->{line_prev} = $self->{line};
8545     $self->{column_prev} = $self->{column};
8546     $self->{column}++;
8547     $self->{nc}
8548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8549     } else {
8550     $self->{set_nc}->($self);
8551     }
8552    
8553 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8554 wakaba 1.18 redo A;
8555     } elsif ($self->{nc} == -1) {
8556     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8557     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8558    
8559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8560     $self->{line_prev} = $self->{line};
8561     $self->{column_prev} = $self->{column};
8562     $self->{column}++;
8563     $self->{nc}
8564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8565     } else {
8566     $self->{set_nc}->($self);
8567     }
8568    
8569 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8570 wakaba 1.18 redo A;
8571     } else {
8572 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8573 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8574     ## Reconsume.
8575     redo A;
8576     }
8577 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8578     if ($self->{nc} == 0x003E) { # >
8579     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580    
8581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582     $self->{line_prev} = $self->{line};
8583     $self->{column_prev} = $self->{column};
8584     $self->{column}++;
8585     $self->{nc}
8586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587     } else {
8588     $self->{set_nc}->($self);
8589     }
8590    
8591     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8592     redo A;
8593     } elsif ($self->{nc} == -1) {
8594     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8595     ## Reconsume.
8596     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8597     redo A;
8598     } else {
8599     ## Stay in the state.
8600    
8601     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8602     $self->{line_prev} = $self->{line};
8603     $self->{column_prev} = $self->{column};
8604     $self->{column}++;
8605     $self->{nc}
8606     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8607     } else {
8608     $self->{set_nc}->($self);
8609     }
8610    
8611     redo A;
8612     }
8613 wakaba 1.1 } else {
8614     die "$0: $self->{state}: Unknown state";
8615     }
8616     } # A
8617    
8618     die "$0: _get_next_token: unexpected case";
8619     } # _get_next_token
8620    
8621     1;
8622 wakaba 1.24 ## $Date: 2008/10/19 13:43:55 $
8623 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24