/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.21 - (hide annotations) (download)
Sun Oct 19 09:25:21 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.20: +63 -16 lines
++ whatpm/t/ChangeLog	19 Oct 2008 09:24:46 -0000
	* XML-Parser.t: "xml/entrefs-1.dat" added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 09:25:15 -0000
	* charrefs-1.dat: New test data added.

	* entrefs-1.dat: New test data file.

	* attlists-1.dat: Test results updated.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 09:23:24 -0000
	* Tokenizer.pm.src: Make uppercase "&#X" in XML a parse error.
	Remove the limitation of entity name length.  Enable replacement
	of text-only general entities.  Raise a parse error for an
	unparsed entity reference.  Raise a parse error for a general
	entity reference to an undefined entity.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 09:24:32 -0000
	* Parser.pm.src: Define predefined general entities for the
	control of "undeclared entity" error raised by the tokenizer.  Set
	text-only flag to general entities appropriately.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.21 our $VERSION=do{my @r=(q$Revision: 1.20 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2 ## Export setup is done in a BEGIN block, so the constants can be imported by other packages at compile time.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8     ## Token type constants that can be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25     ## The ':token' tag imports every token type constant at once.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49     ## Numeric identifiers stored in a token's ->{type} field.  Each is a constant sub with an empty prototype.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74     ## The definitions that follow in this part of the file belong to the Whatpm::HTML package; the token type constants are imported into it at compile time.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78     ## Bit flags.  The *_CONTENT_MODEL constants are combinations of these flags and are the values stored in $self->{content_model}.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82     ## Content models.  PLAINTEXT has no flag set, i.e. neither & nor < markup is recognized.
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89     ## Values stored in $self->{state}.  The numbers 1 and 12 are not assigned; the subs that would use them are commented out.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states (numbers 44 to 49):
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states (numbering continues from the states above; values are also stored in $self->{state})
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200     ## NOTE: Both literals below have the same numeric value (2 ** 11).  NOTE(review): presumably they are bits of two different bit fields -- confirm against Whatpm::HTML.
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205     ## Code point => replacement code point.  0x0D maps to LF; 0x80..0x9F map to the characters Windows-1252 assigns to those bytes (U+FFFD where it assigns none).  The lookups are made outside this part of the file.
206     my $charref_map = {
207     0x0D => 0x000A,
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map; the statement below also maps control characters, surrogates and noncharacters to U+FFFD
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters extend to U+FDEF; this range stops at U+FDDF)
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
252     sub _initialize_tokenizer ($) {
253     my $self = shift;
254     ## Resets the tokenizer fields of $self to their initial values and then reads the first input character.
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1 ## Put the state machine into its initial configuration.
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model is PCDATA
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing}; # no self-closing flag is pending
272     $self->{char_buffer} = ''; # buffered input characters, consumed before |set_nc| is called
273     $self->{char_buffer_pos} = 0; # read position within |char_buffer|
274     $self->{nc} = -1; # next input character (-1 is handled as end of input by the tokenizer states)
275     #$self->{next_nc}
276     ## Read the first input character.  |char_buffer| was emptied just above, so the test below is false here and |set_nc| is always invoked.
277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
278     $self->{line_prev} = $self->{line};
279     $self->{column_prev} = $self->{column};
280     $self->{column}++;
281     $self->{nc}
282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
283     } else {
284     $self->{set_nc}->($self);
285     }
286    
287     $self->{token} = []; # queue of already-built tokens; |_get_next_token| returns these first
288     # $self->{escape}
289     } # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## A token's |->{self_closing}| is used to save the value of |$self->{self_closing}|
312     ## while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324     ## Code points the tokenizer treats as white space, keyed by code point (looked up as $is_space->{$self->{nc}}).  NOTE(review): the CR entry is disabled; presumably CR never reaches the tokenizer -- confirm.
325     my $is_space = {
326     0x0009 => 1, # CHARACTER TABULATION (HT)
327     0x000A => 1, # LINE FEED (LF)
328     #0x000B => 0, # LINE TABULATION (VT)
329 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
331     0x0020 => 1, # SPACE (SP)
332     };
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not explicitly
926     ## state that the next input character is to be reconsumed,
927     ## it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
1281 wakaba 1.1 my $before_leave = sub { ## Commit the pending attribute |$self->{ca}| to the current token |$self->{ct}|; invoked by each branch below that leaves this state.
1282     if (exists $self->{ct}->{attributes} # start tag or end tag; NOTE: this |exists| lookup autovivifies |attributes| as a hash reference
1283     ->{$self->{ca}->{name}}) { # MUST
1284     ## The name is already stored on this token: report it at the position recorded in |$self->{ca}|; the attribute is not added.
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
1287     } else {
1288     ## First use of this name: store the attribute under its name and number it with the token's pre-incremented |last_index| counter.
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743     if ($self->{nc} == 0x003D) { # =
1744    
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819     } elsif ($self->{nc} == -1) {
1820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1821     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822    
1823     $self->{last_stag_name} = $self->{ct}->{tag_name};
1824 wakaba 1.15
1825     $self->{state} = DATA_STATE;
1826     $self->{s_kwd} = '';
1827     ## reconsume
1828     return ($self->{ct}); # start tag
1829     redo A;
1830 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832     if ($self->{ct}->{attributes}) {
1833    
1834     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1835     } else {
1836     ## NOTE: This state should never be reached.
1837    
1838     }
1839 wakaba 1.15
1840     $self->{state} = DATA_STATE;
1841     $self->{s_kwd} = '';
1842     ## reconsume
1843     return ($self->{ct}); # end tag
1844     redo A;
1845     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846     ## XML5: No parse error above; not defined yet.
1847     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849     ## Reconsume.
1850     return ($self->{ct}); # ATTLIST
1851     redo A;
1852 wakaba 1.1 } else {
1853     die "$0: $self->{ct}->{type}: Unknown token type";
1854     }
1855     } else {
1856 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1857 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858    
1859     ## XML5: Not a parse error.
1860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861     } else {
1862    
1863     }
1864 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1865     $self->{read_until}->($self->{ca}->{value},
1866 wakaba 1.11 q["&<],
1867 wakaba 1.1 length $self->{ca}->{value});
1868    
1869     ## Stay in the state
1870    
1871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1872     $self->{line_prev} = $self->{line};
1873     $self->{column_prev} = $self->{column};
1874     $self->{column}++;
1875     $self->{nc}
1876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1877     } else {
1878     $self->{set_nc}->($self);
1879     }
1880    
1881     redo A;
1882     }
1883     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885     ## ATTLIST attribute value single quoted state".
1886 wakaba 1.11
1887 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1888 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889    
1890     ## XML5: "DOCTYPE ATTLIST name after state".
1891     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893     } else {
1894    
1895     ## XML5: "Before attribute name state" (sic).
1896     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897     }
1898 wakaba 1.1
1899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900     $self->{line_prev} = $self->{line};
1901     $self->{column_prev} = $self->{column};
1902     $self->{column}++;
1903     $self->{nc}
1904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1905     } else {
1906     $self->{set_nc}->($self);
1907     }
1908    
1909     redo A;
1910     } elsif ($self->{nc} == 0x0026) { # &
1911    
1912 wakaba 1.11 ## XML5: Not defined yet.
1913    
1914 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1915     ## "entity in attribute value state". In this implementation, the
1916     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1917     ## implementation of the "consume a character reference" algorithm.
1918     $self->{entity_add} = 0x0027; # '
1919     $self->{prev_state} = $self->{state};
1920     $self->{state} = ENTITY_STATE;
1921    
1922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923     $self->{line_prev} = $self->{line};
1924     $self->{column_prev} = $self->{column};
1925     $self->{column}++;
1926     $self->{nc}
1927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928     } else {
1929     $self->{set_nc}->($self);
1930     }
1931    
1932     redo A;
1933     } elsif ($self->{nc} == -1) {
1934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1935     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936    
1937     $self->{last_stag_name} = $self->{ct}->{tag_name};
1938 wakaba 1.15
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     ## reconsume
1942     return ($self->{ct}); # start tag
1943     redo A;
1944 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946     if ($self->{ct}->{attributes}) {
1947    
1948     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1949     } else {
1950     ## NOTE: This state should never be reached.
1951    
1952     }
1953 wakaba 1.15
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     ## reconsume
1957     return ($self->{ct}); # end tag
1958     redo A;
1959     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960     ## XML5: No parse error above; not defined yet.
1961     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963     ## Reconsume.
1964     return ($self->{ct}); # ATTLIST
1965     redo A;
1966 wakaba 1.1 } else {
1967     die "$0: $self->{ct}->{type}: Unknown token type";
1968     }
1969     } else {
1970 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1971 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972    
1973     ## XML5: Not a parse error.
1974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975     } else {
1976    
1977     }
1978 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1979     $self->{read_until}->($self->{ca}->{value},
1980 wakaba 1.11 q['&<],
1981 wakaba 1.1 length $self->{ca}->{value});
1982    
1983     ## Stay in the state
1984    
1985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986     $self->{line_prev} = $self->{line};
1987     $self->{column_prev} = $self->{column};
1988     $self->{column}++;
1989     $self->{nc}
1990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991     } else {
1992     $self->{set_nc}->($self);
1993     }
1994    
1995     redo A;
1996     }
1997     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1999    
2000 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2001 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002    
2003     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005     } else {
2006    
2007     ## XML5: "Tag attribute name before state".
2008     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009     }
2010 wakaba 1.1
2011     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012     $self->{line_prev} = $self->{line};
2013     $self->{column_prev} = $self->{column};
2014     $self->{column}++;
2015     $self->{nc}
2016     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017     } else {
2018     $self->{set_nc}->($self);
2019     }
2020    
2021     redo A;
2022     } elsif ($self->{nc} == 0x0026) { # &
2023    
2024 wakaba 1.11
2025     ## XML5: Not defined yet.
2026    
2027 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2028     ## "entity in attribute value state". In this implementation, the
2029     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2030     ## implementation of the "consume a character reference" algorithm.
2031     $self->{entity_add} = -1;
2032     $self->{prev_state} = $self->{state};
2033     $self->{state} = ENTITY_STATE;
2034    
2035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036     $self->{line_prev} = $self->{line};
2037     $self->{column_prev} = $self->{column};
2038     $self->{column}++;
2039     $self->{nc}
2040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041     } else {
2042     $self->{set_nc}->($self);
2043     }
2044    
2045     redo A;
2046     } elsif ($self->{nc} == 0x003E) { # >
2047     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048    
2049     $self->{last_stag_name} = $self->{ct}->{tag_name};
2050 wakaba 1.15
2051     $self->{state} = DATA_STATE;
2052     $self->{s_kwd} = '';
2053    
2054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055     $self->{line_prev} = $self->{line};
2056     $self->{column_prev} = $self->{column};
2057     $self->{column}++;
2058     $self->{nc}
2059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060     } else {
2061     $self->{set_nc}->($self);
2062     }
2063    
2064     return ($self->{ct}); # start tag
2065     redo A;
2066 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068     if ($self->{ct}->{attributes}) {
2069    
2070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2071     } else {
2072     ## NOTE: This state should never be reached.
2073    
2074     }
2075 wakaba 1.15
2076     $self->{state} = DATA_STATE;
2077     $self->{s_kwd} = '';
2078    
2079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080     $self->{line_prev} = $self->{line};
2081     $self->{column_prev} = $self->{column};
2082     $self->{column}++;
2083     $self->{nc}
2084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085     } else {
2086     $self->{set_nc}->($self);
2087     }
2088    
2089     return ($self->{ct}); # end tag
2090     redo A;
2091     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094    
2095 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096     $self->{line_prev} = $self->{line};
2097     $self->{column_prev} = $self->{column};
2098     $self->{column}++;
2099     $self->{nc}
2100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101     } else {
2102     $self->{set_nc}->($self);
2103     }
2104    
2105 wakaba 1.15 return ($self->{ct}); # ATTLIST
2106     redo A;
2107     } else {
2108     die "$0: $self->{ct}->{type}: Unknown token type";
2109     }
2110 wakaba 1.1 } elsif ($self->{nc} == -1) {
2111     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112    
2113 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2115 wakaba 1.15
2116     $self->{state} = DATA_STATE;
2117     $self->{s_kwd} = '';
2118     ## reconsume
2119     return ($self->{ct}); # start tag
2120     redo A;
2121 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124     if ($self->{ct}->{attributes}) {
2125    
2126     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2127     } else {
2128     ## NOTE: This state should never be reached.
2129    
2130     }
2131 wakaba 1.15
2132     $self->{state} = DATA_STATE;
2133     $self->{s_kwd} = '';
2134     ## reconsume
2135     return ($self->{ct}); # end tag
2136     redo A;
2137     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141     ## Reconsume.
2142     return ($self->{ct}); # ATTLIST
2143     redo A;
2144 wakaba 1.1 } else {
2145     die "$0: $self->{ct}->{type}: Unknown token type";
2146     }
2147     } else {
2148     if ({
2149     0x0022 => 1, # "
2150     0x0027 => 1, # '
2151     0x003D => 1, # =
2152     }->{$self->{nc}}) {
2153    
2154 wakaba 1.11 ## XML5: Not a parse error.
2155 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156     } else {
2157    
2158     }
2159     $self->{ca}->{value} .= chr ($self->{nc});
2160     $self->{read_until}->($self->{ca}->{value},
2161     q["'=& >],
2162     length $self->{ca}->{value});
2163    
2164     ## Stay in the state
2165    
2166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167     $self->{line_prev} = $self->{line};
2168     $self->{column_prev} = $self->{column};
2169     $self->{column}++;
2170     $self->{nc}
2171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172     } else {
2173     $self->{set_nc}->($self);
2174     }
2175    
2176     redo A;
2177     }
2178     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2179     if ($is_space->{$self->{nc}}) {
2180    
2181     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182    
2183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2184     $self->{line_prev} = $self->{line};
2185     $self->{column_prev} = $self->{column};
2186     $self->{column}++;
2187     $self->{nc}
2188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2189     } else {
2190     $self->{set_nc}->($self);
2191     }
2192    
2193     redo A;
2194     } elsif ($self->{nc} == 0x003E) { # >
2195     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2196    
2197     $self->{last_stag_name} = $self->{ct}->{tag_name};
2198     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2199     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2200     if ($self->{ct}->{attributes}) {
2201    
2202     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2203     } else {
2204     ## NOTE: This state should never be reached.
2205    
2206     }
2207     } else {
2208     die "$0: $self->{ct}->{type}: Unknown token type";
2209     }
2210     $self->{state} = DATA_STATE;
2211 wakaba 1.5 $self->{s_kwd} = '';
2212 wakaba 1.1
2213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214     $self->{line_prev} = $self->{line};
2215     $self->{column_prev} = $self->{column};
2216     $self->{column}++;
2217     $self->{nc}
2218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2219     } else {
2220     $self->{set_nc}->($self);
2221     }
2222    
2223    
2224     return ($self->{ct}); # start tag or end tag
2225    
2226     redo A;
2227     } elsif ($self->{nc} == 0x002F) { # /
2228    
2229     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2230    
2231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2232     $self->{line_prev} = $self->{line};
2233     $self->{column_prev} = $self->{column};
2234     $self->{column}++;
2235     $self->{nc}
2236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2237     } else {
2238     $self->{set_nc}->($self);
2239     }
2240    
2241     redo A;
2242     } elsif ($self->{nc} == -1) {
2243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2244     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2245    
2246     $self->{last_stag_name} = $self->{ct}->{tag_name};
2247     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2248     if ($self->{ct}->{attributes}) {
2249    
2250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2251     } else {
2252     ## NOTE: This state should never be reached.
2253    
2254     }
2255     } else {
2256     die "$0: $self->{ct}->{type}: Unknown token type";
2257     }
2258     $self->{state} = DATA_STATE;
2259 wakaba 1.5 $self->{s_kwd} = '';
2260 wakaba 1.1 ## Reconsume.
2261     return ($self->{ct}); # start tag or end tag
2262     redo A;
2263     } else {
2264    
2265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2266     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2267     ## reconsume
2268     redo A;
2269     }
2270     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271 wakaba 1.11 ## XML5: "Empty tag state".
2272    
2273 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2274     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275    
2276     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2277     ## TODO: Different type than slash in start tag
2278     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2279     if ($self->{ct}->{attributes}) {
2280    
2281     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2282     } else {
2283    
2284     }
2285     ## TODO: Test |<title></title/>|
2286     } else {
2287    
2288     $self->{self_closing} = 1;
2289     }
2290    
2291     $self->{state} = DATA_STATE;
2292 wakaba 1.5 $self->{s_kwd} = '';
2293 wakaba 1.1
2294     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295     $self->{line_prev} = $self->{line};
2296     $self->{column_prev} = $self->{column};
2297     $self->{column}++;
2298     $self->{nc}
2299     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300     } else {
2301     $self->{set_nc}->($self);
2302     }
2303    
2304    
2305     return ($self->{ct}); # start tag or end tag
2306    
2307     redo A;
2308     } elsif ($self->{nc} == -1) {
2309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2310     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2311    
2312     $self->{last_stag_name} = $self->{ct}->{tag_name};
2313     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318     ## NOTE: This state should never be reached.
2319    
2320     }
2321     } else {
2322     die "$0: $self->{ct}->{type}: Unknown token type";
2323     }
2324 wakaba 1.11 ## XML5: "Tag attribute name before state".
2325 wakaba 1.1 $self->{state} = DATA_STATE;
2326 wakaba 1.5 $self->{s_kwd} = '';
2327 wakaba 1.1 ## Reconsume.
2328     return ($self->{ct}); # start tag or end tag
2329     redo A;
2330     } else {
2331    
2332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2333     ## TODO: This error type is wrong.
2334     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2335     ## Reconsume.
2336     redo A;
2337     }
2338     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340    
2341 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2342     ## consumes characters on a one-by-one basis.
2343    
2344     if ($self->{nc} == 0x003E) { # >
2345 wakaba 1.13 if ($self->{in_subset}) {
2346    
2347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348     } else {
2349    
2350     $self->{state} = DATA_STATE;
2351     $self->{s_kwd} = '';
2352     }
2353 wakaba 1.1
2354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355     $self->{line_prev} = $self->{line};
2356     $self->{column_prev} = $self->{column};
2357     $self->{column}++;
2358     $self->{nc}
2359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2360     } else {
2361     $self->{set_nc}->($self);
2362     }
2363    
2364    
2365     return ($self->{ct}); # comment
2366     redo A;
2367     } elsif ($self->{nc} == -1) {
2368 wakaba 1.13 if ($self->{in_subset}) {
2369    
2370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371     } else {
2372    
2373     $self->{state} = DATA_STATE;
2374     $self->{s_kwd} = '';
2375     }
2376 wakaba 1.1 ## reconsume
2377    
2378     return ($self->{ct}); # comment
2379     redo A;
2380     } else {
2381    
2382     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2383     $self->{read_until}->($self->{ct}->{data},
2384     q[>],
2385     length $self->{ct}->{data});
2386    
2387     ## Stay in the state.
2388    
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399     redo A;
2400     }
2401     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402 wakaba 1.14 ## XML5: "Markup declaration state".
2403 wakaba 1.1
2404     if ($self->{nc} == 0x002D) { # -
2405    
2406     $self->{state} = MD_HYPHEN_STATE;
2407    
2408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409     $self->{line_prev} = $self->{line};
2410     $self->{column_prev} = $self->{column};
2411     $self->{column}++;
2412     $self->{nc}
2413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414     } else {
2415     $self->{set_nc}->($self);
2416     }
2417    
2418     redo A;
2419     } elsif ($self->{nc} == 0x0044 or # D
2420     $self->{nc} == 0x0064) { # d
2421     ## ASCII case-insensitive.
2422    
2423     $self->{state} = MD_DOCTYPE_STATE;
2424 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2425 wakaba 1.1
2426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427     $self->{line_prev} = $self->{line};
2428     $self->{column_prev} = $self->{column};
2429     $self->{column}++;
2430     $self->{nc}
2431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432     } else {
2433     $self->{set_nc}->($self);
2434     }
2435    
2436     redo A;
2437 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2438     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2439     $self->{is_xml}) and
2440 wakaba 1.1 $self->{nc} == 0x005B) { # [
2441    
2442     $self->{state} = MD_CDATA_STATE;
2443 wakaba 1.12 $self->{kwd} = '[';
2444 wakaba 1.1
2445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446     $self->{line_prev} = $self->{line};
2447     $self->{column_prev} = $self->{column};
2448     $self->{column}++;
2449     $self->{nc}
2450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2451     } else {
2452     $self->{set_nc}->($self);
2453     }
2454    
2455     redo A;
2456     } else {
2457    
2458     }
2459    
2460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461     line => $self->{line_prev},
2462     column => $self->{column_prev} - 1);
2463     ## Reconsume.
2464     $self->{state} = BOGUS_COMMENT_STATE;
2465     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2466     line => $self->{line_prev},
2467     column => $self->{column_prev} - 1,
2468     };
2469     redo A;
2470     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2471     if ($self->{nc} == 0x002D) { # -
2472    
2473     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2474     line => $self->{line_prev},
2475     column => $self->{column_prev} - 2,
2476     };
2477 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478 wakaba 1.1
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     } else {
2491    
2492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2493     line => $self->{line_prev},
2494     column => $self->{column_prev} - 2);
2495     $self->{state} = BOGUS_COMMENT_STATE;
2496     ## Reconsume.
2497     $self->{ct} = {type => COMMENT_TOKEN,
2498     data => '-',
2499     line => $self->{line_prev},
2500     column => $self->{column_prev} - 2,
2501     };
2502     redo A;
2503     }
2504     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2505     ## ASCII case-insensitive.
2506     if ($self->{nc} == [
2507     undef,
2508     0x004F, # O
2509     0x0043, # C
2510     0x0054, # T
2511     0x0059, # Y
2512     0x0050, # P
2513 wakaba 1.12 ]->[length $self->{kwd}] or
2514 wakaba 1.1 $self->{nc} == [
2515     undef,
2516     0x006F, # o
2517     0x0063, # c
2518     0x0074, # t
2519     0x0079, # y
2520     0x0070, # p
2521 wakaba 1.12 ]->[length $self->{kwd}]) {
2522 wakaba 1.1
2523     ## Stay in the state.
2524 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2525 wakaba 1.1
2526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527     $self->{line_prev} = $self->{line};
2528     $self->{column_prev} = $self->{column};
2529     $self->{column}++;
2530     $self->{nc}
2531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532     } else {
2533     $self->{set_nc}->($self);
2534     }
2535    
2536     redo A;
2537 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2538 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2539     $self->{nc} == 0x0065)) { # e
2540 wakaba 1.12 if ($self->{is_xml} and
2541     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542 wakaba 1.10
2543     ## XML5: case-sensitive.
2544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545     text => 'DOCTYPE',
2546     line => $self->{line_prev},
2547     column => $self->{column_prev} - 5);
2548     } else {
2549    
2550     }
2551 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2552     $self->{ct} = {type => DOCTYPE_TOKEN,
2553     quirks => 1,
2554     line => $self->{line_prev},
2555     column => $self->{column_prev} - 7,
2556     };
2557    
2558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559     $self->{line_prev} = $self->{line};
2560     $self->{column_prev} = $self->{column};
2561     $self->{column}++;
2562     $self->{nc}
2563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564     } else {
2565     $self->{set_nc}->($self);
2566     }
2567    
2568     redo A;
2569     } else {
2570    
2571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572     line => $self->{line_prev},
2573 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2574 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2575     ## Reconsume.
2576     $self->{ct} = {type => COMMENT_TOKEN,
2577 wakaba 1.12 data => $self->{kwd},
2578 wakaba 1.1 line => $self->{line_prev},
2579 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2580 wakaba 1.1 };
2581     redo A;
2582     }
2583     } elsif ($self->{state} == MD_CDATA_STATE) {
2584     if ($self->{nc} == {
2585     '[' => 0x0043, # C
2586     '[C' => 0x0044, # D
2587     '[CD' => 0x0041, # A
2588     '[CDA' => 0x0054, # T
2589     '[CDAT' => 0x0041, # A
2590 wakaba 1.12 }->{$self->{kwd}}) {
2591 wakaba 1.1
2592     ## Stay in the state.
2593 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2594 wakaba 1.1
2595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596     $self->{line_prev} = $self->{line};
2597     $self->{column_prev} = $self->{column};
2598     $self->{column}++;
2599     $self->{nc}
2600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601     } else {
2602     $self->{set_nc}->($self);
2603     }
2604    
2605     redo A;
2606 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2607 wakaba 1.1 $self->{nc} == 0x005B) { # [
2608 wakaba 1.6 if ($self->{is_xml} and
2609     not $self->{tainted} and
2610     @{$self->{open_elements} or []} == 0) {
2611 wakaba 1.8
2612 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613     line => $self->{line_prev},
2614     column => $self->{column_prev} - 7);
2615     $self->{tainted} = 1;
2616 wakaba 1.8 } else {
2617    
2618 wakaba 1.6 }
2619    
2620 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2621     data => '',
2622     line => $self->{line_prev},
2623     column => $self->{column_prev} - 7};
2624     $self->{state} = CDATA_SECTION_STATE;
2625    
2626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2627     $self->{line_prev} = $self->{line};
2628     $self->{column_prev} = $self->{column};
2629     $self->{column}++;
2630     $self->{nc}
2631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2632     } else {
2633     $self->{set_nc}->($self);
2634     }
2635    
2636     redo A;
2637     } else {
2638    
2639     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640     line => $self->{line_prev},
2641 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2642 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2643     ## Reconsume.
2644     $self->{ct} = {type => COMMENT_TOKEN,
2645 wakaba 1.12 data => $self->{kwd},
2646 wakaba 1.1 line => $self->{line_prev},
2647 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2648 wakaba 1.1 };
2649     redo A;
2650     }
2651     } elsif ($self->{state} == COMMENT_START_STATE) {
2652     if ($self->{nc} == 0x002D) { # -
2653    
2654     $self->{state} = COMMENT_START_DASH_STATE;
2655    
2656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657     $self->{line_prev} = $self->{line};
2658     $self->{column_prev} = $self->{column};
2659     $self->{column}++;
2660     $self->{nc}
2661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662     } else {
2663     $self->{set_nc}->($self);
2664     }
2665    
2666     redo A;
2667     } elsif ($self->{nc} == 0x003E) { # >
2668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669 wakaba 1.13 if ($self->{in_subset}) {
2670    
2671     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672     } else {
2673    
2674     $self->{state} = DATA_STATE;
2675     $self->{s_kwd} = '';
2676     }
2677 wakaba 1.1
2678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679     $self->{line_prev} = $self->{line};
2680     $self->{column_prev} = $self->{column};
2681     $self->{column}++;
2682     $self->{nc}
2683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684     } else {
2685     $self->{set_nc}->($self);
2686     }
2687    
2688    
2689     return ($self->{ct}); # comment
2690    
2691     redo A;
2692     } elsif ($self->{nc} == -1) {
2693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694 wakaba 1.13 if ($self->{in_subset}) {
2695    
2696     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697     } else {
2698    
2699     $self->{state} = DATA_STATE;
2700     $self->{s_kwd} = '';
2701     }
2702 wakaba 1.1 ## reconsume
2703    
2704     return ($self->{ct}); # comment
2705    
2706     redo A;
2707     } else {
2708    
2709     $self->{ct}->{data} # comment
2710     .= chr ($self->{nc});
2711     $self->{state} = COMMENT_STATE;
2712    
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723     redo A;
2724     }
2725     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726     if ($self->{nc} == 0x002D) { # -
2727    
2728     $self->{state} = COMMENT_END_STATE;
2729    
2730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731     $self->{line_prev} = $self->{line};
2732     $self->{column_prev} = $self->{column};
2733     $self->{column}++;
2734     $self->{nc}
2735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736     } else {
2737     $self->{set_nc}->($self);
2738     }
2739    
2740     redo A;
2741     } elsif ($self->{nc} == 0x003E) { # >
2742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 wakaba 1.13 if ($self->{in_subset}) {
2744    
2745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746     } else {
2747    
2748     $self->{state} = DATA_STATE;
2749     $self->{s_kwd} = '';
2750     }
2751 wakaba 1.1
2752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753     $self->{line_prev} = $self->{line};
2754     $self->{column_prev} = $self->{column};
2755     $self->{column}++;
2756     $self->{nc}
2757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758     } else {
2759     $self->{set_nc}->($self);
2760     }
2761    
2762    
2763     return ($self->{ct}); # comment
2764    
2765     redo A;
2766     } elsif ($self->{nc} == -1) {
2767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 wakaba 1.13 if ($self->{in_subset}) {
2769    
2770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771     } else {
2772    
2773     $self->{state} = DATA_STATE;
2774     $self->{s_kwd} = '';
2775     }
2776 wakaba 1.1 ## reconsume
2777    
2778     return ($self->{ct}); # comment
2779    
2780     redo A;
2781     } else {
2782    
2783     $self->{ct}->{data} # comment
2784     .= '-' . chr ($self->{nc});
2785     $self->{state} = COMMENT_STATE;
2786    
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797     redo A;
2798     }
2799     } elsif ($self->{state} == COMMENT_STATE) {
2800 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2801    
2802 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2803    
2804     $self->{state} = COMMENT_END_DASH_STATE;
2805    
2806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807     $self->{line_prev} = $self->{line};
2808     $self->{column_prev} = $self->{column};
2809     $self->{column}++;
2810     $self->{nc}
2811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812     } else {
2813     $self->{set_nc}->($self);
2814     }
2815    
2816     redo A;
2817     } elsif ($self->{nc} == -1) {
2818     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 wakaba 1.13 if ($self->{in_subset}) {
2820    
2821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822     } else {
2823    
2824     $self->{state} = DATA_STATE;
2825     $self->{s_kwd} = '';
2826     }
2827 wakaba 1.1 ## reconsume
2828    
2829     return ($self->{ct}); # comment
2830    
2831     redo A;
2832     } else {
2833    
2834     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835     $self->{read_until}->($self->{ct}->{data},
2836     q[-],
2837     length $self->{ct}->{data});
2838    
2839     ## Stay in the state
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     }
2853     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855 wakaba 1.10
2856 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2857    
2858     $self->{state} = COMMENT_END_STATE;
2859    
2860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861     $self->{line_prev} = $self->{line};
2862     $self->{column_prev} = $self->{column};
2863     $self->{column}++;
2864     $self->{nc}
2865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866     } else {
2867     $self->{set_nc}->($self);
2868     }
2869    
2870     redo A;
2871     } elsif ($self->{nc} == -1) {
2872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 wakaba 1.13 if ($self->{in_subset}) {
2874    
2875     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876     } else {
2877    
2878     $self->{state} = DATA_STATE;
2879     $self->{s_kwd} = '';
2880     }
2881 wakaba 1.1 ## reconsume
2882    
2883     return ($self->{ct}); # comment
2884    
2885     redo A;
2886     } else {
2887    
2888     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889     $self->{state} = COMMENT_STATE;
2890    
2891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892     $self->{line_prev} = $self->{line};
2893     $self->{column_prev} = $self->{column};
2894     $self->{column}++;
2895     $self->{nc}
2896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897     } else {
2898     $self->{set_nc}->($self);
2899     }
2900    
2901     redo A;
2902     }
2903     } elsif ($self->{state} == COMMENT_END_STATE) {
2904 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905    
2906 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2907 wakaba 1.13 if ($self->{in_subset}) {
2908    
2909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910     } else {
2911    
2912     $self->{state} = DATA_STATE;
2913     $self->{s_kwd} = '';
2914     }
2915 wakaba 1.1
2916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917     $self->{line_prev} = $self->{line};
2918     $self->{column_prev} = $self->{column};
2919     $self->{column}++;
2920     $self->{nc}
2921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922     } else {
2923     $self->{set_nc}->($self);
2924     }
2925    
2926    
2927     return ($self->{ct}); # comment
2928    
2929     redo A;
2930     } elsif ($self->{nc} == 0x002D) { # -
2931    
2932 wakaba 1.10 ## XML5: Not a parse error.
2933 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934     line => $self->{line_prev},
2935     column => $self->{column_prev});
2936     $self->{ct}->{data} .= '-'; # comment
2937     ## Stay in the state
2938    
2939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940     $self->{line_prev} = $self->{line};
2941     $self->{column_prev} = $self->{column};
2942     $self->{column}++;
2943     $self->{nc}
2944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945     } else {
2946     $self->{set_nc}->($self);
2947     }
2948    
2949     redo A;
2950     } elsif ($self->{nc} == -1) {
2951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 wakaba 1.13 if ($self->{in_subset}) {
2953    
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     } else {
2956    
2957     $self->{state} = DATA_STATE;
2958     $self->{s_kwd} = '';
2959     }
2960 wakaba 1.1 ## reconsume
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } else {
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972     $self->{state} = COMMENT_STATE;
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     }
2986     } elsif ($self->{state} == DOCTYPE_STATE) {
2987     if ($is_space->{$self->{nc}}) {
2988    
2989     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990    
2991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992     $self->{line_prev} = $self->{line};
2993     $self->{column_prev} = $self->{column};
2994     $self->{column}++;
2995     $self->{nc}
2996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997     } else {
2998     $self->{set_nc}->($self);
2999     }
3000    
3001     redo A;
3002     } else {
3003    
3004 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3005 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007     ## reconsume
3008     redo A;
3009     }
3010     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3012    
3013 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3014    
3015     ## Stay in the state
3016    
3017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018     $self->{line_prev} = $self->{line};
3019     $self->{column_prev} = $self->{column};
3020     $self->{column}++;
3021     $self->{nc}
3022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023     } else {
3024     $self->{set_nc}->($self);
3025     }
3026    
3027     redo A;
3028     } elsif ($self->{nc} == 0x003E) { # >
3029    
3030 wakaba 1.12 ## XML5: No parse error.
3031 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032     $self->{state} = DATA_STATE;
3033 wakaba 1.5 $self->{s_kwd} = '';
3034 wakaba 1.1
3035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036     $self->{line_prev} = $self->{line};
3037     $self->{column_prev} = $self->{column};
3038     $self->{column}++;
3039     $self->{nc}
3040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041     } else {
3042     $self->{set_nc}->($self);
3043     }
3044    
3045    
3046     return ($self->{ct}); # DOCTYPE (quirks)
3047    
3048     redo A;
3049     } elsif ($self->{nc} == -1) {
3050    
3051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052     $self->{state} = DATA_STATE;
3053 wakaba 1.5 $self->{s_kwd} = '';
3054 wakaba 1.1 ## reconsume
3055    
3056     return ($self->{ct}); # DOCTYPE (quirks)
3057    
3058     redo A;
3059 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060    
3061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064     $self->{in_subset} = 1;
3065 wakaba 1.12
3066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067     $self->{line_prev} = $self->{line};
3068     $self->{column_prev} = $self->{column};
3069     $self->{column}++;
3070     $self->{nc}
3071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072     } else {
3073     $self->{set_nc}->($self);
3074     }
3075    
3076 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3077 wakaba 1.12 redo A;
3078 wakaba 1.1 } else {
3079    
3080     $self->{ct}->{name} = chr $self->{nc};
3081     delete $self->{ct}->{quirks};
3082     $self->{state} = DOCTYPE_NAME_STATE;
3083    
3084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085     $self->{line_prev} = $self->{line};
3086     $self->{column_prev} = $self->{column};
3087     $self->{column}++;
3088     $self->{nc}
3089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090     } else {
3091     $self->{set_nc}->($self);
3092     }
3093    
3094     redo A;
3095     }
3096     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3098    
3099     ## ISSUE: Redundant "First," in the spec.
3100    
3101 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3102    
3103     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == 0x003E) { # >
3117    
3118     $self->{state} = DATA_STATE;
3119 wakaba 1.5 $self->{s_kwd} = '';
3120 wakaba 1.1
3121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122     $self->{line_prev} = $self->{line};
3123     $self->{column_prev} = $self->{column};
3124     $self->{column}++;
3125     $self->{nc}
3126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127     } else {
3128     $self->{set_nc}->($self);
3129     }
3130    
3131    
3132     return ($self->{ct}); # DOCTYPE
3133    
3134     redo A;
3135     } elsif ($self->{nc} == -1) {
3136    
3137     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138     $self->{state} = DATA_STATE;
3139 wakaba 1.5 $self->{s_kwd} = '';
3140 wakaba 1.1 ## reconsume
3141    
3142     $self->{ct}->{quirks} = 1;
3143     return ($self->{ct}); # DOCTYPE
3144    
3145     redo A;
3146 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147    
3148     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150     $self->{in_subset} = 1;
3151 wakaba 1.12
3152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153     $self->{line_prev} = $self->{line};
3154     $self->{column_prev} = $self->{column};
3155     $self->{column}++;
3156     $self->{nc}
3157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158     } else {
3159     $self->{set_nc}->($self);
3160     }
3161    
3162 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3163 wakaba 1.12 redo A;
3164 wakaba 1.1 } else {
3165    
3166     $self->{ct}->{name}
3167     .= chr ($self->{nc}); # DOCTYPE
3168     ## Stay in the state
3169    
3170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171     $self->{line_prev} = $self->{line};
3172     $self->{column_prev} = $self->{column};
3173     $self->{column}++;
3174     $self->{nc}
3175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176     } else {
3177     $self->{set_nc}->($self);
3178     }
3179    
3180     redo A;
3181     }
3182     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184     ## state", but implemented differently.
3185    
3186 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3187    
3188     ## Stay in the state
3189    
3190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191     $self->{line_prev} = $self->{line};
3192     $self->{column_prev} = $self->{column};
3193     $self->{column}++;
3194     $self->{nc}
3195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196     } else {
3197     $self->{set_nc}->($self);
3198     }
3199    
3200     redo A;
3201     } elsif ($self->{nc} == 0x003E) { # >
3202 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203    
3204     $self->{state} = DATA_STATE;
3205     $self->{s_kwd} = '';
3206     } else {
3207    
3208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210     }
3211 wakaba 1.1
3212    
3213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214     $self->{line_prev} = $self->{line};
3215     $self->{column_prev} = $self->{column};
3216     $self->{column}++;
3217     $self->{nc}
3218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219     } else {
3220     $self->{set_nc}->($self);
3221     }
3222    
3223 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 wakaba 1.1 redo A;
3225     } elsif ($self->{nc} == -1) {
3226 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227    
3228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229     $self->{state} = DATA_STATE;
3230     $self->{s_kwd} = '';
3231     $self->{ct}->{quirks} = 1;
3232     } else {
3233    
3234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236     }
3237 wakaba 1.1
3238 wakaba 1.16 ## Reconsume.
3239     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 wakaba 1.1 redo A;
3241     } elsif ($self->{nc} == 0x0050 or # P
3242     $self->{nc} == 0x0070) { # p
3243 wakaba 1.12
3244 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3245 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3246 wakaba 1.1
3247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248     $self->{line_prev} = $self->{line};
3249     $self->{column_prev} = $self->{column};
3250     $self->{column}++;
3251     $self->{nc}
3252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253     } else {
3254     $self->{set_nc}->($self);
3255     }
3256    
3257     redo A;
3258     } elsif ($self->{nc} == 0x0053 or # S
3259     $self->{nc} == 0x0073) { # s
3260 wakaba 1.12
3261 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3262 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3263    
3264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265     $self->{line_prev} = $self->{line};
3266     $self->{column_prev} = $self->{column};
3267     $self->{column}++;
3268     $self->{nc}
3269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270     } else {
3271     $self->{set_nc}->($self);
3272     }
3273    
3274     redo A;
3275 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3276     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278    
3279     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280     $self->{ct}->{value} = ''; # ENTITY
3281    
3282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283     $self->{line_prev} = $self->{line};
3284     $self->{column_prev} = $self->{column};
3285     $self->{column}++;
3286     $self->{nc}
3287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288     } else {
3289     $self->{set_nc}->($self);
3290     }
3291    
3292     redo A;
3293     } elsif ($self->{nc} == 0x0027 and # '
3294     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296    
3297     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298     $self->{ct}->{value} = ''; # ENTITY
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310     redo A;
3311 wakaba 1.16 } elsif ($self->{is_xml} and
3312     $self->{ct}->{type} == DOCTYPE_TOKEN and
3313     $self->{nc} == 0x005B) { # [
3314 wakaba 1.12
3315     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 wakaba 1.13 $self->{in_subset} = 1;
3318 wakaba 1.1
3319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320     $self->{line_prev} = $self->{line};
3321     $self->{column_prev} = $self->{column};
3322     $self->{column}++;
3323     $self->{nc}
3324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325     } else {
3326     $self->{set_nc}->($self);
3327     }
3328    
3329 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3330 wakaba 1.1 redo A;
3331     } else {
3332 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333    
3334     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335    
3336     $self->{ct}->{quirks} = 1;
3337     $self->{state} = BOGUS_DOCTYPE_STATE;
3338     } else {
3339    
3340     $self->{state} = BOGUS_MD_STATE;
3341     }
3342 wakaba 1.1
3343    
3344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345     $self->{line_prev} = $self->{line};
3346     $self->{column_prev} = $self->{column};
3347     $self->{column}++;
3348     $self->{nc}
3349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350     } else {
3351     $self->{set_nc}->($self);
3352     }
3353    
3354     redo A;
3355     }
3356     } elsif ($self->{state} == PUBLIC_STATE) {
3357     ## ASCII case-insensitive
3358     if ($self->{nc} == [
3359     undef,
3360     0x0055, # U
3361     0x0042, # B
3362     0x004C, # L
3363     0x0049, # I
3364 wakaba 1.12 ]->[length $self->{kwd}] or
3365 wakaba 1.1 $self->{nc} == [
3366     undef,
3367     0x0075, # u
3368     0x0062, # b
3369     0x006C, # l
3370     0x0069, # i
3371 wakaba 1.12 ]->[length $self->{kwd}]) {
3372 wakaba 1.1
3373     ## Stay in the state.
3374 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3375 wakaba 1.1
3376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377     $self->{line_prev} = $self->{line};
3378     $self->{column_prev} = $self->{column};
3379     $self->{column}++;
3380     $self->{nc}
3381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382     } else {
3383     $self->{set_nc}->($self);
3384     }
3385    
3386     redo A;
3387 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3388 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3389     $self->{nc} == 0x0063)) { # c
3390 wakaba 1.12 if ($self->{is_xml} and
3391     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392    
3393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394     text => 'PUBLIC',
3395     line => $self->{line_prev},
3396     column => $self->{column_prev} - 4);
3397     } else {
3398    
3399     }
3400 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401    
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412     redo A;
3413     } else {
3414 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 wakaba 1.1 line => $self->{line_prev},
3416 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418    
3419     $self->{ct}->{quirks} = 1;
3420     $self->{state} = BOGUS_DOCTYPE_STATE;
3421     } else {
3422    
3423     $self->{state} = BOGUS_MD_STATE;
3424     }
3425 wakaba 1.1 ## Reconsume.
3426     redo A;
3427     }
3428     } elsif ($self->{state} == SYSTEM_STATE) {
3429     ## ASCII case-insensitive
3430     if ($self->{nc} == [
3431     undef,
3432     0x0059, # Y
3433     0x0053, # S
3434     0x0054, # T
3435     0x0045, # E
3436 wakaba 1.12 ]->[length $self->{kwd}] or
3437 wakaba 1.1 $self->{nc} == [
3438     undef,
3439     0x0079, # y
3440     0x0073, # s
3441     0x0074, # t
3442     0x0065, # e
3443 wakaba 1.12 ]->[length $self->{kwd}]) {
3444 wakaba 1.1
3445     ## Stay in the state.
3446 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3447 wakaba 1.1
3448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449     $self->{line_prev} = $self->{line};
3450     $self->{column_prev} = $self->{column};
3451     $self->{column}++;
3452     $self->{nc}
3453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454     } else {
3455     $self->{set_nc}->($self);
3456     }
3457    
3458     redo A;
3459 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3460 wakaba 1.1 ($self->{nc} == 0x004D or # M
3461     $self->{nc} == 0x006D)) { # m
3462 wakaba 1.12 if ($self->{is_xml} and
3463     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464    
3465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466     text => 'SYSTEM',
3467     line => $self->{line_prev},
3468     column => $self->{column_prev} - 4);
3469     } else {
3470    
3471     }
3472 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473    
3474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475     $self->{line_prev} = $self->{line};
3476     $self->{column_prev} = $self->{column};
3477     $self->{column}++;
3478     $self->{nc}
3479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480     } else {
3481     $self->{set_nc}->($self);
3482     }
3483    
3484     redo A;
3485     } else {
3486 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 wakaba 1.1 line => $self->{line_prev},
3488 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490    
3491     $self->{ct}->{quirks} = 1;
3492     $self->{state} = BOGUS_DOCTYPE_STATE;
3493     } else {
3494    
3495     $self->{state} = BOGUS_MD_STATE;
3496     }
3497 wakaba 1.1 ## Reconsume.
3498     redo A;
3499     }
3500     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501     if ($is_space->{$self->{nc}}) {
3502    
3503     ## Stay in the state
3504    
3505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506     $self->{line_prev} = $self->{line};
3507     $self->{column_prev} = $self->{column};
3508     $self->{column}++;
3509     $self->{nc}
3510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511     } else {
3512     $self->{set_nc}->($self);
3513     }
3514    
3515     redo A;
3516     } elsif ($self->{nc} eq 0x0022) { # "
3517    
3518     $self->{ct}->{pubid} = ''; # DOCTYPE
3519     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520    
3521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522     $self->{line_prev} = $self->{line};
3523     $self->{column_prev} = $self->{column};
3524     $self->{column}++;
3525     $self->{nc}
3526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527     } else {
3528     $self->{set_nc}->($self);
3529     }
3530    
3531     redo A;
3532     } elsif ($self->{nc} eq 0x0027) { # '
3533    
3534     $self->{ct}->{pubid} = ''; # DOCTYPE
3535     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536    
3537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538     $self->{line_prev} = $self->{line};
3539     $self->{column_prev} = $self->{column};
3540     $self->{column}++;
3541     $self->{nc}
3542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543     } else {
3544     $self->{set_nc}->($self);
3545     }
3546    
3547     redo A;
3548     } elsif ($self->{nc} eq 0x003E) { # >
3549 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550    
3551     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552    
3553     $self->{state} = DATA_STATE;
3554     $self->{s_kwd} = '';
3555     $self->{ct}->{quirks} = 1;
3556     } else {
3557    
3558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559     }
3560 wakaba 1.1
3561    
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 wakaba 1.1 redo A;
3574     } elsif ($self->{nc} == -1) {
3575 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576    
3577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578     $self->{state} = DATA_STATE;
3579     $self->{s_kwd} = '';
3580     $self->{ct}->{quirks} = 1;
3581     } else {
3582    
3583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585     }
3586 wakaba 1.1
3587     ## reconsume
3588     return ($self->{ct}); # DOCTYPE
3589     redo A;
3590 wakaba 1.16 } elsif ($self->{is_xml} and
3591     $self->{ct}->{type} == DOCTYPE_TOKEN and
3592     $self->{nc} == 0x005B) { # [
3593 wakaba 1.12
3594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 wakaba 1.13 $self->{in_subset} = 1;
3598 wakaba 1.12
3599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600     $self->{line_prev} = $self->{line};
3601     $self->{column_prev} = $self->{column};
3602     $self->{column}++;
3603     $self->{nc}
3604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605     } else {
3606     $self->{set_nc}->($self);
3607     }
3608    
3609 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3610 wakaba 1.12 redo A;
3611 wakaba 1.1 } else {
3612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613    
3614 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615    
3616     $self->{ct}->{quirks} = 1;
3617     $self->{state} = BOGUS_DOCTYPE_STATE;
3618     } else {
3619    
3620     $self->{state} = BOGUS_MD_STATE;
3621     }
3622    
3623 wakaba 1.1
3624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625     $self->{line_prev} = $self->{line};
3626     $self->{column_prev} = $self->{column};
3627     $self->{column}++;
3628     $self->{nc}
3629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630     } else {
3631     $self->{set_nc}->($self);
3632     }
3633    
3634     redo A;
3635     }
3636     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637     if ($self->{nc} == 0x0022) { # "
3638    
3639     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640    
3641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642     $self->{line_prev} = $self->{line};
3643     $self->{column_prev} = $self->{column};
3644     $self->{column}++;
3645     $self->{nc}
3646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647     } else {
3648     $self->{set_nc}->($self);
3649     }
3650    
3651     redo A;
3652     } elsif ($self->{nc} == 0x003E) { # >
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654    
3655 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656    
3657     $self->{state} = DATA_STATE;
3658     $self->{s_kwd} = '';
3659     $self->{ct}->{quirks} = 1;
3660     } else {
3661    
3662     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663     }
3664    
3665 wakaba 1.1
3666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667     $self->{line_prev} = $self->{line};
3668     $self->{column_prev} = $self->{column};
3669     $self->{column}++;
3670     $self->{nc}
3671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672     } else {
3673     $self->{set_nc}->($self);
3674     }
3675    
3676 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 wakaba 1.1 redo A;
3678     } elsif ($self->{nc} == -1) {
3679     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680    
3681 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682    
3683     $self->{state} = DATA_STATE;
3684     $self->{s_kwd} = '';
3685     $self->{ct}->{quirks} = 1;
3686     } else {
3687    
3688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689     }
3690    
3691     ## Reconsume.
3692 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3693     redo A;
3694     } else {
3695    
3696 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698     length $self->{ct}->{pubid});
3699    
3700     ## Stay in the state
3701    
3702     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703     $self->{line_prev} = $self->{line};
3704     $self->{column_prev} = $self->{column};
3705     $self->{column}++;
3706     $self->{nc}
3707     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708     } else {
3709     $self->{set_nc}->($self);
3710     }
3711    
3712     redo A;
3713     }
3714     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715     if ($self->{nc} == 0x0027) { # '
3716    
3717     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718    
3719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720     $self->{line_prev} = $self->{line};
3721     $self->{column_prev} = $self->{column};
3722     $self->{column}++;
3723     $self->{nc}
3724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725     } else {
3726     $self->{set_nc}->($self);
3727     }
3728    
3729     redo A;
3730     } elsif ($self->{nc} == 0x003E) { # >
3731     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732    
3733 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734    
3735     $self->{state} = DATA_STATE;
3736     $self->{s_kwd} = '';
3737     $self->{ct}->{quirks} = 1;
3738     } else {
3739    
3740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741     }
3742    
3743 wakaba 1.1
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 wakaba 1.1 redo A;
3756     } elsif ($self->{nc} == -1) {
3757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758    
3759 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760    
3761     $self->{state} = DATA_STATE;
3762     $self->{s_kwd} = '';
3763     $self->{ct}->{quirks} = 1;
3764     } else {
3765    
3766     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767     }
3768    
3769 wakaba 1.1 ## reconsume
3770 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 wakaba 1.1 redo A;
3772     } else {
3773    
3774 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776     length $self->{ct}->{pubid});
3777    
3778     ## Stay in the state
3779    
3780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781     $self->{line_prev} = $self->{line};
3782     $self->{column_prev} = $self->{column};
3783     $self->{column}++;
3784     $self->{nc}
3785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786     } else {
3787     $self->{set_nc}->($self);
3788     }
3789    
3790     redo A;
3791     }
3792     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793     if ($is_space->{$self->{nc}}) {
3794    
3795     ## Stay in the state
3796    
3797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798     $self->{line_prev} = $self->{line};
3799     $self->{column_prev} = $self->{column};
3800     $self->{column}++;
3801     $self->{nc}
3802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803     } else {
3804     $self->{set_nc}->($self);
3805     }
3806    
3807     redo A;
3808     } elsif ($self->{nc} == 0x0022) { # "
3809    
3810 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812    
3813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814     $self->{line_prev} = $self->{line};
3815     $self->{column_prev} = $self->{column};
3816     $self->{column}++;
3817     $self->{nc}
3818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819     } else {
3820     $self->{set_nc}->($self);
3821     }
3822    
3823     redo A;
3824     } elsif ($self->{nc} == 0x0027) { # '
3825    
3826 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828    
3829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830     $self->{line_prev} = $self->{line};
3831     $self->{column_prev} = $self->{column};
3832     $self->{column}++;
3833     $self->{nc}
3834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835     } else {
3836     $self->{set_nc}->($self);
3837     }
3838    
3839     redo A;
3840     } elsif ($self->{nc} == 0x003E) { # >
3841 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842     if ($self->{is_xml}) {
3843    
3844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845     } else {
3846    
3847     }
3848     $self->{state} = DATA_STATE;
3849     $self->{s_kwd} = '';
3850 wakaba 1.12 } else {
3851 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852    
3853     } else {
3854    
3855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856     }
3857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 wakaba 1.12 }
3859 wakaba 1.16
3860 wakaba 1.1
3861     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862     $self->{line_prev} = $self->{line};
3863     $self->{column_prev} = $self->{column};
3864     $self->{column}++;
3865     $self->{nc}
3866     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867     } else {
3868     $self->{set_nc}->($self);
3869     }
3870    
3871 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 wakaba 1.1 redo A;
3873     } elsif ($self->{nc} == -1) {
3874 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875    
3876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877    
3878     $self->{state} = DATA_STATE;
3879     $self->{s_kwd} = '';
3880     $self->{ct}->{quirks} = 1;
3881     } else {
3882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884     }
3885 wakaba 1.1
3886     ## reconsume
3887 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 wakaba 1.1 redo A;
3889 wakaba 1.16 } elsif ($self->{is_xml} and
3890     $self->{ct}->{type} == DOCTYPE_TOKEN and
3891     $self->{nc} == 0x005B) { # [
3892 wakaba 1.12
3893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 wakaba 1.13 $self->{in_subset} = 1;
3897 wakaba 1.12
3898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899     $self->{line_prev} = $self->{line};
3900     $self->{column_prev} = $self->{column};
3901     $self->{column}++;
3902     $self->{nc}
3903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904     } else {
3905     $self->{set_nc}->($self);
3906     }
3907    
3908 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3909 wakaba 1.12 redo A;
3910 wakaba 1.1 } else {
3911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912    
3913 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914    
3915     $self->{ct}->{quirks} = 1;
3916     $self->{state} = BOGUS_DOCTYPE_STATE;
3917     } else {
3918    
3919     $self->{state} = BOGUS_MD_STATE;
3920     }
3921    
3922 wakaba 1.1
3923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924     $self->{line_prev} = $self->{line};
3925     $self->{column_prev} = $self->{column};
3926     $self->{column}++;
3927     $self->{nc}
3928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929     } else {
3930     $self->{set_nc}->($self);
3931     }
3932    
3933     redo A;
3934     }
3935     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936     if ($is_space->{$self->{nc}}) {
3937    
3938     ## Stay in the state
3939    
3940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941     $self->{line_prev} = $self->{line};
3942     $self->{column_prev} = $self->{column};
3943     $self->{column}++;
3944     $self->{nc}
3945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946     } else {
3947     $self->{set_nc}->($self);
3948     }
3949    
3950     redo A;
3951     } elsif ($self->{nc} == 0x0022) { # "
3952    
3953     $self->{ct}->{sysid} = ''; # DOCTYPE
3954     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955    
3956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957     $self->{line_prev} = $self->{line};
3958     $self->{column_prev} = $self->{column};
3959     $self->{column}++;
3960     $self->{nc}
3961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962     } else {
3963     $self->{set_nc}->($self);
3964     }
3965    
3966     redo A;
3967     } elsif ($self->{nc} == 0x0027) { # '
3968    
3969     $self->{ct}->{sysid} = ''; # DOCTYPE
3970     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971    
3972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973     $self->{line_prev} = $self->{line};
3974     $self->{column_prev} = $self->{column};
3975     $self->{column}++;
3976     $self->{nc}
3977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978     } else {
3979     $self->{set_nc}->($self);
3980     }
3981    
3982     redo A;
3983     } elsif ($self->{nc} == 0x003E) { # >
3984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
3986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987     $self->{line_prev} = $self->{line};
3988     $self->{column_prev} = $self->{column};
3989     $self->{column}++;
3990     $self->{nc}
3991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992     } else {
3993     $self->{set_nc}->($self);
3994     }
3995    
3996    
3997 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998    
3999     $self->{state} = DATA_STATE;
4000     $self->{s_kwd} = '';
4001     $self->{ct}->{quirks} = 1;
4002     } else {
4003    
4004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005     }
4006 wakaba 1.1
4007 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 wakaba 1.1 redo A;
4009     } elsif ($self->{nc} == -1) {
4010 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011    
4012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013     $self->{state} = DATA_STATE;
4014     $self->{s_kwd} = '';
4015     $self->{ct}->{quirks} = 1;
4016     } else {
4017    
4018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020     }
4021 wakaba 1.1
4022     ## reconsume
4023 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 wakaba 1.1 redo A;
4025 wakaba 1.16 } elsif ($self->{is_xml} and
4026     $self->{ct}->{type} == DOCTYPE_TOKEN and
4027     $self->{nc} == 0x005B) { # [
4028 wakaba 1.12
4029     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030    
4031     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 wakaba 1.13 $self->{in_subset} = 1;
4034 wakaba 1.12
4035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036     $self->{line_prev} = $self->{line};
4037     $self->{column_prev} = $self->{column};
4038     $self->{column}++;
4039     $self->{nc}
4040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041     } else {
4042     $self->{set_nc}->($self);
4043     }
4044    
4045 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4046 wakaba 1.12 redo A;
4047 wakaba 1.1 } else {
4048     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049    
4050 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051    
4052     $self->{ct}->{quirks} = 1;
4053     $self->{state} = BOGUS_DOCTYPE_STATE;
4054     } else {
4055    
4056     $self->{state} = BOGUS_MD_STATE;
4057     }
4058    
4059 wakaba 1.1
4060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061     $self->{line_prev} = $self->{line};
4062     $self->{column_prev} = $self->{column};
4063     $self->{column}++;
4064     $self->{nc}
4065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066     } else {
4067     $self->{set_nc}->($self);
4068     }
4069    
4070     redo A;
4071     }
4072     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073     if ($self->{nc} == 0x0022) { # "
4074    
4075     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076    
4077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078     $self->{line_prev} = $self->{line};
4079     $self->{column_prev} = $self->{column};
4080     $self->{column}++;
4081     $self->{nc}
4082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083     } else {
4084     $self->{set_nc}->($self);
4085     }
4086    
4087     redo A;
4088 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090    
4091 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092    
4093     $self->{state} = DATA_STATE;
4094     $self->{s_kwd} = '';
4095     $self->{ct}->{quirks} = 1;
4096     } else {
4097    
4098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099     }
4100    
4101 wakaba 1.1
4102     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103     $self->{line_prev} = $self->{line};
4104     $self->{column_prev} = $self->{column};
4105     $self->{column}++;
4106     $self->{nc}
4107     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108     } else {
4109     $self->{set_nc}->($self);
4110     }
4111    
4112 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 wakaba 1.1 redo A;
4114     } elsif ($self->{nc} == -1) {
4115     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116    
4117 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118    
4119     $self->{state} = DATA_STATE;
4120     $self->{s_kwd} = '';
4121     $self->{ct}->{quirks} = 1;
4122     } else {
4123    
4124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125     }
4126    
4127 wakaba 1.1 ## reconsume
4128 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 wakaba 1.1 redo A;
4130     } else {
4131    
4132 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134     length $self->{ct}->{sysid});
4135    
4136     ## Stay in the state
4137    
4138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139     $self->{line_prev} = $self->{line};
4140     $self->{column_prev} = $self->{column};
4141     $self->{column}++;
4142     $self->{nc}
4143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144     } else {
4145     $self->{set_nc}->($self);
4146     }
4147    
4148     redo A;
4149     }
4150     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151     if ($self->{nc} == 0x0027) { # '
4152    
4153     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154    
4155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156     $self->{line_prev} = $self->{line};
4157     $self->{column_prev} = $self->{column};
4158     $self->{column}++;
4159     $self->{nc}
4160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161     } else {
4162     $self->{set_nc}->($self);
4163     }
4164    
4165     redo A;
4166 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167 wakaba 1.1
4168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169    
4170     $self->{state} = DATA_STATE;
4171 wakaba 1.5 $self->{s_kwd} = '';
4172 wakaba 1.1
4173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174     $self->{line_prev} = $self->{line};
4175     $self->{column_prev} = $self->{column};
4176     $self->{column}++;
4177     $self->{nc}
4178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179     } else {
4180     $self->{set_nc}->($self);
4181     }
4182    
4183    
4184     $self->{ct}->{quirks} = 1;
4185     return ($self->{ct}); # DOCTYPE
4186    
4187     redo A;
4188     } elsif ($self->{nc} == -1) {
4189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190    
4191 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192    
4193     $self->{state} = DATA_STATE;
4194     $self->{s_kwd} = '';
4195     $self->{ct}->{quirks} = 1;
4196     } else {
4197    
4198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199     }
4200    
4201 wakaba 1.1 ## reconsume
4202 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 wakaba 1.1 redo A;
4204     } else {
4205    
4206 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208     length $self->{ct}->{sysid});
4209    
4210     ## Stay in the state
4211    
4212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213     $self->{line_prev} = $self->{line};
4214     $self->{column_prev} = $self->{column};
4215     $self->{column}++;
4216     $self->{nc}
4217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218     } else {
4219     $self->{set_nc}->($self);
4220     }
4221    
4222     redo A;
4223     }
4224     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225     if ($is_space->{$self->{nc}}) {
4226 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227    
4228     $self->{state} = BEFORE_NDATA_STATE;
4229     } else {
4230    
4231     ## Stay in the state
4232     }
4233 wakaba 1.1
4234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235     $self->{line_prev} = $self->{line};
4236     $self->{column_prev} = $self->{column};
4237     $self->{column}++;
4238     $self->{nc}
4239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240     } else {
4241     $self->{set_nc}->($self);
4242     }
4243    
4244     redo A;
4245     } elsif ($self->{nc} == 0x003E) { # >
4246 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247    
4248     $self->{state} = DATA_STATE;
4249     $self->{s_kwd} = '';
4250     } else {
4251    
4252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253     }
4254    
4255 wakaba 1.1
4256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257     $self->{line_prev} = $self->{line};
4258     $self->{column_prev} = $self->{column};
4259     $self->{column}++;
4260     $self->{nc}
4261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262     } else {
4263     $self->{set_nc}->($self);
4264     }
4265    
4266 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 wakaba 1.1 redo A;
4268 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269     ($self->{nc} == 0x004E or # N
4270     $self->{nc} == 0x006E)) { # n
4271    
4272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273     $self->{state} = NDATA_STATE;
4274     $self->{kwd} = chr $self->{nc};
4275    
4276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277     $self->{line_prev} = $self->{line};
4278     $self->{column_prev} = $self->{column};
4279     $self->{column}++;
4280     $self->{nc}
4281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282     } else {
4283     $self->{set_nc}->($self);
4284     }
4285    
4286     redo A;
4287 wakaba 1.1 } elsif ($self->{nc} == -1) {
4288 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289    
4290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291     $self->{state} = DATA_STATE;
4292     $self->{s_kwd} = '';
4293     $self->{ct}->{quirks} = 1;
4294     } else {
4295    
4296     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298     }
4299    
4300 wakaba 1.1 ## reconsume
4301 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 wakaba 1.1 redo A;
4303 wakaba 1.16 } elsif ($self->{is_xml} and
4304     $self->{ct}->{type} == DOCTYPE_TOKEN and
4305     $self->{nc} == 0x005B) { # [
4306 wakaba 1.12
4307     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 wakaba 1.13 $self->{in_subset} = 1;
4310 wakaba 1.12
4311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312     $self->{line_prev} = $self->{line};
4313     $self->{column_prev} = $self->{column};
4314     $self->{column}++;
4315     $self->{nc}
4316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317     } else {
4318     $self->{set_nc}->($self);
4319     }
4320    
4321 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4322 wakaba 1.12 redo A;
4323 wakaba 1.1 } else {
4324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325    
4326 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327    
4328     #$self->{ct}->{quirks} = 1;
4329     $self->{state} = BOGUS_DOCTYPE_STATE;
4330     } else {
4331    
4332     $self->{state} = BOGUS_MD_STATE;
4333     }
4334    
4335 wakaba 1.1
4336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337     $self->{line_prev} = $self->{line};
4338     $self->{column_prev} = $self->{column};
4339     $self->{column}++;
4340     $self->{nc}
4341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342     } else {
4343     $self->{set_nc}->($self);
4344     }
4345    
4346     redo A;
4347     }
4348 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349     if ($is_space->{$self->{nc}}) {
4350    
4351     ## Stay in the state.
4352    
4353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354     $self->{line_prev} = $self->{line};
4355     $self->{column_prev} = $self->{column};
4356     $self->{column}++;
4357     $self->{nc}
4358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359     } else {
4360     $self->{set_nc}->($self);
4361     }
4362    
4363     redo A;
4364     } elsif ($self->{nc} == 0x003E) { # >
4365    
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367    
4368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369     $self->{line_prev} = $self->{line};
4370     $self->{column_prev} = $self->{column};
4371     $self->{column}++;
4372     $self->{nc}
4373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374     } else {
4375     $self->{set_nc}->($self);
4376     }
4377    
4378     return ($self->{ct}); # ENTITY
4379     redo A;
4380     } elsif ($self->{nc} == 0x004E or # N
4381     $self->{nc} == 0x006E) { # n
4382    
4383     $self->{state} = NDATA_STATE;
4384     $self->{kwd} = chr $self->{nc};
4385    
4386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387     $self->{line_prev} = $self->{line};
4388     $self->{column_prev} = $self->{column};
4389     $self->{column}++;
4390     $self->{nc}
4391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392     } else {
4393     $self->{set_nc}->($self);
4394     }
4395    
4396     redo A;
4397     } elsif ($self->{nc} == -1) {
4398    
4399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401     ## reconsume
4402     return ($self->{ct}); # ENTITY
4403     redo A;
4404     } else {
4405    
4406     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407     $self->{state} = BOGUS_MD_STATE;
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     redo A;
4420     }
4421 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422     if ($self->{nc} == 0x003E) { # >
4423    
4424     $self->{state} = DATA_STATE;
4425 wakaba 1.5 $self->{s_kwd} = '';
4426 wakaba 1.1
4427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428     $self->{line_prev} = $self->{line};
4429     $self->{column_prev} = $self->{column};
4430     $self->{column}++;
4431     $self->{nc}
4432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433     } else {
4434     $self->{set_nc}->($self);
4435     }
4436    
4437    
4438     return ($self->{ct}); # DOCTYPE
4439    
4440     redo A;
4441 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442 wakaba 1.13
4443     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445     $self->{in_subset} = 1;
4446    
4447 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448     $self->{line_prev} = $self->{line};
4449     $self->{column_prev} = $self->{column};
4450     $self->{column}++;
4451     $self->{nc}
4452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453     } else {
4454     $self->{set_nc}->($self);
4455     }
4456    
4457 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4458     redo A;
4459 wakaba 1.1 } elsif ($self->{nc} == -1) {
4460    
4461     $self->{state} = DATA_STATE;
4462 wakaba 1.5 $self->{s_kwd} = '';
4463 wakaba 1.1 ## reconsume
4464    
4465     return ($self->{ct}); # DOCTYPE
4466    
4467     redo A;
4468     } else {
4469    
4470     my $s = '';
4471 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4472 wakaba 1.1
4473     ## Stay in the state
4474    
4475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476     $self->{line_prev} = $self->{line};
4477     $self->{column_prev} = $self->{column};
4478     $self->{column}++;
4479     $self->{nc}
4480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481     } else {
4482     $self->{set_nc}->($self);
4483     }
4484    
4485     redo A;
4486     }
4487     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488     ## NOTE: "CDATA section state" in the spec is jointly implemented
4489     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490     ## and |CDATA_SECTION_MSE2_STATE|.
4491 wakaba 1.10
4492     ## XML5: "CDATA state".
4493 wakaba 1.1
4494     if ($self->{nc} == 0x005D) { # ]
4495    
4496     $self->{state} = CDATA_SECTION_MSE1_STATE;
4497    
4498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499     $self->{line_prev} = $self->{line};
4500     $self->{column_prev} = $self->{column};
4501     $self->{column}++;
4502     $self->{nc}
4503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504     } else {
4505     $self->{set_nc}->($self);
4506     }
4507    
4508     redo A;
4509     } elsif ($self->{nc} == -1) {
4510 wakaba 1.6 if ($self->{is_xml}) {
4511 wakaba 1.8
4512 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 wakaba 1.8 } else {
4514    
4515 wakaba 1.6 }
4516    
4517 wakaba 1.1 $self->{state} = DATA_STATE;
4518 wakaba 1.5 $self->{s_kwd} = '';
4519 wakaba 1.10 ## Reconsume.
4520 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4521    
4522     return ($self->{ct}); # character
4523     } else {
4524    
4525     ## No token to emit. $self->{ct} is discarded.
4526     }
4527     redo A;
4528     } else {
4529    
4530     $self->{ct}->{data} .= chr $self->{nc};
4531     $self->{read_until}->($self->{ct}->{data},
4532     q<]>,
4533     length $self->{ct}->{data});
4534    
4535     ## Stay in the state.
4536    
4537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538     $self->{line_prev} = $self->{line};
4539     $self->{column_prev} = $self->{column};
4540     $self->{column}++;
4541     $self->{nc}
4542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543     } else {
4544     $self->{set_nc}->($self);
4545     }
4546    
4547     redo A;
4548     }
4549    
4550     ## ISSUE: "text tokens" in spec.
4551     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 wakaba 1.10 ## XML5: "CDATA bracket state".
4553    
4554 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4555    
4556     $self->{state} = CDATA_SECTION_MSE2_STATE;
4557    
4558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559     $self->{line_prev} = $self->{line};
4560     $self->{column_prev} = $self->{column};
4561     $self->{column}++;
4562     $self->{nc}
4563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564     } else {
4565     $self->{set_nc}->($self);
4566     }
4567    
4568     redo A;
4569     } else {
4570    
4571 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4572 wakaba 1.1 $self->{ct}->{data} .= ']';
4573 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 wakaba 1.1 ## Reconsume.
4575     redo A;
4576     }
4577     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 wakaba 1.10 ## XML5: "CDATA end state".
4579    
4580 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4581     $self->{state} = DATA_STATE;
4582 wakaba 1.5 $self->{s_kwd} = '';
4583 wakaba 1.1
4584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585     $self->{line_prev} = $self->{line};
4586     $self->{column_prev} = $self->{column};
4587     $self->{column}++;
4588     $self->{nc}
4589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590     } else {
4591     $self->{set_nc}->($self);
4592     }
4593    
4594     if (length $self->{ct}->{data}) { # character
4595    
4596     return ($self->{ct}); # character
4597     } else {
4598    
4599     ## No token to emit. $self->{ct} is discarded.
4600     }
4601     redo A;
4602     } elsif ($self->{nc} == 0x005D) { # ]
4603     # character
4604     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605     ## Stay in the state.
4606    
4607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608     $self->{line_prev} = $self->{line};
4609     $self->{column_prev} = $self->{column};
4610     $self->{column}++;
4611     $self->{nc}
4612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613     } else {
4614     $self->{set_nc}->($self);
4615     }
4616    
4617     redo A;
4618     } else {
4619    
4620     $self->{ct}->{data} .= ']]'; # character
4621     $self->{state} = CDATA_SECTION_STATE;
4622 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4623 wakaba 1.1 redo A;
4624     }
4625     } elsif ($self->{state} == ENTITY_STATE) {
4626     if ($is_space->{$self->{nc}} or
4627     {
4628     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4629     $self->{entity_add} => 1,
4630     }->{$self->{nc}}) {
4631    
4632     ## Don't consume
4633     ## No error
4634     ## Return nothing.
4635     #
4636     } elsif ($self->{nc} == 0x0023) { # #
4637    
4638     $self->{state} = ENTITY_HASH_STATE;
4639 wakaba 1.12 $self->{kwd} = '#';
4640 wakaba 1.1
4641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642     $self->{line_prev} = $self->{line};
4643     $self->{column_prev} = $self->{column};
4644     $self->{column}++;
4645     $self->{nc}
4646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647     } else {
4648     $self->{set_nc}->($self);
4649     }
4650    
4651     redo A;
4652     } elsif ((0x0041 <= $self->{nc} and
4653     $self->{nc} <= 0x005A) or # A..Z
4654     (0x0061 <= $self->{nc} and
4655     $self->{nc} <= 0x007A)) { # a..z
4656    
4657     require Whatpm::_NamedEntityList;
4658     $self->{state} = ENTITY_NAME_STATE;
4659 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4660     $self->{entity__value} = $self->{kwd};
4661 wakaba 1.1 $self->{entity__match} = 0;
4662    
4663     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4664     $self->{line_prev} = $self->{line};
4665     $self->{column_prev} = $self->{column};
4666     $self->{column}++;
4667     $self->{nc}
4668     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4669     } else {
4670     $self->{set_nc}->($self);
4671     }
4672    
4673     redo A;
4674     } else {
4675    
4676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4677     ## Return nothing.
4678     #
4679     }
4680    
4681     ## NOTE: No character is consumed by the "consume a character
4682     ## reference" algorithm. In other words, there is an "&" character
4683     ## that does not introduce a character reference, which would be
4684     ## appended to the parent element or the attribute value in the
4685     ## later processing of the tokenizer.
4686    
4687     if ($self->{prev_state} == DATA_STATE) {
4688    
4689     $self->{state} = $self->{prev_state};
4690 wakaba 1.5 $self->{s_kwd} = '';
4691 wakaba 1.1 ## Reconsume.
4692     return ({type => CHARACTER_TOKEN, data => '&',
4693     line => $self->{line_prev},
4694     column => $self->{column_prev},
4695     });
4696     redo A;
4697     } else {
4698    
4699     $self->{ca}->{value} .= '&';
4700     $self->{state} = $self->{prev_state};
4701 wakaba 1.5 $self->{s_kwd} = '';
4702 wakaba 1.1 ## Reconsume.
4703     redo A;
4704     }
4705     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4706 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4707 wakaba 1.1
4708     $self->{state} = HEXREF_X_STATE;
4709 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4710 wakaba 1.1
4711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4712     $self->{line_prev} = $self->{line};
4713     $self->{column_prev} = $self->{column};
4714     $self->{column}++;
4715     $self->{nc}
4716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4717     } else {
4718     $self->{set_nc}->($self);
4719     }
4720    
4721     redo A;
4722 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4723    
4724     if ($self->{is_xml}) {
4725     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4726     }
4727     $self->{state} = HEXREF_X_STATE;
4728     $self->{kwd} .= chr $self->{nc};
4729    
4730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4731     $self->{line_prev} = $self->{line};
4732     $self->{column_prev} = $self->{column};
4733     $self->{column}++;
4734     $self->{nc}
4735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4736     } else {
4737     $self->{set_nc}->($self);
4738     }
4739    
4740     redo A;
4741 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4742     $self->{nc} <= 0x0039) { # 0..9
4743    
4744     $self->{state} = NCR_NUM_STATE;
4745 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4746 wakaba 1.1
4747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4748     $self->{line_prev} = $self->{line};
4749     $self->{column_prev} = $self->{column};
4750     $self->{column}++;
4751     $self->{nc}
4752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4753     } else {
4754     $self->{set_nc}->($self);
4755     }
4756    
4757     redo A;
4758     } else {
4759     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4760     line => $self->{line_prev},
4761     column => $self->{column_prev} - 1);
4762    
4763     ## NOTE: According to the spec algorithm, nothing is returned,
4764     ## and then "&#" is appended to the parent element or the attribute
4765     ## value in the later processing.
4766    
4767     if ($self->{prev_state} == DATA_STATE) {
4768    
4769     $self->{state} = $self->{prev_state};
4770 wakaba 1.5 $self->{s_kwd} = '';
4771 wakaba 1.1 ## Reconsume.
4772     return ({type => CHARACTER_TOKEN,
4773     data => '&#',
4774     line => $self->{line_prev},
4775     column => $self->{column_prev} - 1,
4776     });
4777     redo A;
4778     } else {
4779    
4780     $self->{ca}->{value} .= '&#';
4781     $self->{state} = $self->{prev_state};
4782 wakaba 1.5 $self->{s_kwd} = '';
4783 wakaba 1.1 ## Reconsume.
4784     redo A;
4785     }
4786     }
4787     } elsif ($self->{state} == NCR_NUM_STATE) {
4788     if (0x0030 <= $self->{nc} and
4789     $self->{nc} <= 0x0039) { # 0..9
4790    
4791 wakaba 1.12 $self->{kwd} *= 10;
4792     $self->{kwd} += $self->{nc} - 0x0030;
4793 wakaba 1.1
4794     ## Stay in the state.
4795    
4796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4797     $self->{line_prev} = $self->{line};
4798     $self->{column_prev} = $self->{column};
4799     $self->{column}++;
4800     $self->{nc}
4801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4802     } else {
4803     $self->{set_nc}->($self);
4804     }
4805    
4806     redo A;
4807     } elsif ($self->{nc} == 0x003B) { # ;
4808    
4809    
4810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4811     $self->{line_prev} = $self->{line};
4812     $self->{column_prev} = $self->{column};
4813     $self->{column}++;
4814     $self->{nc}
4815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4816     } else {
4817     $self->{set_nc}->($self);
4818     }
4819    
4820     #
4821     } else {
4822    
4823     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4824     ## Reconsume.
4825     #
4826     }
4827    
4828 wakaba 1.12 my $code = $self->{kwd};
4829 wakaba 1.1 my $l = $self->{line_prev};
4830     my $c = $self->{column_prev};
4831     if ($charref_map->{$code}) {
4832    
4833     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4834     text => (sprintf 'U+%04X', $code),
4835     line => $l, column => $c);
4836     $code = $charref_map->{$code};
4837     } elsif ($code > 0x10FFFF) {
4838    
4839     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4840     text => (sprintf 'U-%08X', $code),
4841     line => $l, column => $c);
4842     $code = 0xFFFD;
4843     }
4844    
4845     if ($self->{prev_state} == DATA_STATE) {
4846    
4847     $self->{state} = $self->{prev_state};
4848 wakaba 1.5 $self->{s_kwd} = '';
4849 wakaba 1.1 ## Reconsume.
4850     return ({type => CHARACTER_TOKEN, data => chr $code,
4851 wakaba 1.7 has_reference => 1,
4852 wakaba 1.1 line => $l, column => $c,
4853     });
4854     redo A;
4855     } else {
4856    
4857     $self->{ca}->{value} .= chr $code;
4858     $self->{ca}->{has_reference} = 1;
4859     $self->{state} = $self->{prev_state};
4860 wakaba 1.5 $self->{s_kwd} = '';
4861 wakaba 1.1 ## Reconsume.
4862     redo A;
4863     }
4864     } elsif ($self->{state} == HEXREF_X_STATE) {
4865     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4866     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4867     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4868     # 0..9, A..F, a..f
4869    
4870     $self->{state} = HEXREF_HEX_STATE;
4871 wakaba 1.12 $self->{kwd} = 0;
4872 wakaba 1.1 ## Reconsume.
4873     redo A;
4874     } else {
4875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4876     line => $self->{line_prev},
4877     column => $self->{column_prev} - 2);
4878    
4879     ## NOTE: According to the spec algorithm, nothing is returned,
4880     ## and then "&#" followed by "X" or "x" is appended to the parent
4881     ## element or the attribute value in the later processing.
4882    
4883     if ($self->{prev_state} == DATA_STATE) {
4884    
4885     $self->{state} = $self->{prev_state};
4886 wakaba 1.5 $self->{s_kwd} = '';
4887 wakaba 1.1 ## Reconsume.
4888     return ({type => CHARACTER_TOKEN,
4889 wakaba 1.12 data => '&' . $self->{kwd},
4890 wakaba 1.1 line => $self->{line_prev},
4891 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4892 wakaba 1.1 });
4893     redo A;
4894     } else {
4895    
4896 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4897 wakaba 1.1 $self->{state} = $self->{prev_state};
4898 wakaba 1.5 $self->{s_kwd} = '';
4899 wakaba 1.1 ## Reconsume.
4900     redo A;
4901     }
4902     }
4903     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4904     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4905     # 0..9
4906    
4907 wakaba 1.12 $self->{kwd} *= 0x10;
4908     $self->{kwd} += $self->{nc} - 0x0030;
4909 wakaba 1.1 ## Stay in the state.
4910    
4911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4912     $self->{line_prev} = $self->{line};
4913     $self->{column_prev} = $self->{column};
4914     $self->{column}++;
4915     $self->{nc}
4916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4917     } else {
4918     $self->{set_nc}->($self);
4919     }
4920    
4921     redo A;
4922     } elsif (0x0061 <= $self->{nc} and
4923     $self->{nc} <= 0x0066) { # a..f
4924    
4925 wakaba 1.12 $self->{kwd} *= 0x10;
4926     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4927 wakaba 1.1 ## Stay in the state.
4928    
4929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4930     $self->{line_prev} = $self->{line};
4931     $self->{column_prev} = $self->{column};
4932     $self->{column}++;
4933     $self->{nc}
4934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4935     } else {
4936     $self->{set_nc}->($self);
4937     }
4938    
4939     redo A;
4940     } elsif (0x0041 <= $self->{nc} and
4941     $self->{nc} <= 0x0046) { # A..F
4942    
4943 wakaba 1.12 $self->{kwd} *= 0x10;
4944     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4945 wakaba 1.1 ## Stay in the state.
4946    
4947     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4948     $self->{line_prev} = $self->{line};
4949     $self->{column_prev} = $self->{column};
4950     $self->{column}++;
4951     $self->{nc}
4952     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4953     } else {
4954     $self->{set_nc}->($self);
4955     }
4956    
4957     redo A;
4958     } elsif ($self->{nc} == 0x003B) { # ;
4959    
4960    
4961     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4962     $self->{line_prev} = $self->{line};
4963     $self->{column_prev} = $self->{column};
4964     $self->{column}++;
4965     $self->{nc}
4966     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4967     } else {
4968     $self->{set_nc}->($self);
4969     }
4970    
4971     #
4972     } else {
4973    
4974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4975     line => $self->{line},
4976     column => $self->{column});
4977     ## Reconsume.
4978     #
4979     }
4980    
4981 wakaba 1.12 my $code = $self->{kwd};
4982 wakaba 1.1 my $l = $self->{line_prev};
4983     my $c = $self->{column_prev};
4984     if ($charref_map->{$code}) {
4985    
4986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4987     text => (sprintf 'U+%04X', $code),
4988     line => $l, column => $c);
4989     $code = $charref_map->{$code};
4990     } elsif ($code > 0x10FFFF) {
4991    
4992     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4993     text => (sprintf 'U-%08X', $code),
4994     line => $l, column => $c);
4995     $code = 0xFFFD;
4996     }
4997    
4998     if ($self->{prev_state} == DATA_STATE) {
4999    
5000     $self->{state} = $self->{prev_state};
5001 wakaba 1.5 $self->{s_kwd} = '';
5002 wakaba 1.1 ## Reconsume.
5003     return ({type => CHARACTER_TOKEN, data => chr $code,
5004 wakaba 1.7 has_reference => 1,
5005 wakaba 1.1 line => $l, column => $c,
5006     });
5007     redo A;
5008     } else {
5009    
5010     $self->{ca}->{value} .= chr $code;
5011     $self->{ca}->{has_reference} = 1;
5012     $self->{state} = $self->{prev_state};
5013 wakaba 1.5 $self->{s_kwd} = '';
5014 wakaba 1.1 ## Reconsume.
5015     redo A;
5016     }
5017     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5018 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5019     $self->{nc} <= 0x005A) or # Z
5020     (0x0061 <= $self->{nc} and # a
5021     $self->{nc} <= 0x007A) or # z
5022     (0x0030 <= $self->{nc} and # 0
5023     $self->{nc} <= 0x0039) or # 9
5024     $self->{nc} == 0x003B) { # ;
5025 wakaba 1.1 our $EntityChar;
5026 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5027 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5028     $self->{ge}->{$self->{kwd}}) {
5029 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5030 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5031     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5032    
5033     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5034     } else {
5035     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5036    
5037     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5038     value => $self->{kwd});
5039     } else {
5040    
5041     }
5042     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5043     }
5044     } else {
5045     if ($self->{is_xml}) {
5046    
5047     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5048     value => $self->{kwd},
5049     level => {
5050     'amp;' => $self->{level}->{warn},
5051     'quot;' => $self->{level}->{warn},
5052     'lt;' => $self->{level}->{warn},
5053     'gt;' => $self->{level}->{warn},
5054     'apos;' => $self->{level}->{warn},
5055     }->{$self->{kwd}} ||
5056     $self->{level}->{must});
5057     } else {
5058    
5059     }
5060     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5061     }
5062 wakaba 1.1 $self->{entity__match} = 1;
5063    
5064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5065     $self->{line_prev} = $self->{line};
5066     $self->{column_prev} = $self->{column};
5067     $self->{column}++;
5068     $self->{nc}
5069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5070     } else {
5071     $self->{set_nc}->($self);
5072     }
5073    
5074     #
5075     } else {
5076    
5077 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5078 wakaba 1.1 $self->{entity__match} = -1;
5079     ## Stay in the state.
5080    
5081     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5082     $self->{line_prev} = $self->{line};
5083     $self->{column_prev} = $self->{column};
5084     $self->{column}++;
5085     $self->{nc}
5086     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5087     } else {
5088     $self->{set_nc}->($self);
5089     }
5090    
5091     redo A;
5092     }
5093     } else {
5094    
5095     $self->{entity__value} .= chr $self->{nc};
5096     $self->{entity__match} *= 2;
5097     ## Stay in the state.
5098    
5099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5100     $self->{line_prev} = $self->{line};
5101     $self->{column_prev} = $self->{column};
5102     $self->{column}++;
5103     $self->{nc}
5104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5105     } else {
5106     $self->{set_nc}->($self);
5107     }
5108    
5109     redo A;
5110     }
5111     }
5112    
5113     my $data;
5114     my $has_ref;
5115     if ($self->{entity__match} > 0) {
5116    
5117     $data = $self->{entity__value};
5118     $has_ref = 1;
5119     #
5120     } elsif ($self->{entity__match} < 0) {
5121     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5122     if ($self->{prev_state} != DATA_STATE and # in attribute
5123     $self->{entity__match} < -1) {
5124    
5125 wakaba 1.12 $data = '&' . $self->{kwd};
5126 wakaba 1.1 #
5127     } else {
5128    
5129     $data = $self->{entity__value};
5130     $has_ref = 1;
5131     #
5132     }
5133     } else {
5134    
5135     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5136     line => $self->{line_prev},
5137 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5138     $data = '&' . $self->{kwd};
5139 wakaba 1.1 #
5140     }
5141    
5142     ## NOTE: In these cases, when a character reference is found,
5143     ## it is consumed and a character token is returned, or, otherwise,
5144     ## nothing is consumed and returned, according to the spec algorithm.
5145     ## In this implementation, anything that has been examined by the
5146     ## tokenizer is appended to the parent element or the attribute value
5147     ## as string, either literal string when no character reference or
5148     ## entity-replaced string otherwise, in this stage, since any characters
5149     ## that would not be consumed are appended in the data state or in an
5150     ## appropriate attribute value state anyway.
5151    
5152     if ($self->{prev_state} == DATA_STATE) {
5153    
5154     $self->{state} = $self->{prev_state};
5155 wakaba 1.5 $self->{s_kwd} = '';
5156 wakaba 1.1 ## Reconsume.
5157     return ({type => CHARACTER_TOKEN,
5158     data => $data,
5159 wakaba 1.7 has_reference => $has_ref,
5160 wakaba 1.1 line => $self->{line_prev},
5161 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5162 wakaba 1.1 });
5163     redo A;
5164     } else {
5165    
5166     $self->{ca}->{value} .= $data;
5167     $self->{ca}->{has_reference} = 1 if $has_ref;
5168     $self->{state} = $self->{prev_state};
5169 wakaba 1.5 $self->{s_kwd} = '';
5170 wakaba 1.1 ## Reconsume.
5171     redo A;
5172     }
5173 wakaba 1.8
5174     ## XML-only states
5175    
5176     } elsif ($self->{state} == PI_STATE) {
5177 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5178    
5179 wakaba 1.8 if ($is_space->{$self->{nc}} or
5180 wakaba 1.14 $self->{nc} == 0x003F or # ?
5181 wakaba 1.8 $self->{nc} == -1) {
5182 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5183     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5184     ## "DOCTYPE pi state": Parse error, switch to the "data
5185     ## state".
5186 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5187     line => $self->{line_prev},
5188     column => $self->{column_prev}
5189     - 1 * ($self->{nc} != -1));
5190     $self->{state} = BOGUS_COMMENT_STATE;
5191     ## Reconsume.
5192     $self->{ct} = {type => COMMENT_TOKEN,
5193     data => '?',
5194     line => $self->{line_prev},
5195     column => $self->{column_prev}
5196     - 1 * ($self->{nc} != -1),
5197     };
5198     redo A;
5199     } else {
5200 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5201 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5202     target => chr $self->{nc},
5203     data => '',
5204     line => $self->{line_prev},
5205     column => $self->{column_prev} - 1,
5206     };
5207     $self->{state} = PI_TARGET_STATE;
5208    
5209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5210     $self->{line_prev} = $self->{line};
5211     $self->{column_prev} = $self->{column};
5212     $self->{column}++;
5213     $self->{nc}
5214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5215     } else {
5216     $self->{set_nc}->($self);
5217     }
5218    
5219     redo A;
5220     }
5221     } elsif ($self->{state} == PI_TARGET_STATE) {
5222     if ($is_space->{$self->{nc}}) {
5223     $self->{state} = PI_TARGET_AFTER_STATE;
5224    
5225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5226     $self->{line_prev} = $self->{line};
5227     $self->{column_prev} = $self->{column};
5228     $self->{column}++;
5229     $self->{nc}
5230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5231     } else {
5232     $self->{set_nc}->($self);
5233     }
5234    
5235     redo A;
5236     } elsif ($self->{nc} == -1) {
5237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5238 wakaba 1.13 if ($self->{in_subset}) {
5239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5240     } else {
5241     $self->{state} = DATA_STATE;
5242     $self->{s_kwd} = '';
5243     }
5244 wakaba 1.8 ## Reconsume.
5245     return ($self->{ct}); # pi
5246     redo A;
5247     } elsif ($self->{nc} == 0x003F) { # ?
5248     $self->{state} = PI_AFTER_STATE;
5249    
5250     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5251     $self->{line_prev} = $self->{line};
5252     $self->{column_prev} = $self->{column};
5253     $self->{column}++;
5254     $self->{nc}
5255     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5256     } else {
5257     $self->{set_nc}->($self);
5258     }
5259    
5260     redo A;
5261     } else {
5262     ## XML5: typo ("tag name" -> "target")
5263     $self->{ct}->{target} .= chr $self->{nc}; # pi
5264    
5265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5266     $self->{line_prev} = $self->{line};
5267     $self->{column_prev} = $self->{column};
5268     $self->{column}++;
5269     $self->{nc}
5270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5271     } else {
5272     $self->{set_nc}->($self);
5273     }
5274    
5275     redo A;
5276     }
5277     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5278     if ($is_space->{$self->{nc}}) {
5279     ## Stay in the state.
5280    
5281     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5282     $self->{line_prev} = $self->{line};
5283     $self->{column_prev} = $self->{column};
5284     $self->{column}++;
5285     $self->{nc}
5286     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5287     } else {
5288     $self->{set_nc}->($self);
5289     }
5290    
5291     redo A;
5292     } else {
5293     $self->{state} = PI_DATA_STATE;
5294     ## Reprocess.
5295     redo A;
5296     }
5297     } elsif ($self->{state} == PI_DATA_STATE) {
5298     if ($self->{nc} == 0x003F) { # ?
5299     $self->{state} = PI_DATA_AFTER_STATE;
5300    
5301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5302     $self->{line_prev} = $self->{line};
5303     $self->{column_prev} = $self->{column};
5304     $self->{column}++;
5305     $self->{nc}
5306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5307     } else {
5308     $self->{set_nc}->($self);
5309     }
5310    
5311     redo A;
5312     } elsif ($self->{nc} == -1) {
5313     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5314 wakaba 1.13 if ($self->{in_subset}) {
5315 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5316 wakaba 1.13 } else {
5317     $self->{state} = DATA_STATE;
5318     $self->{s_kwd} = '';
5319     }
5320 wakaba 1.8 ## Reprocess.
5321     return ($self->{ct}); # pi
5322     redo A;
5323     } else {
5324     $self->{ct}->{data} .= chr $self->{nc}; # pi
5325     $self->{read_until}->($self->{ct}->{data}, q[?],
5326     length $self->{ct}->{data});
5327     ## Stay in the state.
5328    
5329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5330     $self->{line_prev} = $self->{line};
5331     $self->{column_prev} = $self->{column};
5332     $self->{column}++;
5333     $self->{nc}
5334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5335     } else {
5336     $self->{set_nc}->($self);
5337     }
5338    
5339     ## Reprocess.
5340     redo A;
5341     }
5342     } elsif ($self->{state} == PI_AFTER_STATE) {
5343 wakaba 1.14 ## XML5: Part of "Pi after state".
5344    
5345 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5346 wakaba 1.13 if ($self->{in_subset}) {
5347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5348     } else {
5349     $self->{state} = DATA_STATE;
5350     $self->{s_kwd} = '';
5351     }
5352 wakaba 1.8
5353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5354     $self->{line_prev} = $self->{line};
5355     $self->{column_prev} = $self->{column};
5356     $self->{column}++;
5357     $self->{nc}
5358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5359     } else {
5360     $self->{set_nc}->($self);
5361     }
5362    
5363     return ($self->{ct}); # pi
5364     redo A;
5365     } elsif ($self->{nc} == 0x003F) { # ?
5366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5367     line => $self->{line_prev},
5368     column => $self->{column_prev}); ## XML5: no error
5369     $self->{ct}->{data} .= '?';
5370     $self->{state} = PI_DATA_AFTER_STATE;
5371    
5372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5373     $self->{line_prev} = $self->{line};
5374     $self->{column_prev} = $self->{column};
5375     $self->{column}++;
5376     $self->{nc}
5377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5378     } else {
5379     $self->{set_nc}->($self);
5380     }
5381    
5382     redo A;
5383     } else {
5384     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5385     line => $self->{line_prev},
5386     column => $self->{column_prev}
5387     + 1 * ($self->{nc} == -1)); ## XML5: no error
5388     $self->{ct}->{data} .= '?'; ## XML5: not appended
5389     $self->{state} = PI_DATA_STATE;
5390     ## Reprocess.
5391     redo A;
5392     }
5393     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5394 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5395    
5396 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5397 wakaba 1.13 if ($self->{in_subset}) {
5398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5399     } else {
5400     $self->{state} = DATA_STATE;
5401     $self->{s_kwd} = '';
5402     }
5403 wakaba 1.8
5404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5405     $self->{line_prev} = $self->{line};
5406     $self->{column_prev} = $self->{column};
5407     $self->{column}++;
5408     $self->{nc}
5409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5410     } else {
5411     $self->{set_nc}->($self);
5412     }
5413    
5414     return ($self->{ct}); # pi
5415     redo A;
5416     } elsif ($self->{nc} == 0x003F) { # ?
5417     $self->{ct}->{data} .= '?';
5418     ## Stay in the state.
5419    
5420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5421     $self->{line_prev} = $self->{line};
5422     $self->{column_prev} = $self->{column};
5423     $self->{column}++;
5424     $self->{nc}
5425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5426     } else {
5427     $self->{set_nc}->($self);
5428     }
5429    
5430     redo A;
5431     } else {
5432     $self->{ct}->{data} .= '?'; ## XML5: not appended
5433     $self->{state} = PI_DATA_STATE;
5434     ## Reprocess.
5435     redo A;
5436     }
5437 wakaba 1.12
5438     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5439     if ($self->{nc} == 0x003C) { # <
5440 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5441 wakaba 1.12
5442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5443     $self->{line_prev} = $self->{line};
5444     $self->{column_prev} = $self->{column};
5445     $self->{column}++;
5446     $self->{nc}
5447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5448     } else {
5449     $self->{set_nc}->($self);
5450     }
5451    
5452     redo A;
5453     } elsif ($self->{nc} == 0x0025) { # %
5454     ## XML5: Not defined yet.
5455    
5456     ## TODO:
5457    
5458     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5459     $self->{line_prev} = $self->{line};
5460     $self->{column_prev} = $self->{column};
5461     $self->{column}++;
5462     $self->{nc}
5463     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5464     } else {
5465     $self->{set_nc}->($self);
5466     }
5467    
5468     redo A;
5469     } elsif ($self->{nc} == 0x005D) { # ]
5470 wakaba 1.13 delete $self->{in_subset};
5471 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5472    
5473     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474     $self->{line_prev} = $self->{line};
5475     $self->{column_prev} = $self->{column};
5476     $self->{column}++;
5477     $self->{nc}
5478     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479     } else {
5480     $self->{set_nc}->($self);
5481     }
5482    
5483     redo A;
5484     } elsif ($is_space->{$self->{nc}}) {
5485     ## Stay in the state.
5486    
5487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5488     $self->{line_prev} = $self->{line};
5489     $self->{column_prev} = $self->{column};
5490     $self->{column}++;
5491     $self->{nc}
5492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5493     } else {
5494     $self->{set_nc}->($self);
5495     }
5496    
5497     redo A;
5498     } elsif ($self->{nc} == -1) {
5499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5500 wakaba 1.13 delete $self->{in_subset};
5501 wakaba 1.12 $self->{state} = DATA_STATE;
5502     $self->{s_kwd} = '';
5503     ## Reconsume.
5504 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5505 wakaba 1.12 redo A;
5506     } else {
5507     unless ($self->{internal_subset_tainted}) {
5508     ## XML5: No parse error.
5509     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5510     $self->{internal_subset_tainted} = 1;
5511     }
5512     ## Stay in the state.
5513    
5514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5515     $self->{line_prev} = $self->{line};
5516     $self->{column_prev} = $self->{column};
5517     $self->{column}++;
5518     $self->{nc}
5519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5520     } else {
5521     $self->{set_nc}->($self);
5522     }
5523    
5524     redo A;
5525     }
5526     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5527     if ($self->{nc} == 0x003E) { # >
5528     $self->{state} = DATA_STATE;
5529     $self->{s_kwd} = '';
5530    
5531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5532     $self->{line_prev} = $self->{line};
5533     $self->{column_prev} = $self->{column};
5534     $self->{column}++;
5535     $self->{nc}
5536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5537     } else {
5538     $self->{set_nc}->($self);
5539     }
5540    
5541 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5542 wakaba 1.12 redo A;
5543     } elsif ($self->{nc} == -1) {
5544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5545     $self->{state} = DATA_STATE;
5546     $self->{s_kwd} = '';
5547     ## Reconsume.
5548 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5549 wakaba 1.12 redo A;
5550     } else {
5551     ## XML5: No parse error and stay in the state.
5552     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5553    
5554 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5555    
5556     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5557     $self->{line_prev} = $self->{line};
5558     $self->{column_prev} = $self->{column};
5559     $self->{column}++;
5560     $self->{nc}
5561     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5562     } else {
5563     $self->{set_nc}->($self);
5564     }
5565    
5566     redo A;
5567     }
5568     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5569     if ($self->{nc} == 0x003E) { # >
5570     $self->{state} = DATA_STATE;
5571     $self->{s_kwd} = '';
5572    
5573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5574     $self->{line_prev} = $self->{line};
5575     $self->{column_prev} = $self->{column};
5576     $self->{column}++;
5577     $self->{nc}
5578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5579     } else {
5580     $self->{set_nc}->($self);
5581     }
5582    
5583     return ({type => END_OF_DOCTYPE_TOKEN});
5584     redo A;
5585     } elsif ($self->{nc} == -1) {
5586     $self->{state} = DATA_STATE;
5587     $self->{s_kwd} = '';
5588     ## Reconsume.
5589     return ({type => END_OF_DOCTYPE_TOKEN});
5590     redo A;
5591     } else {
5592     ## Stay in the state.
5593    
5594     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5595     $self->{line_prev} = $self->{line};
5596     $self->{column_prev} = $self->{column};
5597     $self->{column}++;
5598     $self->{nc}
5599     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5600     } else {
5601     $self->{set_nc}->($self);
5602     }
5603    
5604     redo A;
5605     }
5606     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5607     if ($self->{nc} == 0x0021) { # !
5608 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5609 wakaba 1.13
5610     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5611     $self->{line_prev} = $self->{line};
5612     $self->{column_prev} = $self->{column};
5613     $self->{column}++;
5614     $self->{nc}
5615     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5616     } else {
5617     $self->{set_nc}->($self);
5618     }
5619    
5620     redo A;
5621     } elsif ($self->{nc} == 0x003F) { # ?
5622     $self->{state} = PI_STATE;
5623    
5624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625     $self->{line_prev} = $self->{line};
5626     $self->{column_prev} = $self->{column};
5627     $self->{column}++;
5628     $self->{nc}
5629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630     } else {
5631     $self->{set_nc}->($self);
5632     }
5633    
5634     redo A;
5635     } elsif ($self->{nc} == -1) {
5636     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5637     $self->{state} = DATA_STATE;
5638     $self->{s_kwd} = '';
5639     ## Reconsume.
5640     redo A;
5641     } else {
5642     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5643     line => $self->{line_prev},
5644     column => $self->{column_prev});
5645     $self->{state} = BOGUS_COMMENT_STATE;
5646     $self->{ct} = {type => COMMENT_TOKEN,
5647     data => '',
5648     }; ## NOTE: Will be discarded.
5649 wakaba 1.12
5650     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5651     $self->{line_prev} = $self->{line};
5652     $self->{column_prev} = $self->{column};
5653     $self->{column}++;
5654     $self->{nc}
5655     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5656     } else {
5657     $self->{set_nc}->($self);
5658     }
5659    
5660     redo A;
5661     }
5662 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5663     ## XML5: "DOCTYPE markup declaration state".
5664    
5665     if ($self->{nc} == 0x002D) { # -
5666     $self->{state} = MD_HYPHEN_STATE;
5667    
5668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5669     $self->{line_prev} = $self->{line};
5670     $self->{column_prev} = $self->{column};
5671     $self->{column}++;
5672     $self->{nc}
5673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5674     } else {
5675     $self->{set_nc}->($self);
5676     }
5677    
5678     redo A;
5679 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5680     $self->{nc} == 0x0065) { # e
5681 wakaba 1.14 $self->{state} = MD_E_STATE;
5682     $self->{kwd} = chr $self->{nc};
5683    
5684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5685     $self->{line_prev} = $self->{line};
5686     $self->{column_prev} = $self->{column};
5687     $self->{column}++;
5688     $self->{nc}
5689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5690     } else {
5691     $self->{set_nc}->($self);
5692     }
5693    
5694     redo A;
5695 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5696     $self->{nc} == 0x0061) { # a
5697 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5698     $self->{kwd} = chr $self->{nc};
5699    
5700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5701     $self->{line_prev} = $self->{line};
5702     $self->{column_prev} = $self->{column};
5703     $self->{column}++;
5704     $self->{nc}
5705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5706     } else {
5707     $self->{set_nc}->($self);
5708     }
5709    
5710     redo A;
5711 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5712     $self->{nc} == 0x006E) { # n
5713 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5714     $self->{kwd} = chr $self->{nc};
5715    
5716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5717     $self->{line_prev} = $self->{line};
5718     $self->{column_prev} = $self->{column};
5719     $self->{column}++;
5720     $self->{nc}
5721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5722     } else {
5723     $self->{set_nc}->($self);
5724     }
5725    
5726     redo A;
5727     } else {
5728     #
5729     }
5730    
5731     ## XML5: No parse error.
5732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5733     line => $self->{line_prev},
5734     column => $self->{column_prev} - 1);
5735     ## Reconsume.
5736     $self->{state} = BOGUS_COMMENT_STATE;
5737     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5738     redo A;
5739     } elsif ($self->{state} == MD_E_STATE) {
5740 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5741     $self->{nc} == 0x006E) { # n
5742 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5743     $self->{kwd} .= chr $self->{nc};
5744    
5745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746     $self->{line_prev} = $self->{line};
5747     $self->{column_prev} = $self->{column};
5748     $self->{column}++;
5749     $self->{nc}
5750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751     } else {
5752     $self->{set_nc}->($self);
5753     }
5754    
5755     redo A;
5756 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5757     $self->{nc} == 0x006C) { # l
5758 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5759     $self->{state} = MD_ELEMENT_STATE;
5760     $self->{kwd} .= chr $self->{nc};
5761    
5762     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5763     $self->{line_prev} = $self->{line};
5764     $self->{column_prev} = $self->{column};
5765     $self->{column}++;
5766     $self->{nc}
5767     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5768     } else {
5769     $self->{set_nc}->($self);
5770     }
5771    
5772     redo A;
5773     } else {
5774     ## XML5: No parse error.
5775     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5776     line => $self->{line_prev},
5777     column => $self->{column_prev} - 2
5778     + 1 * ($self->{nc} == -1));
5779     ## Reconsume.
5780     $self->{state} = BOGUS_COMMENT_STATE;
5781     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5782     redo A;
5783     }
5784     } elsif ($self->{state} == MD_ENTITY_STATE) {
5785 wakaba 1.17 if ($self->{nc} == [
5786     undef,
5787     undef,
5788     0x0054, # T
5789     0x0049, # I
5790     0x0054, # T
5791     ]->[length $self->{kwd}] or
5792     $self->{nc} == [
5793     undef,
5794     undef,
5795     0x0074, # t
5796     0x0069, # i
5797     0x0074, # t
5798     ]->[length $self->{kwd}]) {
5799 wakaba 1.14 ## Stay in the state.
5800     $self->{kwd} .= chr $self->{nc};
5801    
5802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5803     $self->{line_prev} = $self->{line};
5804     $self->{column_prev} = $self->{column};
5805     $self->{column}++;
5806     $self->{nc}
5807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5808     } else {
5809     $self->{set_nc}->($self);
5810     }
5811    
5812     redo A;
5813 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5814     ($self->{nc} == 0x0059 or # Y
5815     $self->{nc} == 0x0079)) { # y
5816     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5817     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5818     text => 'ENTITY',
5819     line => $self->{line_prev},
5820     column => $self->{column_prev} - 4);
5821     }
5822     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5823 wakaba 1.14 line => $self->{line_prev},
5824     column => $self->{column_prev} - 6};
5825     $self->{state} = DOCTYPE_MD_STATE;
5826    
5827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5828     $self->{line_prev} = $self->{line};
5829     $self->{column_prev} = $self->{column};
5830     $self->{column}++;
5831     $self->{nc}
5832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5833     } else {
5834     $self->{set_nc}->($self);
5835     }
5836    
5837     redo A;
5838     } else {
5839     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5840     line => $self->{line_prev},
5841     column => $self->{column_prev} - 1
5842     - (length $self->{kwd})
5843     + 1 * ($self->{nc} == -1));
5844     $self->{state} = BOGUS_COMMENT_STATE;
5845     ## Reconsume.
5846     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5847     redo A;
5848     }
5849     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5850 wakaba 1.17 if ($self->{nc} == [
5851     undef,
5852     undef,
5853     0x0045, # E
5854     0x004D, # M
5855     0x0045, # E
5856     0x004E, # N
5857     ]->[length $self->{kwd}] or
5858     $self->{nc} == [
5859     undef,
5860     undef,
5861     0x0065, # e
5862     0x006D, # m
5863     0x0065, # e
5864     0x006E, # n
5865     ]->[length $self->{kwd}]) {
5866 wakaba 1.14 ## Stay in the state.
5867     $self->{kwd} .= chr $self->{nc};
5868    
5869     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5870     $self->{line_prev} = $self->{line};
5871     $self->{column_prev} = $self->{column};
5872     $self->{column}++;
5873     $self->{nc}
5874     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5875     } else {
5876     $self->{set_nc}->($self);
5877     }
5878    
5879     redo A;
5880 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5881     ($self->{nc} == 0x0054 or # T
5882     $self->{nc} == 0x0074)) { # t
5883     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5885     text => 'ELEMENT',
5886     line => $self->{line_prev},
5887     column => $self->{column_prev} - 5);
5888     }
5889 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5890     line => $self->{line_prev},
5891     column => $self->{column_prev} - 6};
5892     $self->{state} = DOCTYPE_MD_STATE;
5893    
5894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5895     $self->{line_prev} = $self->{line};
5896     $self->{column_prev} = $self->{column};
5897     $self->{column}++;
5898     $self->{nc}
5899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5900     } else {
5901     $self->{set_nc}->($self);
5902     }
5903    
5904     redo A;
5905     } else {
5906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5907     line => $self->{line_prev},
5908     column => $self->{column_prev} - 1
5909     - (length $self->{kwd})
5910     + 1 * ($self->{nc} == -1));
5911     $self->{state} = BOGUS_COMMENT_STATE;
5912     ## Reconsume.
5913     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5914     redo A;
5915     }
5916     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5917 wakaba 1.17 if ($self->{nc} == [
5918     undef,
5919     0x0054, # T
5920     0x0054, # T
5921     0x004C, # L
5922     0x0049, # I
5923     0x0053, # S
5924     ]->[length $self->{kwd}] or
5925     $self->{nc} == [
5926     undef,
5927     0x0074, # t
5928     0x0074, # t
5929     0x006C, # l
5930     0x0069, # i
5931     0x0073, # s
5932     ]->[length $self->{kwd}]) {
5933 wakaba 1.14 ## Stay in the state.
5934     $self->{kwd} .= chr $self->{nc};
5935    
5936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5937     $self->{line_prev} = $self->{line};
5938     $self->{column_prev} = $self->{column};
5939     $self->{column}++;
5940     $self->{nc}
5941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5942     } else {
5943     $self->{set_nc}->($self);
5944     }
5945    
5946     redo A;
5947 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5948     ($self->{nc} == 0x0054 or # T
5949     $self->{nc} == 0x0074)) { # t
5950     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5952     text => 'ATTLIST',
5953     line => $self->{line_prev},
5954     column => $self->{column_prev} - 5);
5955     }
5956 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5957 wakaba 1.15 attrdefs => [],
5958 wakaba 1.14 line => $self->{line_prev},
5959     column => $self->{column_prev} - 6};
5960     $self->{state} = DOCTYPE_MD_STATE;
5961    
5962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5963     $self->{line_prev} = $self->{line};
5964     $self->{column_prev} = $self->{column};
5965     $self->{column}++;
5966     $self->{nc}
5967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5968     } else {
5969     $self->{set_nc}->($self);
5970     }
5971    
5972     redo A;
5973     } else {
5974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5975     line => $self->{line_prev},
5976     column => $self->{column_prev} - 1
5977     - (length $self->{kwd})
5978     + 1 * ($self->{nc} == -1));
5979     $self->{state} = BOGUS_COMMENT_STATE;
5980     ## Reconsume.
5981     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5982     redo A;
5983     }
5984     } elsif ($self->{state} == MD_NOTATION_STATE) {
5985 wakaba 1.17 if ($self->{nc} == [
5986     undef,
5987     0x004F, # O
5988     0x0054, # T
5989     0x0041, # A
5990     0x0054, # T
5991     0x0049, # I
5992     0x004F, # O
5993     ]->[length $self->{kwd}] or
5994     $self->{nc} == [
5995     undef,
5996     0x006F, # o
5997     0x0074, # t
5998     0x0061, # a
5999     0x0074, # t
6000     0x0069, # i
6001     0x006F, # o
6002     ]->[length $self->{kwd}]) {
6003 wakaba 1.14 ## Stay in the state.
6004     $self->{kwd} .= chr $self->{nc};
6005    
6006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6007     $self->{line_prev} = $self->{line};
6008     $self->{column_prev} = $self->{column};
6009     $self->{column}++;
6010     $self->{nc}
6011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6012     } else {
6013     $self->{set_nc}->($self);
6014     }
6015    
6016     redo A;
6017 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6018     ($self->{nc} == 0x004E or # N
6019     $self->{nc} == 0x006E)) { # n
6020     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6021     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6022     text => 'NOTATION',
6023     line => $self->{line_prev},
6024     column => $self->{column_prev} - 6);
6025     }
6026 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6027     line => $self->{line_prev},
6028     column => $self->{column_prev} - 6};
6029     $self->{state} = DOCTYPE_MD_STATE;
6030    
6031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032     $self->{line_prev} = $self->{line};
6033     $self->{column_prev} = $self->{column};
6034     $self->{column}++;
6035     $self->{nc}
6036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037     } else {
6038     $self->{set_nc}->($self);
6039     }
6040    
6041     redo A;
6042     } else {
6043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6044     line => $self->{line_prev},
6045     column => $self->{column_prev} - 1
6046     - (length $self->{kwd})
6047     + 1 * ($self->{nc} == -1));
6048     $self->{state} = BOGUS_COMMENT_STATE;
6049     ## Reconsume.
6050     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6051     redo A;
6052     }
6053     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6054     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6055     ## "DOCTYPE NOTATION state".
6056    
6057     if ($is_space->{$self->{nc}}) {
6058     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6059     $self->{state} = BEFORE_MD_NAME_STATE;
6060    
6061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6062     $self->{line_prev} = $self->{line};
6063     $self->{column_prev} = $self->{column};
6064     $self->{column}++;
6065     $self->{nc}
6066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6067     } else {
6068     $self->{set_nc}->($self);
6069     }
6070    
6071     redo A;
6072     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6073     $self->{nc} == 0x0025) { # %
6074     ## XML5: Switch to the "DOCTYPE bogus comment state".
6075     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6076     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6077    
6078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6079     $self->{line_prev} = $self->{line};
6080     $self->{column_prev} = $self->{column};
6081     $self->{column}++;
6082     $self->{nc}
6083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6084     } else {
6085     $self->{set_nc}->($self);
6086     }
6087    
6088     redo A;
6089     } elsif ($self->{nc} == -1) {
6090     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6091     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6092     ## Reconsume.
6093     redo A;
6094     } elsif ($self->{nc} == 0x003E) { # >
6095     ## XML5: Switch to the "DOCTYPE bogus comment state".
6096     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6097     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6098    
6099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6100     $self->{line_prev} = $self->{line};
6101     $self->{column_prev} = $self->{column};
6102     $self->{column}++;
6103     $self->{nc}
6104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6105     } else {
6106     $self->{set_nc}->($self);
6107     }
6108    
6109     redo A;
6110     } else {
6111     ## XML5: Switch to the "DOCTYPE bogus comment state".
6112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6113     $self->{state} = BEFORE_MD_NAME_STATE;
6114     redo A;
6115     }
6116     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6117     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6118     ## before state", "DOCTYPE ATTLIST name before state".
6119    
6120     if ($is_space->{$self->{nc}}) {
6121     ## Stay in the state.
6122    
6123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6124     $self->{line_prev} = $self->{line};
6125     $self->{column_prev} = $self->{column};
6126     $self->{column}++;
6127     $self->{nc}
6128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6129     } else {
6130     $self->{set_nc}->($self);
6131     }
6132    
6133     redo A;
6134     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6135     $self->{nc} == 0x0025) { # %
6136     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6137    
6138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139     $self->{line_prev} = $self->{line};
6140     $self->{column_prev} = $self->{column};
6141     $self->{column}++;
6142     $self->{nc}
6143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144     } else {
6145     $self->{set_nc}->($self);
6146     }
6147    
6148     redo A;
6149     } elsif ($self->{nc} == 0x003E) { # >
6150     ## XML5: Same as "Anything else".
6151     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6152     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6153    
6154     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6155     $self->{line_prev} = $self->{line};
6156     $self->{column_prev} = $self->{column};
6157     $self->{column}++;
6158     $self->{nc}
6159     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6160     } else {
6161     $self->{set_nc}->($self);
6162     }
6163    
6164     redo A;
6165     } elsif ($self->{nc} == -1) {
6166     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6167     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6168     ## Reconsume.
6169     redo A;
6170     } else {
6171     ## XML5: [ATTLIST] Not defined yet.
6172     $self->{ct}->{name} .= chr $self->{nc};
6173     $self->{state} = MD_NAME_STATE;
6174    
6175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6176     $self->{line_prev} = $self->{line};
6177     $self->{column_prev} = $self->{column};
6178     $self->{column}++;
6179     $self->{nc}
6180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6181     } else {
6182     $self->{set_nc}->($self);
6183     }
6184    
6185     redo A;
6186     }
6187     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6188     if ($is_space->{$self->{nc}}) {
6189     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6190     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6191     $self->{state} = BEFORE_MD_NAME_STATE;
6192 wakaba 1.8
6193 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6194     $self->{line_prev} = $self->{line};
6195     $self->{column_prev} = $self->{column};
6196     $self->{column}++;
6197     $self->{nc}
6198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6199     } else {
6200     $self->{set_nc}->($self);
6201     }
6202    
6203     redo A;
6204     } elsif ($self->{nc} == 0x003E) { # >
6205     ## XML5: Same as "Anything else".
6206     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6208    
6209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6210     $self->{line_prev} = $self->{line};
6211     $self->{column_prev} = $self->{column};
6212     $self->{column}++;
6213     $self->{nc}
6214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6215     } else {
6216     $self->{set_nc}->($self);
6217     }
6218    
6219     redo A;
6220     } elsif ($self->{nc} == -1) {
6221     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6222     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6223     ## Reconsume.
6224     redo A;
6225     } else {
6226     ## XML5: No parse error.
6227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6228     $self->{state} = BOGUS_COMMENT_STATE;
6229     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6230     ## Reconsume.
6231     redo A;
6232     }
6233     } elsif ($self->{state} == MD_NAME_STATE) {
6234     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6235    
6236     if ($is_space->{$self->{nc}}) {
6237 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6238     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6239     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6240 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6241 wakaba 1.16 } else { # ENTITY/NOTATION
6242     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6243     }
6244 wakaba 1.14
6245     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6246     $self->{line_prev} = $self->{line};
6247     $self->{column_prev} = $self->{column};
6248     $self->{column}++;
6249     $self->{nc}
6250     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6251     } else {
6252     $self->{set_nc}->($self);
6253     }
6254    
6255     redo A;
6256     } elsif ($self->{nc} == 0x003E) { # >
6257     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6258     #
6259     } else {
6260 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6261 wakaba 1.14 }
6262     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263    
6264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265     $self->{line_prev} = $self->{line};
6266     $self->{column_prev} = $self->{column};
6267     $self->{column}++;
6268     $self->{nc}
6269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270     } else {
6271     $self->{set_nc}->($self);
6272     }
6273    
6274     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6275     redo A;
6276     } elsif ($self->{nc} == -1) {
6277     ## XML5: [ATTLIST] No parse error.
6278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6279     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6280     ## Reconsume.
6281     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6282     redo A;
6283     } else {
6284     ## XML5: [ATTLIST] Not defined yet.
6285     $self->{ct}->{name} .= chr $self->{nc};
6286     ## Stay in the state.
6287    
6288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6289     $self->{line_prev} = $self->{line};
6290     $self->{column_prev} = $self->{column};
6291     $self->{column}++;
6292     $self->{nc}
6293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6294     } else {
6295     $self->{set_nc}->($self);
6296     }
6297    
6298     redo A;
6299     }
6300     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6301     if ($is_space->{$self->{nc}}) {
6302     ## Stay in the state.
6303    
6304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6305     $self->{line_prev} = $self->{line};
6306     $self->{column_prev} = $self->{column};
6307     $self->{column}++;
6308     $self->{nc}
6309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6310     } else {
6311     $self->{set_nc}->($self);
6312     }
6313    
6314     redo A;
6315     } elsif ($self->{nc} == 0x003E) { # >
6316     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6317    
6318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6319     $self->{line_prev} = $self->{line};
6320     $self->{column_prev} = $self->{column};
6321     $self->{column}++;
6322     $self->{nc}
6323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6324     } else {
6325     $self->{set_nc}->($self);
6326     }
6327    
6328     return ($self->{ct}); # ATTLIST
6329     redo A;
6330     } elsif ($self->{nc} == -1) {
6331     ## XML5: No parse error.
6332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6333     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6334 wakaba 1.15 return ($self->{ct});
6335 wakaba 1.14 redo A;
6336     } else {
6337     ## XML5: Not defined yet.
6338 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6339     tokens => [],
6340     line => $self->{line}, column => $self->{column}};
6341     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6342    
6343     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6344     $self->{line_prev} = $self->{line};
6345     $self->{column_prev} = $self->{column};
6346     $self->{column}++;
6347     $self->{nc}
6348     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6349     } else {
6350     $self->{set_nc}->($self);
6351     }
6352    
6353     redo A;
6354     }
6355     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6356     if ($is_space->{$self->{nc}}) {
6357     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6358    
6359     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6360     $self->{line_prev} = $self->{line};
6361     $self->{column_prev} = $self->{column};
6362     $self->{column}++;
6363     $self->{nc}
6364     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6365     } else {
6366     $self->{set_nc}->($self);
6367     }
6368    
6369     redo A;
6370     } elsif ($self->{nc} == 0x003E) { # >
6371     ## XML5: Same as "anything else".
6372     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6373     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6374    
6375     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6376     $self->{line_prev} = $self->{line};
6377     $self->{column_prev} = $self->{column};
6378     $self->{column}++;
6379     $self->{nc}
6380     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6381     } else {
6382     $self->{set_nc}->($self);
6383     }
6384    
6385     return ($self->{ct}); # ATTLIST
6386     redo A;
6387     } elsif ($self->{nc} == 0x0028) { # (
6388     ## XML5: Same as "anything else".
6389     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6390     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6391    
6392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6393     $self->{line_prev} = $self->{line};
6394     $self->{column_prev} = $self->{column};
6395     $self->{column}++;
6396     $self->{nc}
6397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6398     } else {
6399     $self->{set_nc}->($self);
6400     }
6401    
6402     redo A;
6403     } elsif ($self->{nc} == -1) {
6404     ## XML5: No parse error.
6405     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6406     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6407    
6408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6409     $self->{line_prev} = $self->{line};
6410     $self->{column_prev} = $self->{column};
6411     $self->{column}++;
6412     $self->{nc}
6413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6414     } else {
6415     $self->{set_nc}->($self);
6416     }
6417    
6418     return ($self->{ct}); # ATTLIST
6419     redo A;
6420     } else {
6421     ## XML5: Not defined yet.
6422     $self->{ca}->{name} .= chr $self->{nc};
6423     ## Stay in the state.
6424    
6425     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6426     $self->{line_prev} = $self->{line};
6427     $self->{column_prev} = $self->{column};
6428     $self->{column}++;
6429     $self->{nc}
6430     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6431     } else {
6432     $self->{set_nc}->($self);
6433     }
6434    
6435 wakaba 1.14 redo A;
6436     }
6437 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6438     if ($is_space->{$self->{nc}}) {
6439     ## Stay in the state.
6440    
6441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6442     $self->{line_prev} = $self->{line};
6443     $self->{column_prev} = $self->{column};
6444     $self->{column}++;
6445     $self->{nc}
6446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6447     } else {
6448     $self->{set_nc}->($self);
6449     }
6450    
6451     redo A;
6452     } elsif ($self->{nc} == 0x003E) { # >
6453     ## XML5: Same as "anything else".
6454     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6455     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6456    
6457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6458     $self->{line_prev} = $self->{line};
6459     $self->{column_prev} = $self->{column};
6460     $self->{column}++;
6461     $self->{nc}
6462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6463     } else {
6464     $self->{set_nc}->($self);
6465     }
6466    
6467     return ($self->{ct}); # ATTLIST
6468     redo A;
6469     } elsif ($self->{nc} == 0x0028) { # (
6470     ## XML5: Same as "anything else".
6471     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6472    
6473     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6474     $self->{line_prev} = $self->{line};
6475     $self->{column_prev} = $self->{column};
6476     $self->{column}++;
6477     $self->{nc}
6478     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6479     } else {
6480     $self->{set_nc}->($self);
6481     }
6482    
6483     redo A;
6484     } elsif ($self->{nc} == -1) {
6485     ## XML5: No parse error.
6486     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6487     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6488    
6489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6490     $self->{line_prev} = $self->{line};
6491     $self->{column_prev} = $self->{column};
6492     $self->{column}++;
6493     $self->{nc}
6494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6495     } else {
6496     $self->{set_nc}->($self);
6497     }
6498    
6499     return ($self->{ct});
6500     redo A;
6501     } else {
6502     ## XML5: Not defined yet.
6503     $self->{ca}->{type} = chr $self->{nc};
6504     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6505    
6506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6507     $self->{line_prev} = $self->{line};
6508     $self->{column_prev} = $self->{column};
6509     $self->{column}++;
6510     $self->{nc}
6511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6512     } else {
6513     $self->{set_nc}->($self);
6514     }
6515    
6516     redo A;
6517     }
6518     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6519     if ($is_space->{$self->{nc}}) {
6520     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6521    
6522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523     $self->{line_prev} = $self->{line};
6524     $self->{column_prev} = $self->{column};
6525     $self->{column}++;
6526     $self->{nc}
6527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528     } else {
6529     $self->{set_nc}->($self);
6530     }
6531    
6532     redo A;
6533     } elsif ($self->{nc} == 0x0023) { # #
6534     ## XML5: Same as "anything else".
6535     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6536     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6537    
6538     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539     $self->{line_prev} = $self->{line};
6540     $self->{column_prev} = $self->{column};
6541     $self->{column}++;
6542     $self->{nc}
6543     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544     } else {
6545     $self->{set_nc}->($self);
6546     }
6547    
6548     redo A;
6549     } elsif ($self->{nc} == 0x0022) { # "
6550     ## XML5: Same as "anything else".
6551     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6552     $self->{ca}->{value} = '';
6553     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6554    
6555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6556     $self->{line_prev} = $self->{line};
6557     $self->{column_prev} = $self->{column};
6558     $self->{column}++;
6559     $self->{nc}
6560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6561     } else {
6562     $self->{set_nc}->($self);
6563     }
6564    
6565     redo A;
6566     } elsif ($self->{nc} == 0x0027) { # '
6567     ## XML5: Same as "anything else".
6568     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6569     $self->{ca}->{value} = '';
6570     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6571    
6572     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6573     $self->{line_prev} = $self->{line};
6574     $self->{column_prev} = $self->{column};
6575     $self->{column}++;
6576     $self->{nc}
6577     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6578     } else {
6579     $self->{set_nc}->($self);
6580     }
6581    
6582     redo A;
6583     } elsif ($self->{nc} == 0x003E) { # >
6584     ## XML5: Same as "anything else".
6585     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6586     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6587    
6588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6589     $self->{line_prev} = $self->{line};
6590     $self->{column_prev} = $self->{column};
6591     $self->{column}++;
6592     $self->{nc}
6593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6594     } else {
6595     $self->{set_nc}->($self);
6596     }
6597    
6598     return ($self->{ct}); # ATTLIST
6599     redo A;
6600     } elsif ($self->{nc} == 0x0028) { # (
6601     ## XML5: Same as "anything else".
6602     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6603     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6604    
6605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6606     $self->{line_prev} = $self->{line};
6607     $self->{column_prev} = $self->{column};
6608     $self->{column}++;
6609     $self->{nc}
6610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6611     } else {
6612     $self->{set_nc}->($self);
6613     }
6614    
6615     redo A;
6616     } elsif ($self->{nc} == -1) {
6617     ## XML5: No parse error.
6618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6619     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6620    
6621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6622     $self->{line_prev} = $self->{line};
6623     $self->{column_prev} = $self->{column};
6624     $self->{column}++;
6625     $self->{nc}
6626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6627     } else {
6628     $self->{set_nc}->($self);
6629     }
6630    
6631     return ($self->{ct});
6632     redo A;
6633     } else {
6634     ## XML5: Not defined yet.
6635     $self->{ca}->{type} .= chr $self->{nc};
6636     ## Stay in the state.
6637    
6638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6639     $self->{line_prev} = $self->{line};
6640     $self->{column_prev} = $self->{column};
6641     $self->{column}++;
6642     $self->{nc}
6643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6644     } else {
6645     $self->{set_nc}->($self);
6646     }
6647    
6648     redo A;
6649     }
6650     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6651     if ($is_space->{$self->{nc}}) {
6652     ## Stay in the state.
6653    
6654     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6655     $self->{line_prev} = $self->{line};
6656     $self->{column_prev} = $self->{column};
6657     $self->{column}++;
6658     $self->{nc}
6659     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6660     } else {
6661     $self->{set_nc}->($self);
6662     }
6663    
6664     redo A;
6665     } elsif ($self->{nc} == 0x0028) { # (
6666     ## XML5: Same as "anything else".
6667     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6668    
6669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6670     $self->{line_prev} = $self->{line};
6671     $self->{column_prev} = $self->{column};
6672     $self->{column}++;
6673     $self->{nc}
6674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6675     } else {
6676     $self->{set_nc}->($self);
6677     }
6678    
6679     redo A;
6680     } elsif ($self->{nc} == 0x0023) { # #
6681     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6682    
6683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6684     $self->{line_prev} = $self->{line};
6685     $self->{column_prev} = $self->{column};
6686     $self->{column}++;
6687     $self->{nc}
6688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6689     } else {
6690     $self->{set_nc}->($self);
6691     }
6692    
6693     redo A;
6694     } elsif ($self->{nc} == 0x0022) { # "
6695     ## XML5: Same as "anything else".
6696     $self->{ca}->{value} = '';
6697     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6698    
6699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6700     $self->{line_prev} = $self->{line};
6701     $self->{column_prev} = $self->{column};
6702     $self->{column}++;
6703     $self->{nc}
6704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6705     } else {
6706     $self->{set_nc}->($self);
6707     }
6708    
6709     redo A;
6710     } elsif ($self->{nc} == 0x0027) { # '
6711     ## XML5: Same as "anything else".
6712     $self->{ca}->{value} = '';
6713     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6714    
6715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716     $self->{line_prev} = $self->{line};
6717     $self->{column_prev} = $self->{column};
6718     $self->{column}++;
6719     $self->{nc}
6720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721     } else {
6722     $self->{set_nc}->($self);
6723     }
6724    
6725     redo A;
6726     } elsif ($self->{nc} == 0x003E) { # >
6727     ## XML5: Same as "anything else".
6728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6730    
6731     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732     $self->{line_prev} = $self->{line};
6733     $self->{column_prev} = $self->{column};
6734     $self->{column}++;
6735     $self->{nc}
6736     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737     } else {
6738     $self->{set_nc}->($self);
6739     }
6740    
6741     return ($self->{ct}); # ATTLIST
6742     redo A;
6743     } elsif ($self->{nc} == -1) {
6744     ## XML5: No parse error.
6745     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6746     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6747    
6748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6749     $self->{line_prev} = $self->{line};
6750     $self->{column_prev} = $self->{column};
6751     $self->{column}++;
6752     $self->{nc}
6753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6754     } else {
6755     $self->{set_nc}->($self);
6756     }
6757    
6758     return ($self->{ct});
6759     redo A;
6760     } else {
6761     ## XML5: Switch to the "DOCTYPE bogus comment state".
6762     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6763     $self->{ca}->{value} = '';
6764     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6765     ## Reconsume.
6766     redo A;
6767     }
6768     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6769     if ($is_space->{$self->{nc}}) {
6770     ## Stay in the state.
6771    
6772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6773     $self->{line_prev} = $self->{line};
6774     $self->{column_prev} = $self->{column};
6775     $self->{column}++;
6776     $self->{nc}
6777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6778     } else {
6779     $self->{set_nc}->($self);
6780     }
6781    
6782     redo A;
6783     } elsif ($self->{nc} == 0x007C) { # |
6784     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6785     ## Stay in the state.
6786    
6787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6788     $self->{line_prev} = $self->{line};
6789     $self->{column_prev} = $self->{column};
6790     $self->{column}++;
6791     $self->{nc}
6792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6793     } else {
6794     $self->{set_nc}->($self);
6795     }
6796    
6797     redo A;
6798     } elsif ($self->{nc} == 0x0029) { # )
6799     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6800     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6801    
6802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6803     $self->{line_prev} = $self->{line};
6804     $self->{column_prev} = $self->{column};
6805     $self->{column}++;
6806     $self->{nc}
6807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6808     } else {
6809     $self->{set_nc}->($self);
6810     }
6811    
6812     redo A;
6813     } elsif ($self->{nc} == 0x003E) { # >
6814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6815     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6816    
6817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6818     $self->{line_prev} = $self->{line};
6819     $self->{column_prev} = $self->{column};
6820     $self->{column}++;
6821     $self->{nc}
6822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6823     } else {
6824     $self->{set_nc}->($self);
6825     }
6826    
6827     return ($self->{ct}); # ATTLIST
6828     redo A;
6829     } elsif ($self->{nc} == -1) {
6830     ## XML5: No parse error.
6831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6833    
6834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6835     $self->{line_prev} = $self->{line};
6836     $self->{column_prev} = $self->{column};
6837     $self->{column}++;
6838     $self->{nc}
6839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6840     } else {
6841     $self->{set_nc}->($self);
6842     }
6843    
6844     return ($self->{ct});
6845     redo A;
6846     } else {
6847     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6848     $self->{state} = ALLOWED_TOKEN_STATE;
6849    
6850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6851     $self->{line_prev} = $self->{line};
6852     $self->{column_prev} = $self->{column};
6853     $self->{column}++;
6854     $self->{nc}
6855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6856     } else {
6857     $self->{set_nc}->($self);
6858     }
6859    
6860     redo A;
6861     }
6862     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6863     if ($is_space->{$self->{nc}}) {
6864     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6865    
6866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6867     $self->{line_prev} = $self->{line};
6868     $self->{column_prev} = $self->{column};
6869     $self->{column}++;
6870     $self->{nc}
6871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6872     } else {
6873     $self->{set_nc}->($self);
6874     }
6875    
6876     redo A;
6877     } elsif ($self->{nc} == 0x007C) { # |
6878     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6879    
6880     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6881     $self->{line_prev} = $self->{line};
6882     $self->{column_prev} = $self->{column};
6883     $self->{column}++;
6884     $self->{nc}
6885     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6886     } else {
6887     $self->{set_nc}->($self);
6888     }
6889    
6890     redo A;
6891     } elsif ($self->{nc} == 0x0029) { # )
6892     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6893    
6894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6895     $self->{line_prev} = $self->{line};
6896     $self->{column_prev} = $self->{column};
6897     $self->{column}++;
6898     $self->{nc}
6899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6900     } else {
6901     $self->{set_nc}->($self);
6902     }
6903    
6904     redo A;
6905     } elsif ($self->{nc} == 0x003E) { # >
6906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6907     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6908    
6909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6910     $self->{line_prev} = $self->{line};
6911     $self->{column_prev} = $self->{column};
6912     $self->{column}++;
6913     $self->{nc}
6914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6915     } else {
6916     $self->{set_nc}->($self);
6917     }
6918    
6919     return ($self->{ct}); # ATTLIST
6920     redo A;
6921     } elsif ($self->{nc} == -1) {
6922     ## XML5: No parse error.
6923     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6924     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6925    
6926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6927     $self->{line_prev} = $self->{line};
6928     $self->{column_prev} = $self->{column};
6929     $self->{column}++;
6930     $self->{nc}
6931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6932     } else {
6933     $self->{set_nc}->($self);
6934     }
6935    
6936     return ($self->{ct});
6937     redo A;
6938     } else {
6939     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6940     ## Stay in the state.
6941    
6942     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6943     $self->{line_prev} = $self->{line};
6944     $self->{column_prev} = $self->{column};
6945     $self->{column}++;
6946     $self->{nc}
6947     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6948     } else {
6949     $self->{set_nc}->($self);
6950     }
6951    
6952     redo A;
6953     }
6954     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6955     if ($is_space->{$self->{nc}}) {
6956     ## Stay in the state.
6957    
6958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6959     $self->{line_prev} = $self->{line};
6960     $self->{column_prev} = $self->{column};
6961     $self->{column}++;
6962     $self->{nc}
6963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6964     } else {
6965     $self->{set_nc}->($self);
6966     }
6967    
6968     redo A;
6969     } elsif ($self->{nc} == 0x007C) { # |
6970     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6971    
6972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6973     $self->{line_prev} = $self->{line};
6974     $self->{column_prev} = $self->{column};
6975     $self->{column}++;
6976     $self->{nc}
6977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6978     } else {
6979     $self->{set_nc}->($self);
6980     }
6981    
6982     redo A;
6983     } elsif ($self->{nc} == 0x0029) { # )
6984     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6985    
6986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6987     $self->{line_prev} = $self->{line};
6988     $self->{column_prev} = $self->{column};
6989     $self->{column}++;
6990     $self->{nc}
6991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6992     } else {
6993     $self->{set_nc}->($self);
6994     }
6995    
6996     redo A;
6997     } elsif ($self->{nc} == 0x003E) { # >
6998     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6999     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7000    
7001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002     $self->{line_prev} = $self->{line};
7003     $self->{column_prev} = $self->{column};
7004     $self->{column}++;
7005     $self->{nc}
7006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007     } else {
7008     $self->{set_nc}->($self);
7009     }
7010    
7011     return ($self->{ct}); # ATTLIST
7012     redo A;
7013     } elsif ($self->{nc} == -1) {
7014     ## XML5: No parse error.
7015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7016     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7017    
7018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7019     $self->{line_prev} = $self->{line};
7020     $self->{column_prev} = $self->{column};
7021     $self->{column}++;
7022     $self->{nc}
7023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7024     } else {
7025     $self->{set_nc}->($self);
7026     }
7027    
7028     return ($self->{ct});
7029     redo A;
7030     } else {
7031     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7032     line => $self->{line_prev},
7033     column => $self->{column_prev});
7034     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7035     $self->{state} = ALLOWED_TOKEN_STATE;
7036    
7037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7038     $self->{line_prev} = $self->{line};
7039     $self->{column_prev} = $self->{column};
7040     $self->{column}++;
7041     $self->{nc}
7042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7043     } else {
7044     $self->{set_nc}->($self);
7045     }
7046    
7047     redo A;
7048     }
7049     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7050     if ($is_space->{$self->{nc}}) {
7051     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7052    
7053     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7054     $self->{line_prev} = $self->{line};
7055     $self->{column_prev} = $self->{column};
7056     $self->{column}++;
7057     $self->{nc}
7058     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7059     } else {
7060     $self->{set_nc}->($self);
7061     }
7062    
7063     redo A;
7064     } elsif ($self->{nc} == 0x0023) { # #
7065     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7066     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7067    
7068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069     $self->{line_prev} = $self->{line};
7070     $self->{column_prev} = $self->{column};
7071     $self->{column}++;
7072     $self->{nc}
7073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074     } else {
7075     $self->{set_nc}->($self);
7076     }
7077    
7078     redo A;
7079     } elsif ($self->{nc} == 0x0022) { # "
7080     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7081     $self->{ca}->{value} = '';
7082     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7083    
7084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085     $self->{line_prev} = $self->{line};
7086     $self->{column_prev} = $self->{column};
7087     $self->{column}++;
7088     $self->{nc}
7089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090     } else {
7091     $self->{set_nc}->($self);
7092     }
7093    
7094     redo A;
7095     } elsif ($self->{nc} == 0x0027) { # '
7096     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7097     $self->{ca}->{value} = '';
7098     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7099    
7100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101     $self->{line_prev} = $self->{line};
7102     $self->{column_prev} = $self->{column};
7103     $self->{column}++;
7104     $self->{nc}
7105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106     } else {
7107     $self->{set_nc}->($self);
7108     }
7109    
7110     redo A;
7111     } elsif ($self->{nc} == 0x003E) { # >
7112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7113     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7114    
7115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7116     $self->{line_prev} = $self->{line};
7117     $self->{column_prev} = $self->{column};
7118     $self->{column}++;
7119     $self->{nc}
7120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7121     } else {
7122     $self->{set_nc}->($self);
7123     }
7124    
7125     return ($self->{ct}); # ATTLIST
7126     redo A;
7127     } elsif ($self->{nc} == -1) {
7128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7129     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7130    
7131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7132     $self->{line_prev} = $self->{line};
7133     $self->{column_prev} = $self->{column};
7134     $self->{column}++;
7135     $self->{nc}
7136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7137     } else {
7138     $self->{set_nc}->($self);
7139     }
7140    
7141     return ($self->{ct});
7142     redo A;
7143     } else {
7144     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7145     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7146     ## Reconsume.
7147     redo A;
7148     }
7149     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
         ## BEFORE_ATTR_DEFAULT_STATE: dispatch on the current input
         ## character ($self->{nc}) ahead of an attribute default.
         ##   space      -> consume, stay in this state
         ##   "#"        -> consume, go read the default keyword
         ##   '"' / "'"  -> start an empty $self->{ca}->{value} and read a
         ##                 quoted attribute value
         ##   ">"        -> parse error, consume, return $self->{ct}
         ##   -1         -> parse error ('unclosed md'), return $self->{ct}
         ##   other      -> parse error, reconsume as an unquoted value
         ## The repeated "char_buffer_pos < length char_buffer" stanza
         ## consumes one character: it reads the next code point from
         ## char_buffer when one is buffered, otherwise it delegates to
         ## the set_nc callback.  Line/column of the previous character
         ## are saved only on the buffered path shown here.
7150     if ($is_space->{$self->{nc}}) {
7151     ## Stay in the state.
7152    
7153     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7154     $self->{line_prev} = $self->{line};
7155     $self->{column_prev} = $self->{column};
7156     $self->{column}++;
7157     $self->{nc}
7158     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7159     } else {
7160     $self->{set_nc}->($self);
7161     }
7162    
7163     redo A;
7164     } elsif ($self->{nc} == 0x0023) { # #
7165     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7166    
7167     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7168     $self->{line_prev} = $self->{line};
7169     $self->{column_prev} = $self->{column};
7170     $self->{column}++;
7171     $self->{nc}
7172     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7173     } else {
7174     $self->{set_nc}->($self);
7175     }
7176    
7177     redo A;
7178     } elsif ($self->{nc} == 0x0022) { # "
7179     $self->{ca}->{value} = '';
7180     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7181    
7182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7183     $self->{line_prev} = $self->{line};
7184     $self->{column_prev} = $self->{column};
7185     $self->{column}++;
7186     $self->{nc}
7187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7188     } else {
7189     $self->{set_nc}->($self);
7190     }
7191    
7192     redo A;
7193     } elsif ($self->{nc} == 0x0027) { # '
7194     $self->{ca}->{value} = '';
7195     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7196    
7197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7198     $self->{line_prev} = $self->{line};
7199     $self->{column_prev} = $self->{column};
7200     $self->{column}++;
7201     $self->{nc}
7202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7203     } else {
7204     $self->{set_nc}->($self);
7205     }
7206    
7207     redo A;
7208     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration closed before any default was given.  Note that
         ## $self->{ca} is not pushed onto the token in this arm.
7209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7210     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7211    
7212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213     $self->{line_prev} = $self->{line};
7214     $self->{column_prev} = $self->{column};
7215     $self->{column}++;
7216     $self->{nc}
7217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218     } else {
7219     $self->{set_nc}->($self);
7220     }
7221    
7222     return ($self->{ct}); # ATTLIST
         ## Never reached: the return above leaves the sub first.
7223     redo A;
7224     } elsif ($self->{nc} == -1) {
7225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7226     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7227    
7228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229     $self->{line_prev} = $self->{line};
7230     $self->{column_prev} = $self->{column};
7231     $self->{column}++;
7232     $self->{nc}
7233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234     } else {
7235     $self->{set_nc}->($self);
7236     }
7237    
7238     return ($self->{ct});
         ## Never reached: the return above leaves the sub first.
7239     redo A;
7240     } else {
7241     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7242     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7243     ## Reconsume.
7244     redo A;
7245     }
7246     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
         ## Dispatch on the current character before the first character of
         ## the default keyword is stored in $self->{ca}->{default}.
         ##   space      -> parse error, reconsume in BOGUS_MD_STATE
         ##   '"' / "'"  -> start an empty value, read a quoted value
         ##   ">"        -> parse error, consume, return $self->{ct}
         ##   -1         -> parse error ('unclosed md'), return $self->{ct}
         ##   other      -> becomes the first character of
         ##                 $self->{ca}->{default}; consume and continue in
         ##                 DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7247     if ($is_space->{$self->{nc}}) {
7248     ## XML5: No parse error.
7249     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7250 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7251 wakaba 1.15 ## Reconsume.
7252     redo A;
7253     } elsif ($self->{nc} == 0x0022) { # "
7254     ## XML5: Same as "anything else".
7255     $self->{ca}->{value} = '';
7256     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7257    
7258     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7259     $self->{line_prev} = $self->{line};
7260     $self->{column_prev} = $self->{column};
7261     $self->{column}++;
7262     $self->{nc}
7263     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7264     } else {
7265     $self->{set_nc}->($self);
7266     }
7267    
7268     redo A;
7269     } elsif ($self->{nc} == 0x0027) { # '
7270     ## XML5: Same as "anything else".
7271     $self->{ca}->{value} = '';
7272     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7273    
7274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7275     $self->{line_prev} = $self->{line};
7276     $self->{column_prev} = $self->{column};
7277     $self->{column}++;
7278     $self->{nc}
7279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7280     } else {
7281     $self->{set_nc}->($self);
7282     }
7283    
7284     redo A;
7285     } elsif ($self->{nc} == 0x003E) { # >
7286     ## XML5: Same as "anything else".
7287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7288     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7289    
7290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7291     $self->{line_prev} = $self->{line};
7292     $self->{column_prev} = $self->{column};
7293     $self->{column}++;
7294     $self->{nc}
7295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7296     } else {
7297     $self->{set_nc}->($self);
7298     }
7299    
7300     return ($self->{ct}); # ATTLIST
         ## Never reached: the return above leaves the sub first.
7301     redo A;
7302     } elsif ($self->{nc} == -1) {
7303     ## XML5: No parse error.
7304     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7305     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7306    
7307     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308     $self->{line_prev} = $self->{line};
7309     $self->{column_prev} = $self->{column};
7310     $self->{column}++;
7311     $self->{nc}
7312     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313     } else {
7314     $self->{set_nc}->($self);
7315     }
7316    
7317     return ($self->{ct});
         ## Never reached: the return above leaves the sub first.
7318     redo A;
7319     } else {
         ## Plain assignment (not .=): this starts a fresh default keyword.
7320     $self->{ca}->{default} = chr $self->{nc};
7321     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7322    
7323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7324     $self->{line_prev} = $self->{line};
7325     $self->{column_prev} = $self->{column};
7326     $self->{column}++;
7327     $self->{nc}
7328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7329     } else {
7330     $self->{set_nc}->($self);
7331     }
7332    
7333     redo A;
7334     }
7335     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
         ## Accumulates the default keyword in $self->{ca}->{default}.
         ##   space      -> keyword finished, consume, go to the AFTER state
         ##   '"' / "'"  -> parse error (no space before the value), then
         ##                 start an empty value and read a quoted value
         ##   ">"        -> push $self->{ca} onto $self->{ct}->{attrdefs},
         ##                 consume, return $self->{ct}
         ##   -1         -> parse error ('unclosed md'), push
         ##                 $self->{ca}, return $self->{ct}
         ##   other      -> append to the keyword, consume, stay
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7336     if ($is_space->{$self->{nc}}) {
7337     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7338    
7339     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7340     $self->{line_prev} = $self->{line};
7341     $self->{column_prev} = $self->{column};
7342     $self->{column}++;
7343     $self->{nc}
7344     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7345     } else {
7346     $self->{set_nc}->($self);
7347     }
7348    
7349     redo A;
7350     } elsif ($self->{nc} == 0x0022) { # "
7351     ## XML5: Same as "anything else".
7352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7353     $self->{ca}->{value} = '';
7354     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7355    
7356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7357     $self->{line_prev} = $self->{line};
7358     $self->{column_prev} = $self->{column};
7359     $self->{column}++;
7360     $self->{nc}
7361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7362     } else {
7363     $self->{set_nc}->($self);
7364     }
7365    
7366     redo A;
7367     } elsif ($self->{nc} == 0x0027) { # '
7368     ## XML5: Same as "anything else".
7369     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7370     $self->{ca}->{value} = '';
7371     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7372    
7373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7374     $self->{line_prev} = $self->{line};
7375     $self->{column_prev} = $self->{column};
7376     $self->{column}++;
7377     $self->{nc}
7378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7379     } else {
7380     $self->{set_nc}->($self);
7381     }
7382    
7383     redo A;
7384     } elsif ($self->{nc} == 0x003E) { # >
7385     ## XML5: Same as "anything else".
7386     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7387     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7388    
7389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7390     $self->{line_prev} = $self->{line};
7391     $self->{column_prev} = $self->{column};
7392     $self->{column}++;
7393     $self->{nc}
7394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7395     } else {
7396     $self->{set_nc}->($self);
7397     }
7398    
7399     return ($self->{ct}); # ATTLIST
         ## Never reached: the return above leaves the sub first.
7400     redo A;
7401     } elsif ($self->{nc} == -1) {
7402     ## XML5: No parse error.
7403     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7404     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7406    
7407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7408     $self->{line_prev} = $self->{line};
7409     $self->{column_prev} = $self->{column};
7410     $self->{column}++;
7411     $self->{nc}
7412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7413     } else {
7414     $self->{set_nc}->($self);
7415     }
7416    
7417     return ($self->{ct});
         ## Never reached: the return above leaves the sub first.
7418     redo A;
7419     } else {
7420     $self->{ca}->{default} .= chr $self->{nc};
7421     ## Stay in the state.
7422    
7423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7424     $self->{line_prev} = $self->{line};
7425     $self->{column_prev} = $self->{column};
7426     $self->{column}++;
7427     $self->{nc}
7428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7429     } else {
7430     $self->{set_nc}->($self);
7431     }
7432    
7433     redo A;
7434     }
7435     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
         ## Runs after the default keyword and at least one space.
         ##   space      -> consume, stay
         ##   '"' / "'"  -> start an empty value, read a quoted value
         ##   ">"        -> push $self->{ca} onto $self->{ct}->{attrdefs},
         ##                 consume, return $self->{ct}
         ##   -1         -> parse error ('unclosed md'), push
         ##                 $self->{ca}, return $self->{ct}
         ##   other      -> reconsume: as an unquoted value when the keyword
         ##                 is exactly 'FIXED', otherwise finish this
         ##                 definition and hand over to
         ##                 DOCTYPE_ATTLIST_NAME_AFTER_STATE
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7436     if ($is_space->{$self->{nc}}) {
7437     ## Stay in the state.
7438    
7439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7440     $self->{line_prev} = $self->{line};
7441     $self->{column_prev} = $self->{column};
7442     $self->{column}++;
7443     $self->{nc}
7444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7445     } else {
7446     $self->{set_nc}->($self);
7447     }
7448    
7449     redo A;
7450     } elsif ($self->{nc} == 0x0022) { # "
7451     $self->{ca}->{value} = '';
7452     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7453    
7454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7455     $self->{line_prev} = $self->{line};
7456     $self->{column_prev} = $self->{column};
7457     $self->{column}++;
7458     $self->{nc}
7459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7460     } else {
7461     $self->{set_nc}->($self);
7462     }
7463    
7464     redo A;
7465     } elsif ($self->{nc} == 0x0027) { # '
7466     $self->{ca}->{value} = '';
7467     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7468    
7469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7470     $self->{line_prev} = $self->{line};
7471     $self->{column_prev} = $self->{column};
7472     $self->{column}++;
7473     $self->{nc}
7474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7475     } else {
7476     $self->{set_nc}->($self);
7477     }
7478    
7479     redo A;
7480     } elsif ($self->{nc} == 0x003E) { # >
7481     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7482     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7483    
7484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485     $self->{line_prev} = $self->{line};
7486     $self->{column_prev} = $self->{column};
7487     $self->{column}++;
7488     $self->{nc}
7489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490     } else {
7491     $self->{set_nc}->($self);
7492     }
7493    
7494     return ($self->{ct}); # ATTLIST
         ## Never reached: the return above leaves the sub first.
7495     redo A;
7496     } elsif ($self->{nc} == -1) {
7497     ## XML5: No parse error.
7498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7499     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7500     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7501    
7502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7503     $self->{line_prev} = $self->{line};
7504     $self->{column_prev} = $self->{column};
7505     $self->{column}++;
7506     $self->{nc}
7507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7508     } else {
7509     $self->{set_nc}->($self);
7510     }
7511    
7512     return ($self->{ct});
         ## Never reached: the return above leaves the sub first.
7513     redo A;
7514     } else {
7515     ## XML5: Not defined yet.
         ## The comparison is case-sensitive: only the exact string 'FIXED'
         ## keeps the current definition open for a following value.
7516     if ($self->{ca}->{default} eq 'FIXED') {
7517     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7518     } else {
7519     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7520     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7521     }
7522     ## Reconsume.
7523     redo A;
7524     }
7525     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
         ## Both arms below reconsume the current character in
         ## DOCTYPE_ATTLIST_NAME_AFTER_STATE.  They differ only in that a
         ## character other than space, -1 or ">" first raises the
         ## 'no space before attr name' parse error.
7526     if ($is_space->{$self->{nc}} or
7527     $self->{nc} == -1 or
7528     $self->{nc} == 0x003E) { # >
7529     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7530     ## Reconsume.
7531     redo A;
7532     } else {
7533     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7534     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7535     ## Reconsume.
7536     redo A;
7537 wakaba 1.16 }
7538 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
         ## NDATA_STATE: match the rest of the keyword "NDATA", ASCII
         ## case-insensitively.  $self->{kwd} holds the characters matched
         ## so far (the comparison against 'NDAT' below shows it includes
         ## the leading "N"), so its length selects the next expected
         ## letter from the lookup lists.
         ##   expected D/A/T (either case) -> append to kwd, consume, stay
         ##   final A/a with 4 chars in kwd -> keyword complete; raise
         ##       'lowercase keyword' unless it was spelled exactly
         ##       "NDATA"; consume and go to AFTER_NDATA_STATE
         ##   anything else -> parse error, reconsume in BOGUS_MD_STATE
         ## NOTE(review): for kwd lengths outside 1..3 the list lookup
         ## yields undef, which == compares as 0; this relies on
         ## $self->{nc} never being 0 here -- confirm against set_nc.
7539     ## ASCII case-insensitive
7540     if ($self->{nc} == [
7541     undef,
7542     0x0044, # D
7543     0x0041, # A
7544     0x0054, # T
7545     ]->[length $self->{kwd}] or
7546     $self->{nc} == [
7547     undef,
7548     0x0064, # d
7549     0x0061, # a
7550     0x0074, # t
7551     ]->[length $self->{kwd}]) {
7552    
7553     ## Stay in the state.
7554     $self->{kwd} .= chr $self->{nc};
7555    
7556     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7557     $self->{line_prev} = $self->{line};
7558     $self->{column_prev} = $self->{column};
7559     $self->{column}++;
7560     $self->{nc}
7561     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7562     } else {
7563     $self->{set_nc}->($self);
7564     }
7565    
7566     redo A;
7567     } elsif ((length $self->{kwd}) == 4 and
7568     ($self->{nc} == 0x0041 or # A
7569     $self->{nc} == 0x0061)) { # a
7570     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7571    
         ## Report the position of the keyword's first character.  Here
         ## column_prev is the column of the 4th keyword character, so the
         ## keyword starts at column_prev - 3; this equals the
         ## "column_prev + 1 - length kwd" formula used by the error arm
         ## below.  (The former "- 4" pointed one column before the "N".)
7572     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7573     text => 'NDATA',
7574     line => $self->{line_prev},
7575     column => $self->{column_prev} - 3);
7576     } else {
7577    
7578     }
7579     $self->{state} = AFTER_NDATA_STATE;
7580    
7581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7582     $self->{line_prev} = $self->{line};
7583     $self->{column_prev} = $self->{column};
7584     $self->{column}++;
7585     $self->{nc}
7586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7587     } else {
7588     $self->{set_nc}->($self);
7589     }
7590    
7591     redo A;
7592     } else {
         ## Points the error at the first character collected in kwd.
7593     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7594     line => $self->{line_prev},
7595     column => $self->{column_prev} + 1
7596     - length $self->{kwd});
7597    
7598     $self->{state} = BOGUS_MD_STATE;
7599     ## Reconsume.
7600     redo A;
7601     }
7602     } elsif ($self->{state} == AFTER_NDATA_STATE) {
         ## Runs right after the NDATA keyword has been matched.
         ##   space -> consume, go look for the notation name
         ##   ">"   -> parse error ('no notation name'), consume,
         ##            return $self->{ct}
         ##   -1    -> parse error ('unclosed md'), return $self->{ct}
         ##   other -> parse error, reconsume in BOGUS_MD_STATE
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7603     if ($is_space->{$self->{nc}}) {
7604     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7605    
7606     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7607     $self->{line_prev} = $self->{line};
7608     $self->{column_prev} = $self->{column};
7609     $self->{column}++;
7610     $self->{nc}
7611     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7612     } else {
7613     $self->{set_nc}->($self);
7614     }
7615    
7616     redo A;
7617     } elsif ($self->{nc} == 0x003E) { # >
7618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7619     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7620    
7621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7622     $self->{line_prev} = $self->{line};
7623     $self->{column_prev} = $self->{column};
7624     $self->{column}++;
7625     $self->{nc}
7626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7627     } else {
7628     $self->{set_nc}->($self);
7629     }
7630    
7631     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7632     redo A;
7633     } elsif ($self->{nc} == -1) {
7634     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7635     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7636    
7637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7638     $self->{line_prev} = $self->{line};
7639     $self->{column_prev} = $self->{column};
7640     $self->{column}++;
7641     $self->{nc}
7642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7643     } else {
7644     $self->{set_nc}->($self);
7645     }
7646    
7647     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7648     redo A;
7649     } else {
         ## NOTE(review): the column is derived from the length of
         ## $self->{kwd}; what kwd holds on entry to this state is set
         ## elsewhere -- confirm the reported position is the intended one.
7650     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7651     line => $self->{line_prev},
7652     column => $self->{column_prev} + 1
7653     - length $self->{kwd});
7654     $self->{state} = BOGUS_MD_STATE;
7655     ## Reconsume.
7656     redo A;
7657     }
7658     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
         ## Skips spaces ahead of the notation name.
         ##   space -> consume, stay
         ##   ">"   -> parse error ('no notation name'), consume,
         ##            return $self->{ct}
         ##   -1    -> parse error ('unclosed md'), return $self->{ct}
         ##   other -> becomes the first character of
         ##            $self->{ct}->{notation}; consume and continue in
         ##            NOTATION_NAME_STATE
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7659     if ($is_space->{$self->{nc}}) {
7660     ## Stay in the state.
7661    
7662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7663     $self->{line_prev} = $self->{line};
7664     $self->{column_prev} = $self->{column};
7665     $self->{column}++;
7666     $self->{nc}
7667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7668     } else {
7669     $self->{set_nc}->($self);
7670     }
7671    
7672     redo A;
7673     } elsif ($self->{nc} == 0x003E) { # >
7674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7675     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7676    
7677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678     $self->{line_prev} = $self->{line};
7679     $self->{column_prev} = $self->{column};
7680     $self->{column}++;
7681     $self->{nc}
7682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683     } else {
7684     $self->{set_nc}->($self);
7685     }
7686    
7687     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7688     redo A;
7689     } elsif ($self->{nc} == -1) {
7690     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7691     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7692    
7693     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7694     $self->{line_prev} = $self->{line};
7695     $self->{column_prev} = $self->{column};
7696     $self->{column}++;
7697     $self->{nc}
7698     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7699     } else {
7700     $self->{set_nc}->($self);
7701     }
7702    
7703     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7704     redo A;
7705     } else {
         ## Plain assignment (not .=): this starts a fresh notation name.
7706     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7707     $self->{state} = NOTATION_NAME_STATE;
7708    
7709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7710     $self->{line_prev} = $self->{line};
7711     $self->{column_prev} = $self->{column};
7712     $self->{column}++;
7713     $self->{nc}
7714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7715     } else {
7716     $self->{set_nc}->($self);
7717     }
7718    
7719     redo A;
7720     }
7721     } elsif ($self->{state} == NOTATION_NAME_STATE) {
         ## Accumulates the notation name in $self->{ct}->{notation}.
         ##   space -> name finished, consume, go to AFTER_MD_DEF_STATE
         ##   ">"   -> consume, return $self->{ct} (no error)
         ##   -1    -> parse error ('unclosed md'), return $self->{ct}
         ##   other -> append to the name, consume, stay
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7722     if ($is_space->{$self->{nc}}) {
7723 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7724 wakaba 1.18
7725     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7726     $self->{line_prev} = $self->{line};
7727     $self->{column_prev} = $self->{column};
7728     $self->{column}++;
7729     $self->{nc}
7730     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7731     } else {
7732     $self->{set_nc}->($self);
7733     }
7734    
7735     redo A;
7736     } elsif ($self->{nc} == 0x003E) { # >
7737     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7738    
7739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740     $self->{line_prev} = $self->{line};
7741     $self->{column_prev} = $self->{column};
7742     $self->{column}++;
7743     $self->{nc}
7744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745     } else {
7746     $self->{set_nc}->($self);
7747     }
7748    
7749     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7750     redo A;
7751     } elsif ($self->{nc} == -1) {
7752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7753     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7754    
7755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7756     $self->{line_prev} = $self->{line};
7757     $self->{column_prev} = $self->{column};
7758     $self->{column}++;
7759     $self->{nc}
7760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7761     } else {
7762     $self->{set_nc}->($self);
7763     }
7764    
7765     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7766     redo A;
7767     } else {
7768     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7769     ## Stay in the state.
7770    
7771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772     $self->{line_prev} = $self->{line};
7773     $self->{column_prev} = $self->{column};
7774     $self->{column}++;
7775     $self->{nc}
7776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777     } else {
7778     $self->{set_nc}->($self);
7779     }
7780    
7781     redo A;
7782     }
7783 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
         ## Reads a double-quoted entity value into $self->{ct}->{value}.
         ##   '"'   -> value finished, consume, go to AFTER_MD_DEF_STATE
         ##   "&"   -> remember this state in prev_state, record the
         ##            closing quote in entity_add, consume, and let
         ##            ENTITY_VALUE_ENTITY_STATE handle the reference
         ##   -1    -> parse error, return $self->{ct} WITHOUT consuming
         ##            (the -1 is reconsumed by the next state)
         ##   other -> append to the value, consume, stay
         ## "%" is not handled yet (see the TODO below).
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7784     if ($self->{nc} == 0x0022) { # "
7785 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7786 wakaba 1.19
7787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788     $self->{line_prev} = $self->{line};
7789     $self->{column_prev} = $self->{column};
7790     $self->{column}++;
7791     $self->{nc}
7792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793     } else {
7794     $self->{set_nc}->($self);
7795     }
7796    
7797     redo A;
7798     } elsif ($self->{nc} == 0x0026) { # &
7799     $self->{prev_state} = $self->{state};
7800     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7801     $self->{entity_add} = 0x0022; # "
7802    
7803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804     $self->{line_prev} = $self->{line};
7805     $self->{column_prev} = $self->{column};
7806     $self->{column}++;
7807     $self->{nc}
7808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809     } else {
7810     $self->{set_nc}->($self);
7811     }
7812    
7813     redo A;
7814     ## TODO: %
7815     } elsif ($self->{nc} == -1) {
7816     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7817     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7818     ## Reconsume.
7819     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7820     redo A;
7821     } else {
7822     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7823    
7824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7825     $self->{line_prev} = $self->{line};
7826     $self->{column_prev} = $self->{column};
7827     $self->{column}++;
7828     $self->{nc}
7829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7830     } else {
7831     $self->{set_nc}->($self);
7832     }
7833    
7834     redo A;
7835     }
7836     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
         ## Reads a single-quoted entity value into $self->{ct}->{value}.
         ##   "'"   -> value finished, consume, go to AFTER_MD_DEF_STATE
         ##   "&"   -> remember this state in prev_state, record the
         ##            closing quote in entity_add, consume, and let
         ##            ENTITY_VALUE_ENTITY_STATE handle the reference
         ##   -1    -> parse error, return $self->{ct} WITHOUT consuming
         ##            (the -1 is reconsumed by the next state)
         ##   other -> append to the value, consume, stay
         ## "%" is not handled yet (see the TODO below).
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7837     if ($self->{nc} == 0x0027) { # '
7838 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7839 wakaba 1.19
7840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7841     $self->{line_prev} = $self->{line};
7842     $self->{column_prev} = $self->{column};
7843     $self->{column}++;
7844     $self->{nc}
7845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7846     } else {
7847     $self->{set_nc}->($self);
7848     }
7849    
7850     redo A;
7851     } elsif ($self->{nc} == 0x0026) { # &
7852     $self->{prev_state} = $self->{state};
7853     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7854     $self->{entity_add} = 0x0027; # '
7855    
7856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7857     $self->{line_prev} = $self->{line};
7858     $self->{column_prev} = $self->{column};
7859     $self->{column}++;
7860     $self->{nc}
7861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7862     } else {
7863     $self->{set_nc}->($self);
7864     }
7865    
7866     redo A;
7867     ## TODO: %
7868     } elsif ($self->{nc} == -1) {
7869     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7870     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7871     ## Reconsume.
7872     return ($self->{ct}); # ENTITY
         ## Never reached: the return above leaves the sub first.
7873     redo A;
7874     } else {
7875     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7876    
7877     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7878     $self->{line_prev} = $self->{line};
7879     $self->{column_prev} = $self->{column};
7880     $self->{column}++;
7881     $self->{nc}
7882     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7883     } else {
7884     $self->{set_nc}->($self);
7885     }
7886    
7887     redo A;
7888     }
7889     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
         ## Entered after "&" inside an entity value; $self->{nc} is the
         ## character that follows the "&".
         ##   "#"  -> alias $self->{ca} to $self->{ct}, seed kwd with '#',
         ##           consume, and continue in ENTITY_HASH_STATE
         ##   every other case falls through to the code after this
         ##   if-chain, which appends a literal "&" to the value and
         ##   reconsumes the character in the state saved in prev_state.
         ##   Only the final "else" raises the 'bare ero' parse error;
         ##   the ASCII-letter arm is an empty placeholder (see the
         ##   "XMLize" TODO).
7890     ## TODO: XMLize
7891    
         ## The anonymous hash is rebuilt on every visit because the quote
         ## stored in entity_add is one of its keys.
7892     if ($is_space->{$self->{nc}} or
7893     {
7894     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7895     $self->{entity_add} => 1,
7896     }->{$self->{nc}}) {
7897     ## Don't consume
7898     ## No error
7899     ## Return nothing.
7900     #
7901     } elsif ($self->{nc} == 0x0023) { # #
7902     $self->{ca} = $self->{ct};
7903     $self->{state} = ENTITY_HASH_STATE;
7904     $self->{kwd} = '#';
7905    
7906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907     $self->{line_prev} = $self->{line};
7908     $self->{column_prev} = $self->{column};
7909     $self->{column}++;
7910     $self->{nc}
7911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912     } else {
7913     $self->{set_nc}->($self);
7914     }
7915    
7916     redo A;
7917     } elsif ((0x0041 <= $self->{nc} and
7918     $self->{nc} <= 0x005A) or # A..Z
7919     (0x0061 <= $self->{nc} and
7920     $self->{nc} <= 0x007A)) { # a..z
7921     #
7922     } else {
7923     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7924     ## Return nothing.
7925     #
7926     }
7927    
         ## Shared tail for every arm except "#": keep the "&" as literal
         ## text and resume the quoted-value state that sent us here.
7928     $self->{ct}->{value} .= '&';
7929     $self->{state} = $self->{prev_state};
7930     ## Reconsume.
7931     redo A;
7932 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
         ## Dispatch on the character that follows an element name.
         ##   space -> consume, go to BEFORE_ELEMENT_CONTENT_STATE
         ##   "("   -> start $self->{ct}->{content} as ['('], set
         ##            group_depth to 1, consume, go to
         ##            AFTER_CM_GROUP_OPEN_STATE
         ##   ">"   -> parse error ('no md def'), consume,
         ##            return $self->{ct}
         ##   -1    -> parse error ('unclosed md'), return $self->{ct}
         ##   other -> start $self->{ct}->{content} with this character as
         ##            its single item, consume, go to CONTENT_KEYWORD_STATE
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
7933     if ($is_space->{$self->{nc}}) {
7934     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7935    
7936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7937     $self->{line_prev} = $self->{line};
7938     $self->{column_prev} = $self->{column};
7939     $self->{column}++;
7940     $self->{nc}
7941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7942     } else {
7943     $self->{set_nc}->($self);
7944     }
7945    
7946     redo A;
7947     } elsif ($self->{nc} == 0x0028) { # (
7948     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7949     $self->{ct}->{content} = ['('];
7950     $self->{group_depth} = 1;
7951    
7952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7953     $self->{line_prev} = $self->{line};
7954     $self->{column_prev} = $self->{column};
7955     $self->{column}++;
7956     $self->{nc}
7957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7958     } else {
7959     $self->{set_nc}->($self);
7960     }
7961    
7962     redo A;
7963     } elsif ($self->{nc} == 0x003E) { # >
7964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7965     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7966    
7967     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7968     $self->{line_prev} = $self->{line};
7969     $self->{column_prev} = $self->{column};
7970     $self->{column}++;
7971     $self->{nc}
7972     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7973     } else {
7974     $self->{set_nc}->($self);
7975     }
7976    
7977     return ($self->{ct}); # ELEMENT
         ## Never reached: the return above leaves the sub first.
7978     redo A;
7979     } elsif ($self->{nc} == -1) {
7980     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7981     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7982    
7983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7984     $self->{line_prev} = $self->{line};
7985     $self->{column_prev} = $self->{column};
7986     $self->{column}++;
7987     $self->{nc}
7988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7989     } else {
7990     $self->{set_nc}->($self);
7991     }
7992    
7993     return ($self->{ct}); # ELEMENT
         ## Never reached: the return above leaves the sub first.
7994     redo A;
7995     } else {
7996     $self->{ct}->{content} = [chr $self->{nc}];
7997     $self->{state} = CONTENT_KEYWORD_STATE;
7998    
7999     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8000     $self->{line_prev} = $self->{line};
8001     $self->{column_prev} = $self->{column};
8002     $self->{column}++;
8003     $self->{nc}
8004     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8005     } else {
8006     $self->{set_nc}->($self);
8007     }
8008    
8009     redo A;
8010     }
8011     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
         ## Accumulates a content keyword in the last item of
         ## $self->{ct}->{content}.
         ##   space -> keyword finished, consume, go to AFTER_MD_DEF_STATE
         ##   ">"   -> consume, return $self->{ct} (no error)
         ##   -1    -> parse error ('unclosed md'), return $self->{ct}
         ##   other -> append to the last content item, consume, stay
         ## No validation of the keyword text happens in this state.
         ## Each "char_buffer_pos < length char_buffer" stanza consumes one
         ## character, from char_buffer if buffered, else via set_nc.
8012     if ($is_space->{$self->{nc}}) {
8013     $self->{state} = AFTER_MD_DEF_STATE;
8014    
8015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8016     $self->{line_prev} = $self->{line};
8017     $self->{column_prev} = $self->{column};
8018     $self->{column}++;
8019     $self->{nc}
8020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8021     } else {
8022     $self->{set_nc}->($self);
8023     }
8024    
8025     redo A;
8026     } elsif ($self->{nc} == 0x003E) { # >
8027     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8028    
8029     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8030     $self->{line_prev} = $self->{line};
8031     $self->{column_prev} = $self->{column};
8032     $self->{column}++;
8033     $self->{nc}
8034     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8035     } else {
8036     $self->{set_nc}->($self);
8037     }
8038    
8039     return ($self->{ct}); # ELEMENT
         ## Never reached: the return above leaves the sub first.
8040     redo A;
8041     } elsif ($self->{nc} == -1) {
8042     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8043     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8044    
8045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8046     $self->{line_prev} = $self->{line};
8047     $self->{column_prev} = $self->{column};
8048     $self->{column}++;
8049     $self->{nc}
8050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8051     } else {
8052     $self->{set_nc}->($self);
8053     }
8054    
8055     return ($self->{ct}); # ELEMENT
         ## Never reached: the return above leaves the sub first.
8056     redo A;
8057     } else {
8058     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8059     ## Stay in the state.
8060    
8061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8062     $self->{line_prev} = $self->{line};
8063     $self->{column_prev} = $self->{column};
8064     $self->{column}++;
8065     $self->{nc}
8066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8067     } else {
8068     $self->{set_nc}->($self);
8069     }
8070    
8071     redo A;
8072     }
8073     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8074     if ($is_space->{$self->{nc}}) {
8075     ## Stay in the state.
8076    
8077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8078     $self->{line_prev} = $self->{line};
8079     $self->{column_prev} = $self->{column};
8080     $self->{column}++;
8081     $self->{nc}
8082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8083     } else {
8084     $self->{set_nc}->($self);
8085     }
8086    
8087     redo A;
8088     } elsif ($self->{nc} == 0x0028) { # (
8089     $self->{group_depth}++;
8090     push @{$self->{ct}->{content}}, chr $self->{nc};
8091     ## Stay in the state.
8092    
8093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094     $self->{line_prev} = $self->{line};
8095     $self->{column_prev} = $self->{column};
8096     $self->{column}++;
8097     $self->{nc}
8098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099     } else {
8100     $self->{set_nc}->($self);
8101     }
8102    
8103     redo A;
8104     } elsif ($self->{nc} == 0x007C or # |
8105     $self->{nc} == 0x002C) { # ,
8106     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8107     ## Stay in the state.
8108    
8109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8110     $self->{line_prev} = $self->{line};
8111     $self->{column_prev} = $self->{column};
8112     $self->{column}++;
8113     $self->{nc}
8114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8115     } else {
8116     $self->{set_nc}->($self);
8117     }
8118    
8119     redo A;
8120     } elsif ($self->{nc} == 0x0029) { # )
8121     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8122     push @{$self->{ct}->{content}}, chr $self->{nc};
8123     $self->{group_depth}--;
8124     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8125    
8126     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127     $self->{line_prev} = $self->{line};
8128     $self->{column_prev} = $self->{column};
8129     $self->{column}++;
8130     $self->{nc}
8131     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132     } else {
8133     $self->{set_nc}->($self);
8134     }
8135    
8136     redo A;
8137     } elsif ($self->{nc} == 0x003E) { # >
8138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8139     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141    
8142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8143     $self->{line_prev} = $self->{line};
8144     $self->{column_prev} = $self->{column};
8145     $self->{column}++;
8146     $self->{nc}
8147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8148     } else {
8149     $self->{set_nc}->($self);
8150     }
8151    
8152     return ($self->{ct}); # ELEMENT
8153     redo A;
8154     } elsif ($self->{nc} == -1) {
8155     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8156     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8157     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8158    
8159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8160     $self->{line_prev} = $self->{line};
8161     $self->{column_prev} = $self->{column};
8162     $self->{column}++;
8163     $self->{nc}
8164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8165     } else {
8166     $self->{set_nc}->($self);
8167     }
8168    
8169     return ($self->{ct}); # ELEMENT
8170     redo A;
8171     } else {
8172     push @{$self->{ct}->{content}}, chr $self->{nc};
8173     $self->{state} = CM_ELEMENT_NAME_STATE;
8174    
8175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8176     $self->{line_prev} = $self->{line};
8177     $self->{column_prev} = $self->{column};
8178     $self->{column}++;
8179     $self->{nc}
8180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8181     } else {
8182     $self->{set_nc}->($self);
8183     }
8184    
8185     redo A;
8186     }
8187     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8188     if ($is_space->{$self->{nc}}) {
8189     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8190    
8191     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8192     $self->{line_prev} = $self->{line};
8193     $self->{column_prev} = $self->{column};
8194     $self->{column}++;
8195     $self->{nc}
8196     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8197     } else {
8198     $self->{set_nc}->($self);
8199     }
8200    
8201     redo A;
8202     } elsif ($self->{nc} == 0x002A or # *
8203     $self->{nc} == 0x002B or # +
8204     $self->{nc} == 0x003F) { # ?
8205     push @{$self->{ct}->{content}}, chr $self->{nc};
8206     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8207    
8208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8209     $self->{line_prev} = $self->{line};
8210     $self->{column_prev} = $self->{column};
8211     $self->{column}++;
8212     $self->{nc}
8213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8214     } else {
8215     $self->{set_nc}->($self);
8216     }
8217    
8218     redo A;
8219     } elsif ($self->{nc} == 0x007C or # |
8220     $self->{nc} == 0x002C) { # ,
8221     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8222     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8223    
8224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8225     $self->{line_prev} = $self->{line};
8226     $self->{column_prev} = $self->{column};
8227     $self->{column}++;
8228     $self->{nc}
8229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8230     } else {
8231     $self->{set_nc}->($self);
8232     }
8233    
8234     redo A;
8235     } elsif ($self->{nc} == 0x0029) { # )
8236     $self->{group_depth}--;
8237     push @{$self->{ct}->{content}}, chr $self->{nc};
8238     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8239    
8240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8241     $self->{line_prev} = $self->{line};
8242     $self->{column_prev} = $self->{column};
8243     $self->{column}++;
8244     $self->{nc}
8245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8246     } else {
8247     $self->{set_nc}->($self);
8248     }
8249    
8250     redo A;
8251     } elsif ($self->{nc} == 0x003E) { # >
8252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8253     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8254     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8255    
8256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8257     $self->{line_prev} = $self->{line};
8258     $self->{column_prev} = $self->{column};
8259     $self->{column}++;
8260     $self->{nc}
8261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8262     } else {
8263     $self->{set_nc}->($self);
8264     }
8265    
8266     return ($self->{ct}); # ELEMENT
8267     redo A;
8268     } elsif ($self->{nc} == -1) {
8269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8270     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8271     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272    
8273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274     $self->{line_prev} = $self->{line};
8275     $self->{column_prev} = $self->{column};
8276     $self->{column}++;
8277     $self->{nc}
8278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279     } else {
8280     $self->{set_nc}->($self);
8281     }
8282    
8283     return ($self->{ct}); # ELEMENT
8284     redo A;
8285     } else {
8286     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8287     ## Stay in the state.
8288    
8289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8290     $self->{line_prev} = $self->{line};
8291     $self->{column_prev} = $self->{column};
8292     $self->{column}++;
8293     $self->{nc}
8294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8295     } else {
8296     $self->{set_nc}->($self);
8297     }
8298    
8299     redo A;
8300     }
8301     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8302     if ($is_space->{$self->{nc}}) {
8303     ## Stay in the state.
8304    
8305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8306     $self->{line_prev} = $self->{line};
8307     $self->{column_prev} = $self->{column};
8308     $self->{column}++;
8309     $self->{nc}
8310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8311     } else {
8312     $self->{set_nc}->($self);
8313     }
8314    
8315     redo A;
8316     } elsif ($self->{nc} == 0x007C or # |
8317     $self->{nc} == 0x002C) { # ,
8318     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8319     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8320    
8321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322     $self->{line_prev} = $self->{line};
8323     $self->{column_prev} = $self->{column};
8324     $self->{column}++;
8325     $self->{nc}
8326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327     } else {
8328     $self->{set_nc}->($self);
8329     }
8330    
8331     redo A;
8332     } elsif ($self->{nc} == 0x0029) { # )
8333     $self->{group_depth}--;
8334     push @{$self->{ct}->{content}}, chr $self->{nc};
8335     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8336    
8337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338     $self->{line_prev} = $self->{line};
8339     $self->{column_prev} = $self->{column};
8340     $self->{column}++;
8341     $self->{nc}
8342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343     } else {
8344     $self->{set_nc}->($self);
8345     }
8346    
8347     redo A;
8348     } elsif ($self->{nc} == 0x003E) { # >
8349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8350     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8351     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8352    
8353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8354     $self->{line_prev} = $self->{line};
8355     $self->{column_prev} = $self->{column};
8356     $self->{column}++;
8357     $self->{nc}
8358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8359     } else {
8360     $self->{set_nc}->($self);
8361     }
8362    
8363     return ($self->{ct}); # ELEMENT
8364     redo A;
8365     } elsif ($self->{nc} == -1) {
8366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8367     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369    
8370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371     $self->{line_prev} = $self->{line};
8372     $self->{column_prev} = $self->{column};
8373     $self->{column}++;
8374     $self->{nc}
8375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376     } else {
8377     $self->{set_nc}->($self);
8378     }
8379    
8380     return ($self->{ct}); # ELEMENT
8381     redo A;
8382     } else {
8383     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8384     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385     $self->{state} = BOGUS_MD_STATE;
8386    
8387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388     $self->{line_prev} = $self->{line};
8389     $self->{column_prev} = $self->{column};
8390     $self->{column}++;
8391     $self->{nc}
8392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393     } else {
8394     $self->{set_nc}->($self);
8395     }
8396    
8397     redo A;
8398     }
8399     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8400     if ($is_space->{$self->{nc}}) {
8401     if ($self->{group_depth}) {
8402     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8403     } else {
8404     $self->{state} = AFTER_MD_DEF_STATE;
8405     }
8406    
8407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8408     $self->{line_prev} = $self->{line};
8409     $self->{column_prev} = $self->{column};
8410     $self->{column}++;
8411     $self->{nc}
8412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8413     } else {
8414     $self->{set_nc}->($self);
8415     }
8416    
8417     redo A;
8418     } elsif ($self->{nc} == 0x002A or # *
8419     $self->{nc} == 0x002B or # +
8420     $self->{nc} == 0x003F) { # ?
8421     push @{$self->{ct}->{content}}, chr $self->{nc};
8422     if ($self->{group_depth}) {
8423     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8424     } else {
8425     $self->{state} = AFTER_MD_DEF_STATE;
8426     }
8427    
8428     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8429     $self->{line_prev} = $self->{line};
8430     $self->{column_prev} = $self->{column};
8431     $self->{column}++;
8432     $self->{nc}
8433     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8434     } else {
8435     $self->{set_nc}->($self);
8436     }
8437    
8438     redo A;
8439     } elsif ($self->{nc} == 0x0029) { # )
8440     if ($self->{group_depth}) {
8441     $self->{group_depth}--;
8442     push @{$self->{ct}->{content}}, chr $self->{nc};
8443     ## Stay in the state.
8444    
8445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8446     $self->{line_prev} = $self->{line};
8447     $self->{column_prev} = $self->{column};
8448     $self->{column}++;
8449     $self->{nc}
8450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8451     } else {
8452     $self->{set_nc}->($self);
8453     }
8454    
8455     redo A;
8456     } else {
8457     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8458     $self->{state} = BOGUS_MD_STATE;
8459     ## Reconsume.
8460     redo A;
8461     }
8462     } elsif ($self->{nc} == 0x003E) { # >
8463     if ($self->{group_depth}) {
8464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8465     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8466     }
8467     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8468    
8469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8470     $self->{line_prev} = $self->{line};
8471     $self->{column_prev} = $self->{column};
8472     $self->{column}++;
8473     $self->{nc}
8474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8475     } else {
8476     $self->{set_nc}->($self);
8477     }
8478    
8479     return ($self->{ct}); # ELEMENT
8480     redo A;
8481     } elsif ($self->{nc} == -1) {
8482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8483     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8484     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485    
8486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487     $self->{line_prev} = $self->{line};
8488     $self->{column_prev} = $self->{column};
8489     $self->{column}++;
8490     $self->{nc}
8491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492     } else {
8493     $self->{set_nc}->($self);
8494     }
8495    
8496     return ($self->{ct}); # ELEMENT
8497     redo A;
8498     } else {
8499     if ($self->{group_depth}) {
8500     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8501     } else {
8502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8503     $self->{state} = BOGUS_MD_STATE;
8504     }
8505     ## Reconsume.
8506     redo A;
8507     }
8508     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8509 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8510     ## Stay in the state.
8511    
8512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8513     $self->{line_prev} = $self->{line};
8514     $self->{column_prev} = $self->{column};
8515     $self->{column}++;
8516     $self->{nc}
8517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8518     } else {
8519     $self->{set_nc}->($self);
8520     }
8521    
8522     redo A;
8523     } elsif ($self->{nc} == 0x003E) { # >
8524     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8525    
8526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8527     $self->{line_prev} = $self->{line};
8528     $self->{column_prev} = $self->{column};
8529     $self->{column}++;
8530     $self->{nc}
8531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8532     } else {
8533     $self->{set_nc}->($self);
8534     }
8535    
8536 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8537 wakaba 1.18 redo A;
8538     } elsif ($self->{nc} == -1) {
8539     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8541    
8542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8543     $self->{line_prev} = $self->{line};
8544     $self->{column_prev} = $self->{column};
8545     $self->{column}++;
8546     $self->{nc}
8547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8548     } else {
8549     $self->{set_nc}->($self);
8550     }
8551    
8552 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8553 wakaba 1.18 redo A;
8554     } else {
8555 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8556 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8557     ## Reconsume.
8558     redo A;
8559     }
8560 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8561     if ($self->{nc} == 0x003E) { # >
8562     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8563    
8564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8565     $self->{line_prev} = $self->{line};
8566     $self->{column_prev} = $self->{column};
8567     $self->{column}++;
8568     $self->{nc}
8569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8570     } else {
8571     $self->{set_nc}->($self);
8572     }
8573    
8574     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8575     redo A;
8576     } elsif ($self->{nc} == -1) {
8577     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8578     ## Reconsume.
8579     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8580     redo A;
8581     } else {
8582     ## Stay in the state.
8583    
8584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8585     $self->{line_prev} = $self->{line};
8586     $self->{column_prev} = $self->{column};
8587     $self->{column}++;
8588     $self->{nc}
8589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8590     } else {
8591     $self->{set_nc}->($self);
8592     }
8593    
8594     redo A;
8595     }
8596 wakaba 1.1 } else {
8597     die "$0: $self->{state}: Unknown state";
8598     }
8599     } # A
8600    
8601     die "$0: _get_next_token: unexpected case";
8602     } # _get_next_token
8603    
8604     1;
8605 wakaba 1.21 ## $Date: 2008/10/19 08:20:29 $
8606 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24