/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (hide annotations) (download)
Sun Oct 19 10:12:54 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.21: +25 -16 lines
++ whatpm/t/ChangeLog	19 Oct 2008 10:12:26 -0000
	* XML-Parser.t: "xml/entrefs-2.dat" added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 10:12:39 -0000
	* entrefs-2.dat: New test data file.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 10:11:55 -0000
	* Tokenizer.pm.src: Raise a parse error for '&' that does not
	introduce a reference in XML.  Support for non-ASCII entity
	reference names.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;

## Module version, derived from the CVS |Revision| keyword: every run of
## digits in the keyword is extracted and the result is formatted as
## "major.minor", the minor part being zero-padded to two digits.
our $VERSION=do{my @r=(q$Revision: 1.21 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Exporter set-up.
##
## This is done in a |BEGIN| block so that |@ISA|, |@EXPORT_OK| and
## |%EXPORT_TAGS| are already populated when |import| is invoked at
## compile time from another |BEGIN| block later in this same file.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Token type constants that can be imported on request.
  our @EXPORT_OK = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  ## The |:token| tag exports every token type constant.  The list is
  ## copied from |@EXPORT_OK| so that the two can never drift apart.
  our %EXPORT_TAGS = (
    token => [@EXPORT_OK],
  );
}
45    
## NOTE: Differences from the XML5 draft are marked as "XML5:".

## Token types
##
## Each constant is the numeric value stored in the |type| field of a
## token.  The empty prototype makes them usable as barewords.

sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }
sub PI_TOKEN () { 7 } ## NOTE: XML only.
sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.

## XML5: XML5 has an "empty tag token".  In this implementation, it is
## represented as a start tag token with the |$self->{self_closing}|
## flag set to true.

## XML5: XML5 has a "short end tag token".  In this implementation, it
## is represented as an end tag token whose |$token->{tag_name}| is
## set to an empty string.
72 wakaba 1.1
package Whatpm::HTML;

## Import the token type constants (the |:token| export tag) at compile
## time, so that the code below can refer to them as barewords such as
## |END_OF_FILE_TOKEN|.
BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
## Content model flags
##
## The three |CM_*| values are single bits; the four |*_CONTENT_MODEL|
## values are the combinations of those bits that the tokenizer tests
## with |&|.

sub CM_ENTITY () { 0b001 } # & markup in data
sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)

sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
## Tokenizer states
##
## Numeric identifiers for the value of |$self->{state}|.  The values
## 1 and 12 are not in use; their former constants are kept below,
## commented out.

sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }
sub PCDATA_STATE () { 50 } # "data state" in the spec

## XML-only states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: Both constants below evaluate to the same bit (2 ** 11).

sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
sub FOREIGN_EL () { 0b1_00000000000 }
203    
## Character reference mappings
##
## Maps a code point number to the code point that replaces it.
## (Presumably consulted when a numeric character reference is
## resolved; the consumer is not part of this section of the file.)
##
##   - 0x0D is mapped to 0x0A (LF).
##   - 0x80..0x9F are mapped as in Windows-1252; positions without
##     an assignment there are mapped to U+FFFD.
##   - Other control characters, surrogates and noncharacters (added
##     after the literal below) are mapped to U+FFFD.

my $charref_map = {
  0x0D => 0x000A,
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
        ## The last two code points (U+xxFFFE and U+xxFFFF) of each
        ## of the seventeen planes 0x00..0x10.
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00..0x10);
249    
## Implementations MUST act as if they used the state machine in the
## spec.

## Resets the tokenizer fields of |$self| to their initial values and
## requests the first input character.
##
## Argument: |$self|, the parser object (a hash reference).  Its
## |set_nc| field must already hold a code reference; it is invoked as
## |$self->{set_nc}->($self)|.
##
## Returns the value of the last assignment (the new, empty
## |$self->{token}| array reference).
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## Fetch the first input character.  The character buffer has just
  ## been emptied above, so there is never a buffered character to
  ## take here; the |set_nc| callback is always the one asked.
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
290    
## A token has:
##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
##   ->{name} (DOCTYPE_TOKEN)
##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
##   ->{target} (PI_TOKEN)
##   ->{pubid} (DOCTYPE_TOKEN)
##   ->{sysid} (DOCTYPE_TOKEN)
##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
##        ->{name}
##        ->{value}
##        ->{has_reference} == 1 or 0
##        ->{index}: Index of the attribute in a tag.
##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
309    
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
## |$token->{self_closing}| is used to save the value of
## |$self->{self_closing}| while the token is pushed back to the stack.
313    
## An emitted token MUST immediately be handled by the tree construction
## stage.
315    
## Before each step, the UA MAY check to see if either one of the
## scripts in the "list of scripts that will execute as soon as
## possible" or the first script in the "list of scripts that will
## execute asynchronously" has completed loading.  If one has, then it
## MUST be executed and removed from the list.
321    
## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
## (This requirement was dropped from the HTML5 spec, unfortunately.)
324    
## Lookup table of the code points that the tokenizer treats as space
## characters; look a code point number up and test for truth.
## U+000B and U+000D are deliberately absent (their entries are kept
## below, commented out).
my $is_space = {
  0x0009 => 1, # CHARACTER TABULATION (HT)
  0x000A => 1, # LINE FEED (LF)
  #0x000B => 0, # LINE TABULATION (VT)
  0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
  #0x000D => 1, # CARRIAGE RETURN (CR)
  0x0020 => 1, # SPACE (SP)
};
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not explicitly
926     ## state that the next input character is to be reconsumed,
927     ## it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
1281 wakaba 1.1 my $before_leave = sub {
     ## Commits the attribute currently being built ($self->{ca}) to the
     ## current tag token ($self->{ct}).  The branches below call this
     ## closure just before they switch away from the attribute name
     ## state (on white space, "=", ">", "/" and EOF).
     ##
     ## If the token already has an attribute with the same name, a
     ## 'duplicate attribute' parse error is reported at the position
     ## recorded in $self->{ca} and the attribute is not stored.
     ## Otherwise the attribute is stored under its name and is given
     ## an |index| taken from the token's |last_index| counter.
     ##
     ## Takes no arguments; its return value is not used by the callers
     ## visible in this state.
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
     ## The error points at the duplicate attribute itself (the
     ## line/column saved when $self->{ca} was created), not at the
     ## current input position.
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
1287     } else {
1288    
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
     ## |last_index| is pre-incremented, so each accepted attribute
     ## gets a distinct, increasing |index| (presumably used to recover
     ## source order from the name-keyed hash; confirm with consumers).
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743     if ($self->{nc} == 0x003D) { # =
1744    
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
## Attribute value (double-quoted) state.  Used for attribute values
## in tags as well as for default values in <!ATTLIST> declarations.
## XML5: "Tag attribute value double quoted state" and "DOCTYPE
## ATTLIST attribute value double quoted state".

if ($self->{nc} == -1) {
  ## EOF inside the quoted value: report it and emit whatever token
  ## has been collected so far.  The EOF itself is reconsumed by the
  ## state selected below.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag token without attributes should never get here.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    ## XML5: No parse error above; not defined yet.
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  return ($self->{ct}); # start tag, end tag or ATTLIST
}

if ($self->{nc} == 0x0022) { # "
  ## Closing quotation mark: the value is complete.
  if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    ## XML5: "DOCTYPE ATTLIST name after state".
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
  } else {
    ## XML5: "Tag attribute name before state".
    $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  }
} elsif ($self->{nc} == 0x0026) { # &
  ## XML5: Not defined yet.

  ## NOTE: In the spec, the tokenizer is switched to the "entity in
  ## attribute value state".  This implementation switches to
  ## |ENTITY_STATE| instead, which implements the "consume a
  ## character reference" algorithm; |prev_state| records where to
  ## come back to and |entity_add| the additional allowed character.
  $self->{prev_state} = $self->{state};
  $self->{entity_add} = 0x0022; # "
  $self->{state} = ENTITY_STATE;
} else {
  ## Ordinary character of the value.
  ## XML5 [ATTLIST]: Not defined yet.
  if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
    ## XML5: Not a parse error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
  }
  $self->{ca}->{value} .= chr ($self->{nc});
  ## Hand the value buffer to |read_until| together with the set of
  ## characters this state treats specially (presumably it appends
  ## the following run of ordinary characters in one go).
  $self->{read_until}->($self->{ca}->{value},
                        q["&<],
                        length $self->{ca}->{value});
  ## Stay in the state.
}

## Consume the next input character: take it from the character
## buffer if anything is left there, otherwise ask |set_nc| for it.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
} else {
  $self->{set_nc}->($self);
}

redo A;
1883     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
## Attribute value (single-quoted) state.  Used for attribute values
## in tags as well as for default values in <!ATTLIST> declarations.
## XML5: "Tag attribute value single quoted state" and "DOCTYPE
## ATTLIST attribute value single quoted state".

if ($self->{nc} == -1) {
  ## EOF inside the quoted value: report it and emit whatever token
  ## has been collected so far.  The EOF itself is reconsumed by the
  ## state selected below.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag token without attributes should never get here.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    ## XML5: No parse error above; not defined yet.
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  return ($self->{ct}); # start tag, end tag or ATTLIST
}

if ($self->{nc} == 0x0027) { # '
  ## Closing apostrophe: the value is complete.
  if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    ## XML5: "DOCTYPE ATTLIST name after state".
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
  } else {
    ## XML5: "Before attribute name state" (sic).
    $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  }
} elsif ($self->{nc} == 0x0026) { # &
  ## XML5: Not defined yet.

  ## NOTE: In the spec, the tokenizer is switched to the "entity in
  ## attribute value state".  This implementation switches to
  ## |ENTITY_STATE| instead, which implements the "consume a
  ## character reference" algorithm; |prev_state| records where to
  ## come back to and |entity_add| the additional allowed character.
  $self->{entity_add} = 0x0027; # '
  $self->{prev_state} = $self->{state};
  $self->{state} = ENTITY_STATE;
} else {
  ## Ordinary character of the value.
  ## XML5 [ATTLIST]: Not defined yet.
  if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
    ## XML5: Not a parse error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
  }
  $self->{ca}->{value} .= chr ($self->{nc});
  ## Hand the value buffer to |read_until| together with the set of
  ## characters this state treats specially (presumably it appends
  ## the following run of ordinary characters in one go).
  $self->{read_until}->($self->{ca}->{value},
                        q['&<],
                        length $self->{ca}->{value});
  ## Stay in the state.
}

## Consume the next input character: take it from the character
## buffer if anything is left there, otherwise ask |set_nc| for it.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
} else {
  $self->{set_nc}->($self);
}

redo A;
1997     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
## Attribute value (unquoted) state.
## XML5: "Tag attribute value unquoted state".

## Set when the current token is complete and has to be emitted once
## the terminating ">" has been consumed.
my $emit_token = 0;

if ($is_space->{$self->{nc}}) {
  ## White space ends the value.
  if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
  } else {
    ## XML5: "Tag attribute name before state".
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
  }
} elsif ($self->{nc} == 0x0026) { # &
  ## XML5: Not defined yet.

  ## NOTE: In the spec, the tokenizer is switched to the "entity in
  ## attribute value state".  This implementation switches to
  ## |ENTITY_STATE| instead, which implements the "consume a
  ## character reference" algorithm; |prev_state| records where to
  ## come back to.  There is no additional allowed character here,
  ## hence |entity_add| is -1.
  $self->{entity_add} = -1;
  $self->{prev_state} = $self->{state};
  $self->{state} = ENTITY_STATE;
} elsif ($self->{nc} == 0x003E) { # >
  ## End of the tag or declaration; the token is emitted below, after
  ## the ">" has been consumed.
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag token without attributes should never get here.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  $emit_token = 1;
} elsif ($self->{nc} == -1) {
  ## EOF inside the value: report it and emit the token collected so
  ## far.  The EOF is reconsumed by the state selected here.
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
    $self->{last_stag_name} = $self->{ct}->{tag_name};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag token without attributes should never get here.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  return ($self->{ct}); # start tag, end tag or ATTLIST
} else {
  ## Ordinary character of the value.  Quotation marks and "=" are
  ## accepted but reported.
  if ($self->{nc} == 0x0022 or # "
      $self->{nc} == 0x0027 or # '
      $self->{nc} == 0x003D) { # =
    ## XML5: Not a parse error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
  }
  $self->{ca}->{value} .= chr ($self->{nc});
  ## Hand the value buffer to |read_until| together with the set of
  ## characters this state treats specially (presumably it appends
  ## the following run of ordinary characters in one go).
  $self->{read_until}->($self->{ca}->{value},
                        q["'=& >],
                        length $self->{ca}->{value});
  ## Stay in the state.
}

## Consume the next input character: take it from the character
## buffer if anything is left there, otherwise ask |set_nc| for it.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
} else {
  $self->{set_nc}->($self);
}

return ($self->{ct}) if $emit_token; # start tag, end tag or ATTLIST
redo A;
2178     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
## After attribute value (quoted) state: the closing quotation mark of
## an attribute value in a tag has just been consumed.

if ($is_space->{$self->{nc}} or $self->{nc} == 0x002F) { # white space or /
  if ($is_space->{$self->{nc}}) {
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
  } else {
    $self->{state} = SELF_CLOSING_START_TAG_STATE;
  }

  ## Consume the next input character: take it from the character
  ## buffer if anything is left there, otherwise ask |set_nc| for it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{nc} == 0x003E or $self->{nc} == -1) { # > or EOF
  ## The tag ends here, either properly (">") or because the input
  ## ran out.  In both cases the current tag token is emitted.
  my $at_eof = ($self->{nc} == -1);
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag')
      if $at_eof;

  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    ## The content model is reset whenever an end tag is emitted.
    ## The EOF case used to skip this, unlike the other states that
    ## emit an end tag at EOF; it is done here for consistency.
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag token without attributes should never get here.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';

  unless ($at_eof) {
    ## Consume the ">".  (An EOF is left to be reconsumed.)
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
    } else {
      $self->{set_nc}->($self);
    }
  }

  return ($self->{ct}); # start tag or end tag
} else {
  ## Anything else starts the next attribute without a separating
  ## space: report it and reconsume in the before attribute name state.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
  $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
  ## reconsume
  redo A;
}
2270     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
## Self-closing start tag state: a "/" has been seen inside a tag.
## XML5: "Empty tag state".

if ($self->{nc} == 0x003E) { # >
  if ($self->{ct}->{type} == END_TAG_TOKEN) {
    ## "/>" at the end of an end tag is an error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
    ## TODO: Different type than slash in start tag
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
    ## TODO: Test |<title></title/>|
  } else {
    $self->{self_closing} = 1;
  }

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';

  ## Consume the ">": take the next character from the character
  ## buffer if anything is left there, otherwise ask |set_nc| for it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}); # start tag or end tag
} elsif ($self->{nc} == -1) {
  ## EOF: report it and emit the tag collected so far.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    ## The content model is reset whenever an end tag is emitted.
    ## This branch used to skip it, unlike the ">" branch above; it
    ## is done here for consistency.
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag token without attributes should never get here.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute')
        if $self->{ct}->{attributes};
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  ## XML5: "Tag attribute name before state".
  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';
  ## Reconsume.
  return ($self->{ct}); # start tag or end tag
} else {
  ## The "/" was not followed by ">": report it and treat the
  ## current character as the start of an attribute.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
  ## TODO: This error type is wrong.
  $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
  ## Reconsume.
  redo A;
}
2338     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
## Bogus comment state: everything up to the next ">" (or EOF)
## becomes the data of the current comment token.
## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".

## NOTE: Unlike the spec's "bogus comment state", this implementation
## consumes the characters on a one-by-one basis.

if ($self->{nc} == 0x003E or $self->{nc} == -1) { # > or EOF
  ## End of the comment.  A ">" is consumed; an EOF is left to be
  ## reconsumed by the next state.
  my $consume = ($self->{nc} == 0x003E);

  if ($self->{in_subset}) {
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  }

  if ($consume) {
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
    } else {
      $self->{set_nc}->($self);
    }
  }

  return ($self->{ct}); # comment
}

## Any other character is comment data.
$self->{ct}->{data} .= chr ($self->{nc}); # comment
## Hand the data buffer to |read_until| with ">" as the only special
## character (presumably it appends the following run of ordinary
## characters in one go).
$self->{read_until}->($self->{ct}->{data},
                      q[>],
                      length $self->{ct}->{data});

## Stay in the state.

## Consume the next input character: take it from the character
## buffer if anything is left there, otherwise ask |set_nc| for it.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
} else {
  $self->{set_nc}->($self);
}

redo A;
2401     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
## Markup declaration open state: "<!" has been seen.  The next
## character decides between a comment, a DOCTYPE, a CDATA section
## and a bogus comment.
## XML5: "Markup declaration state".

## State to continue in when the character introduces a known
## construct; left undefined for a bogus comment.
my $next_state;

if ($self->{nc} == 0x002D) { # -
  $next_state = MD_HYPHEN_STATE;
} elsif ($self->{nc} == 0x0044 or # D
         $self->{nc} == 0x0064) { # d
  ## ASCII case-insensitive.
  $next_state = MD_DOCTYPE_STATE;
  $self->{kwd} = chr $self->{nc};
} elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
           $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
          $self->{is_xml}) and
         $self->{nc} == 0x005B) { # [
  ## "<![" is only recognized in foreign content and in XML.
  $next_state = MD_CDATA_STATE;
  $self->{kwd} = '[';
}

if (defined $next_state) {
  $self->{state} = $next_state;

  ## Consume the next input character: take it from the character
  ## buffer if anything is left there, otherwise ask |set_nc| for it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
}

## Anything else: the declaration is treated as a bogus comment.  The
## reported position and the token position are both one column
## before the previous character.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 1);
## Reconsume.
$self->{state} = BOGUS_COMMENT_STATE;
$self->{ct} = {
  type => COMMENT_TOKEN,
  data => '',
  line => $self->{line_prev},
  column => $self->{column_prev} - 1,
};
redo A;
2470     } elsif ($self->{state} == MD_HYPHEN_STATE) {
## "<!-" has been seen.  A second "-" opens a real comment; anything
## else turns the declaration into a bogus comment.

unless ($self->{nc} == 0x002D) { # not -
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 2);
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  ## The "-" consumed earlier becomes the first character of the data.
  $self->{ct} = {
    type => COMMENT_TOKEN,
    data => '-',
    line => $self->{line_prev},
    column => $self->{column_prev} - 2,
  };
  redo A;
}

## "<!--": start an empty comment token two columns before the
## previous character.
$self->{ct} = {
  type => COMMENT_TOKEN,
  data => '',
  line => $self->{line_prev},
  column => $self->{column_prev} - 2,
};
$self->{state} = COMMENT_START_STATE; ## XML5: "comment state".

## Consume the next input character: take it from the character
## buffer if anything is left there, otherwise ask |set_nc| for it.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
} else {
  $self->{set_nc}->($self);
}

redo A;
2504     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
## "<!D..." has been seen; |kwd| holds the letters of "DOCTYPE"
## matched so far (in the case they were written in).
## ASCII case-insensitive.

## Upper-case code point expected after |length kwd| letters.  Only
## positions 1..5 have an entry; the final "E" is handled separately
## below, so for any other length the lookup yields undef.
my $expected = [
  undef,
  0x004F, # O
  0x0043, # C
  0x0054, # T
  0x0059, # Y
  0x0050, # P
]->[length $self->{kwd}];

## The |defined| guard matters: comparing against undef with |==|
## treats it as 0, so a U+0000 character would otherwise be accepted
## as a keyword letter once the table has run out.
if (defined $expected and
    ($self->{nc} == $expected or            # upper case
     $self->{nc} == $expected + 0x0020)) {  # lower case
  ## Stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Consume the next input character: take it from the character
  ## buffer if anything is left there, otherwise ask |set_nc| for it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ((length $self->{kwd}) == 6 and
         ($self->{nc} == 0x0045 or    # E
          $self->{nc} == 0x0065)) {   # e
  ## The whole keyword has been matched.
  if ($self->{is_xml} and
      ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
    ## In XML the keyword has to be written in upper case.
    ## XML5: case-sensitive.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
                           text => 'DOCTYPE',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 5);
  }
  $self->{state} = DOCTYPE_STATE;
  $self->{ct} = {
    type => DOCTYPE_TOKEN,
    quirks => 1,
    line => $self->{line_prev},
    column => $self->{column_prev} - 7,
  };

  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} else {
  ## Not "DOCTYPE" after all: the letters matched so far become the
  ## beginning of a bogus comment.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1 - length $self->{kwd});
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {
    type => COMMENT_TOKEN,
    data => $self->{kwd},
    line => $self->{line_prev},
    column => $self->{column_prev} - 1 - length $self->{kwd},
  };
  redo A;
}
2583     } elsif ($self->{state} == MD_CDATA_STATE) {
## "<![" has been seen; |kwd| holds the part of "[CDATA[" matched so
## far.  The keyword is matched case-sensitively.

## Code point expected after the current |kwd|.  The lookup yields
## undef once |kwd| is "[CDATA" (the final "[" is handled below).
my $expected = {
  '[' => 0x0043, # C
  '[C' => 0x0044, # D
  '[CD' => 0x0041, # A
  '[CDA' => 0x0054, # T
  '[CDAT' => 0x0041, # A
}->{$self->{kwd}};

## The |defined| guard matters: comparing against undef with |==|
## treats it as 0, so a U+0000 character would otherwise be accepted
## as a keyword letter once the table has run out.
if (defined $expected and $self->{nc} == $expected) {
  ## Stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Consume the next input character: take it from the character
  ## buffer if anything is left there, otherwise ask |set_nc| for it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{kwd} eq '[CDATA' and
         $self->{nc} == 0x005B) { # [
  ## "<![CDATA[" is complete.
  if ($self->{is_xml} and
      not $self->{tainted} and
      @{$self->{open_elements} or []} == 0) {
    ## In XML, a CDATA section while no element is open is reported,
    ## but only once (|tainted| is set afterwards).
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 7);
    $self->{tainted} = 1;
  }

  ## The content of the section is collected into a character token.
  $self->{ct} = {
    type => CHARACTER_TOKEN,
    data => '',
    line => $self->{line_prev},
    column => $self->{column_prev} - 7,
  };
  $self->{state} = CDATA_SECTION_STATE;

  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} else {
  ## Not "[CDATA[" after all: the characters matched so far become
  ## the beginning of a bogus comment.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1 - length $self->{kwd});
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {
    type => COMMENT_TOKEN,
    data => $self->{kwd},
    line => $self->{line_prev},
    column => $self->{column_prev} - 1 - length $self->{kwd},
  };
  redo A;
}
2651     } elsif ($self->{state} == COMMENT_START_STATE) {
## Comment start state: "<!--" has just been consumed.

if ($self->{nc} == 0x003E or $self->{nc} == -1) { # > or EOF
  ## The comment ends before it has any content: "<!-->" is reported
  ## as a bogus comment, an EOF as an unclosed comment.  Either way
  ## the (empty) comment token is emitted.
  my $at_eof = ($self->{nc} == -1);
  if ($at_eof) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
  } else {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
  }

  if ($self->{in_subset}) {
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  }

  unless ($at_eof) {
    ## Consume the ">".  (An EOF is left to be reconsumed.)
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
    } else {
      $self->{set_nc}->($self);
    }
  }

  return ($self->{ct}); # comment
}

if ($self->{nc} == 0x002D) { # -
  $self->{state} = COMMENT_START_DASH_STATE;
} else {
  ## First character of the comment data.
  $self->{ct}->{data} .= chr ($self->{nc}); # comment
  $self->{state} = COMMENT_STATE;
}

## Consume the next input character: take it from the character
## buffer if anything is left there, otherwise ask |set_nc| for it.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc} = ord (substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
} else {
  $self->{set_nc}->($self);
}

redo A;
2725     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726     if ($self->{nc} == 0x002D) { # -
2727    
2728     $self->{state} = COMMENT_END_STATE;
2729    
2730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731     $self->{line_prev} = $self->{line};
2732     $self->{column_prev} = $self->{column};
2733     $self->{column}++;
2734     $self->{nc}
2735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736     } else {
2737     $self->{set_nc}->($self);
2738     }
2739    
2740     redo A;
2741     } elsif ($self->{nc} == 0x003E) { # >
2742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 wakaba 1.13 if ($self->{in_subset}) {
2744    
2745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746     } else {
2747    
2748     $self->{state} = DATA_STATE;
2749     $self->{s_kwd} = '';
2750     }
2751 wakaba 1.1
2752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753     $self->{line_prev} = $self->{line};
2754     $self->{column_prev} = $self->{column};
2755     $self->{column}++;
2756     $self->{nc}
2757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758     } else {
2759     $self->{set_nc}->($self);
2760     }
2761    
2762    
2763     return ($self->{ct}); # comment
2764    
2765     redo A;
2766     } elsif ($self->{nc} == -1) {
2767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 wakaba 1.13 if ($self->{in_subset}) {
2769    
2770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771     } else {
2772    
2773     $self->{state} = DATA_STATE;
2774     $self->{s_kwd} = '';
2775     }
2776 wakaba 1.1 ## reconsume
2777    
2778     return ($self->{ct}); # comment
2779    
2780     redo A;
2781     } else {
2782    
2783     $self->{ct}->{data} # comment
2784     .= '-' . chr ($self->{nc});
2785     $self->{state} = COMMENT_STATE;
2786    
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797     redo A;
2798     }
2799     } elsif ($self->{state} == COMMENT_STATE) {
2800 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2801    
2802 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2803    
2804     $self->{state} = COMMENT_END_DASH_STATE;
2805    
2806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807     $self->{line_prev} = $self->{line};
2808     $self->{column_prev} = $self->{column};
2809     $self->{column}++;
2810     $self->{nc}
2811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812     } else {
2813     $self->{set_nc}->($self);
2814     }
2815    
2816     redo A;
2817     } elsif ($self->{nc} == -1) {
2818     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 wakaba 1.13 if ($self->{in_subset}) {
2820    
2821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822     } else {
2823    
2824     $self->{state} = DATA_STATE;
2825     $self->{s_kwd} = '';
2826     }
2827 wakaba 1.1 ## reconsume
2828    
2829     return ($self->{ct}); # comment
2830    
2831     redo A;
2832     } else {
2833    
2834     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835     $self->{read_until}->($self->{ct}->{data},
2836     q[-],
2837     length $self->{ct}->{data});
2838    
2839     ## Stay in the state
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     }
2853     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855 wakaba 1.10
2856 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2857    
2858     $self->{state} = COMMENT_END_STATE;
2859    
2860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861     $self->{line_prev} = $self->{line};
2862     $self->{column_prev} = $self->{column};
2863     $self->{column}++;
2864     $self->{nc}
2865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866     } else {
2867     $self->{set_nc}->($self);
2868     }
2869    
2870     redo A;
2871     } elsif ($self->{nc} == -1) {
2872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 wakaba 1.13 if ($self->{in_subset}) {
2874    
2875     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876     } else {
2877    
2878     $self->{state} = DATA_STATE;
2879     $self->{s_kwd} = '';
2880     }
2881 wakaba 1.1 ## reconsume
2882    
2883     return ($self->{ct}); # comment
2884    
2885     redo A;
2886     } else {
2887    
2888     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889     $self->{state} = COMMENT_STATE;
2890    
2891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892     $self->{line_prev} = $self->{line};
2893     $self->{column_prev} = $self->{column};
2894     $self->{column}++;
2895     $self->{nc}
2896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897     } else {
2898     $self->{set_nc}->($self);
2899     }
2900    
2901     redo A;
2902     }
2903     } elsif ($self->{state} == COMMENT_END_STATE) {
2904 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905    
2906 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2907 wakaba 1.13 if ($self->{in_subset}) {
2908    
2909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910     } else {
2911    
2912     $self->{state} = DATA_STATE;
2913     $self->{s_kwd} = '';
2914     }
2915 wakaba 1.1
2916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917     $self->{line_prev} = $self->{line};
2918     $self->{column_prev} = $self->{column};
2919     $self->{column}++;
2920     $self->{nc}
2921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922     } else {
2923     $self->{set_nc}->($self);
2924     }
2925    
2926    
2927     return ($self->{ct}); # comment
2928    
2929     redo A;
2930     } elsif ($self->{nc} == 0x002D) { # -
2931    
2932 wakaba 1.10 ## XML5: Not a parse error.
2933 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934     line => $self->{line_prev},
2935     column => $self->{column_prev});
2936     $self->{ct}->{data} .= '-'; # comment
2937     ## Stay in the state
2938    
2939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940     $self->{line_prev} = $self->{line};
2941     $self->{column_prev} = $self->{column};
2942     $self->{column}++;
2943     $self->{nc}
2944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945     } else {
2946     $self->{set_nc}->($self);
2947     }
2948    
2949     redo A;
2950     } elsif ($self->{nc} == -1) {
2951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 wakaba 1.13 if ($self->{in_subset}) {
2953    
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     } else {
2956    
2957     $self->{state} = DATA_STATE;
2958     $self->{s_kwd} = '';
2959     }
2960 wakaba 1.1 ## reconsume
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } else {
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972     $self->{state} = COMMENT_STATE;
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     }
2986     } elsif ($self->{state} == DOCTYPE_STATE) {
2987     if ($is_space->{$self->{nc}}) {
2988    
2989     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990    
2991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992     $self->{line_prev} = $self->{line};
2993     $self->{column_prev} = $self->{column};
2994     $self->{column}++;
2995     $self->{nc}
2996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997     } else {
2998     $self->{set_nc}->($self);
2999     }
3000    
3001     redo A;
3002     } else {
3003    
3004 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3005 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007     ## reconsume
3008     redo A;
3009     }
3010     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3012    
3013 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3014    
3015     ## Stay in the state
3016    
3017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018     $self->{line_prev} = $self->{line};
3019     $self->{column_prev} = $self->{column};
3020     $self->{column}++;
3021     $self->{nc}
3022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023     } else {
3024     $self->{set_nc}->($self);
3025     }
3026    
3027     redo A;
3028     } elsif ($self->{nc} == 0x003E) { # >
3029    
3030 wakaba 1.12 ## XML5: No parse error.
3031 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032     $self->{state} = DATA_STATE;
3033 wakaba 1.5 $self->{s_kwd} = '';
3034 wakaba 1.1
3035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036     $self->{line_prev} = $self->{line};
3037     $self->{column_prev} = $self->{column};
3038     $self->{column}++;
3039     $self->{nc}
3040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041     } else {
3042     $self->{set_nc}->($self);
3043     }
3044    
3045    
3046     return ($self->{ct}); # DOCTYPE (quirks)
3047    
3048     redo A;
3049     } elsif ($self->{nc} == -1) {
3050    
3051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052     $self->{state} = DATA_STATE;
3053 wakaba 1.5 $self->{s_kwd} = '';
3054 wakaba 1.1 ## reconsume
3055    
3056     return ($self->{ct}); # DOCTYPE (quirks)
3057    
3058     redo A;
3059 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060    
3061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064     $self->{in_subset} = 1;
3065 wakaba 1.12
3066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067     $self->{line_prev} = $self->{line};
3068     $self->{column_prev} = $self->{column};
3069     $self->{column}++;
3070     $self->{nc}
3071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072     } else {
3073     $self->{set_nc}->($self);
3074     }
3075    
3076 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3077 wakaba 1.12 redo A;
3078 wakaba 1.1 } else {
3079    
3080     $self->{ct}->{name} = chr $self->{nc};
3081     delete $self->{ct}->{quirks};
3082     $self->{state} = DOCTYPE_NAME_STATE;
3083    
3084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085     $self->{line_prev} = $self->{line};
3086     $self->{column_prev} = $self->{column};
3087     $self->{column}++;
3088     $self->{nc}
3089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090     } else {
3091     $self->{set_nc}->($self);
3092     }
3093    
3094     redo A;
3095     }
3096     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3098    
3099     ## ISSUE: Redundant "First," in the spec.
3100    
3101 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3102    
3103     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == 0x003E) { # >
3117    
3118     $self->{state} = DATA_STATE;
3119 wakaba 1.5 $self->{s_kwd} = '';
3120 wakaba 1.1
3121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122     $self->{line_prev} = $self->{line};
3123     $self->{column_prev} = $self->{column};
3124     $self->{column}++;
3125     $self->{nc}
3126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127     } else {
3128     $self->{set_nc}->($self);
3129     }
3130    
3131    
3132     return ($self->{ct}); # DOCTYPE
3133    
3134     redo A;
3135     } elsif ($self->{nc} == -1) {
3136    
3137     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138     $self->{state} = DATA_STATE;
3139 wakaba 1.5 $self->{s_kwd} = '';
3140 wakaba 1.1 ## reconsume
3141    
3142     $self->{ct}->{quirks} = 1;
3143     return ($self->{ct}); # DOCTYPE
3144    
3145     redo A;
3146 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147    
3148     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150     $self->{in_subset} = 1;
3151 wakaba 1.12
3152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153     $self->{line_prev} = $self->{line};
3154     $self->{column_prev} = $self->{column};
3155     $self->{column}++;
3156     $self->{nc}
3157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158     } else {
3159     $self->{set_nc}->($self);
3160     }
3161    
3162 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3163 wakaba 1.12 redo A;
3164 wakaba 1.1 } else {
3165    
3166     $self->{ct}->{name}
3167     .= chr ($self->{nc}); # DOCTYPE
3168     ## Stay in the state
3169    
3170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171     $self->{line_prev} = $self->{line};
3172     $self->{column_prev} = $self->{column};
3173     $self->{column}++;
3174     $self->{nc}
3175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176     } else {
3177     $self->{set_nc}->($self);
3178     }
3179    
3180     redo A;
3181     }
3182     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184     ## state", but implemented differently.
3185    
3186 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3187    
3188     ## Stay in the state
3189    
3190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191     $self->{line_prev} = $self->{line};
3192     $self->{column_prev} = $self->{column};
3193     $self->{column}++;
3194     $self->{nc}
3195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196     } else {
3197     $self->{set_nc}->($self);
3198     }
3199    
3200     redo A;
3201     } elsif ($self->{nc} == 0x003E) { # >
3202 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203    
3204     $self->{state} = DATA_STATE;
3205     $self->{s_kwd} = '';
3206     } else {
3207    
3208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210     }
3211 wakaba 1.1
3212    
3213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214     $self->{line_prev} = $self->{line};
3215     $self->{column_prev} = $self->{column};
3216     $self->{column}++;
3217     $self->{nc}
3218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219     } else {
3220     $self->{set_nc}->($self);
3221     }
3222    
3223 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 wakaba 1.1 redo A;
3225     } elsif ($self->{nc} == -1) {
3226 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227    
3228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229     $self->{state} = DATA_STATE;
3230     $self->{s_kwd} = '';
3231     $self->{ct}->{quirks} = 1;
3232     } else {
3233    
3234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236     }
3237 wakaba 1.1
3238 wakaba 1.16 ## Reconsume.
3239     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 wakaba 1.1 redo A;
3241     } elsif ($self->{nc} == 0x0050 or # P
3242     $self->{nc} == 0x0070) { # p
3243 wakaba 1.12
3244 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3245 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3246 wakaba 1.1
3247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248     $self->{line_prev} = $self->{line};
3249     $self->{column_prev} = $self->{column};
3250     $self->{column}++;
3251     $self->{nc}
3252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253     } else {
3254     $self->{set_nc}->($self);
3255     }
3256    
3257     redo A;
3258     } elsif ($self->{nc} == 0x0053 or # S
3259     $self->{nc} == 0x0073) { # s
3260 wakaba 1.12
3261 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3262 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3263    
3264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265     $self->{line_prev} = $self->{line};
3266     $self->{column_prev} = $self->{column};
3267     $self->{column}++;
3268     $self->{nc}
3269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270     } else {
3271     $self->{set_nc}->($self);
3272     }
3273    
3274     redo A;
3275 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3276     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278    
3279     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280     $self->{ct}->{value} = ''; # ENTITY
3281    
3282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283     $self->{line_prev} = $self->{line};
3284     $self->{column_prev} = $self->{column};
3285     $self->{column}++;
3286     $self->{nc}
3287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288     } else {
3289     $self->{set_nc}->($self);
3290     }
3291    
3292     redo A;
3293     } elsif ($self->{nc} == 0x0027 and # '
3294     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296    
3297     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298     $self->{ct}->{value} = ''; # ENTITY
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310     redo A;
3311 wakaba 1.16 } elsif ($self->{is_xml} and
3312     $self->{ct}->{type} == DOCTYPE_TOKEN and
3313     $self->{nc} == 0x005B) { # [
3314 wakaba 1.12
3315     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 wakaba 1.13 $self->{in_subset} = 1;
3318 wakaba 1.1
3319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320     $self->{line_prev} = $self->{line};
3321     $self->{column_prev} = $self->{column};
3322     $self->{column}++;
3323     $self->{nc}
3324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325     } else {
3326     $self->{set_nc}->($self);
3327     }
3328    
3329 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3330 wakaba 1.1 redo A;
3331     } else {
3332 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333    
3334     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335    
3336     $self->{ct}->{quirks} = 1;
3337     $self->{state} = BOGUS_DOCTYPE_STATE;
3338     } else {
3339    
3340     $self->{state} = BOGUS_MD_STATE;
3341     }
3342 wakaba 1.1
3343    
3344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345     $self->{line_prev} = $self->{line};
3346     $self->{column_prev} = $self->{column};
3347     $self->{column}++;
3348     $self->{nc}
3349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350     } else {
3351     $self->{set_nc}->($self);
3352     }
3353    
3354     redo A;
3355     }
3356     } elsif ($self->{state} == PUBLIC_STATE) {
3357     ## ASCII case-insensitive
3358     if ($self->{nc} == [
3359     undef,
3360     0x0055, # U
3361     0x0042, # B
3362     0x004C, # L
3363     0x0049, # I
3364 wakaba 1.12 ]->[length $self->{kwd}] or
3365 wakaba 1.1 $self->{nc} == [
3366     undef,
3367     0x0075, # u
3368     0x0062, # b
3369     0x006C, # l
3370     0x0069, # i
3371 wakaba 1.12 ]->[length $self->{kwd}]) {
3372 wakaba 1.1
3373     ## Stay in the state.
3374 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3375 wakaba 1.1
3376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377     $self->{line_prev} = $self->{line};
3378     $self->{column_prev} = $self->{column};
3379     $self->{column}++;
3380     $self->{nc}
3381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382     } else {
3383     $self->{set_nc}->($self);
3384     }
3385    
3386     redo A;
3387 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3388 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3389     $self->{nc} == 0x0063)) { # c
3390 wakaba 1.12 if ($self->{is_xml} and
3391     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392    
3393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394     text => 'PUBLIC',
3395     line => $self->{line_prev},
3396     column => $self->{column_prev} - 4);
3397     } else {
3398    
3399     }
3400 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401    
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412     redo A;
3413     } else {
3414 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 wakaba 1.1 line => $self->{line_prev},
3416 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418    
3419     $self->{ct}->{quirks} = 1;
3420     $self->{state} = BOGUS_DOCTYPE_STATE;
3421     } else {
3422    
3423     $self->{state} = BOGUS_MD_STATE;
3424     }
3425 wakaba 1.1 ## Reconsume.
3426     redo A;
3427     }
3428     } elsif ($self->{state} == SYSTEM_STATE) {
3429     ## ASCII case-insensitive
3430     if ($self->{nc} == [
3431     undef,
3432     0x0059, # Y
3433     0x0053, # S
3434     0x0054, # T
3435     0x0045, # E
3436 wakaba 1.12 ]->[length $self->{kwd}] or
3437 wakaba 1.1 $self->{nc} == [
3438     undef,
3439     0x0079, # y
3440     0x0073, # s
3441     0x0074, # t
3442     0x0065, # e
3443 wakaba 1.12 ]->[length $self->{kwd}]) {
3444 wakaba 1.1
3445     ## Stay in the state.
3446 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3447 wakaba 1.1
3448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449     $self->{line_prev} = $self->{line};
3450     $self->{column_prev} = $self->{column};
3451     $self->{column}++;
3452     $self->{nc}
3453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454     } else {
3455     $self->{set_nc}->($self);
3456     }
3457    
3458     redo A;
3459 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3460 wakaba 1.1 ($self->{nc} == 0x004D or # M
3461     $self->{nc} == 0x006D)) { # m
3462 wakaba 1.12 if ($self->{is_xml} and
3463     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464    
3465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466     text => 'SYSTEM',
3467     line => $self->{line_prev},
3468     column => $self->{column_prev} - 4);
3469     } else {
3470    
3471     }
3472 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473    
3474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475     $self->{line_prev} = $self->{line};
3476     $self->{column_prev} = $self->{column};
3477     $self->{column}++;
3478     $self->{nc}
3479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480     } else {
3481     $self->{set_nc}->($self);
3482     }
3483    
3484     redo A;
3485     } else {
3486 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 wakaba 1.1 line => $self->{line_prev},
3488 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490    
3491     $self->{ct}->{quirks} = 1;
3492     $self->{state} = BOGUS_DOCTYPE_STATE;
3493     } else {
3494    
3495     $self->{state} = BOGUS_MD_STATE;
3496     }
3497 wakaba 1.1 ## Reconsume.
3498     redo A;
3499     }
3500     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501     if ($is_space->{$self->{nc}}) {
3502    
3503     ## Stay in the state
3504    
3505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506     $self->{line_prev} = $self->{line};
3507     $self->{column_prev} = $self->{column};
3508     $self->{column}++;
3509     $self->{nc}
3510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511     } else {
3512     $self->{set_nc}->($self);
3513     }
3514    
3515     redo A;
3516     } elsif ($self->{nc} eq 0x0022) { # "
3517    
3518     $self->{ct}->{pubid} = ''; # DOCTYPE
3519     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520    
3521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522     $self->{line_prev} = $self->{line};
3523     $self->{column_prev} = $self->{column};
3524     $self->{column}++;
3525     $self->{nc}
3526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527     } else {
3528     $self->{set_nc}->($self);
3529     }
3530    
3531     redo A;
3532     } elsif ($self->{nc} eq 0x0027) { # '
3533    
3534     $self->{ct}->{pubid} = ''; # DOCTYPE
3535     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536    
3537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538     $self->{line_prev} = $self->{line};
3539     $self->{column_prev} = $self->{column};
3540     $self->{column}++;
3541     $self->{nc}
3542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543     } else {
3544     $self->{set_nc}->($self);
3545     }
3546    
3547     redo A;
3548     } elsif ($self->{nc} eq 0x003E) { # >
3549 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550    
3551     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552    
3553     $self->{state} = DATA_STATE;
3554     $self->{s_kwd} = '';
3555     $self->{ct}->{quirks} = 1;
3556     } else {
3557    
3558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559     }
3560 wakaba 1.1
3561    
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 wakaba 1.1 redo A;
3574     } elsif ($self->{nc} == -1) {
3575 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576    
3577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578     $self->{state} = DATA_STATE;
3579     $self->{s_kwd} = '';
3580     $self->{ct}->{quirks} = 1;
3581     } else {
3582    
3583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585     }
3586 wakaba 1.1
3587     ## reconsume
3588     return ($self->{ct}); # DOCTYPE
3589     redo A;
3590 wakaba 1.16 } elsif ($self->{is_xml} and
3591     $self->{ct}->{type} == DOCTYPE_TOKEN and
3592     $self->{nc} == 0x005B) { # [
3593 wakaba 1.12
3594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 wakaba 1.13 $self->{in_subset} = 1;
3598 wakaba 1.12
3599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600     $self->{line_prev} = $self->{line};
3601     $self->{column_prev} = $self->{column};
3602     $self->{column}++;
3603     $self->{nc}
3604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605     } else {
3606     $self->{set_nc}->($self);
3607     }
3608    
3609 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3610 wakaba 1.12 redo A;
3611 wakaba 1.1 } else {
3612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613    
3614 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615    
3616     $self->{ct}->{quirks} = 1;
3617     $self->{state} = BOGUS_DOCTYPE_STATE;
3618     } else {
3619    
3620     $self->{state} = BOGUS_MD_STATE;
3621     }
3622    
3623 wakaba 1.1
3624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625     $self->{line_prev} = $self->{line};
3626     $self->{column_prev} = $self->{column};
3627     $self->{column}++;
3628     $self->{nc}
3629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630     } else {
3631     $self->{set_nc}->($self);
3632     }
3633    
3634     redo A;
3635     }
3636     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637     if ($self->{nc} == 0x0022) { # "
3638    
3639     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640    
3641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642     $self->{line_prev} = $self->{line};
3643     $self->{column_prev} = $self->{column};
3644     $self->{column}++;
3645     $self->{nc}
3646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647     } else {
3648     $self->{set_nc}->($self);
3649     }
3650    
3651     redo A;
3652     } elsif ($self->{nc} == 0x003E) { # >
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654    
3655 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656    
3657     $self->{state} = DATA_STATE;
3658     $self->{s_kwd} = '';
3659     $self->{ct}->{quirks} = 1;
3660     } else {
3661    
3662     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663     }
3664    
3665 wakaba 1.1
3666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667     $self->{line_prev} = $self->{line};
3668     $self->{column_prev} = $self->{column};
3669     $self->{column}++;
3670     $self->{nc}
3671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672     } else {
3673     $self->{set_nc}->($self);
3674     }
3675    
3676 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 wakaba 1.1 redo A;
3678     } elsif ($self->{nc} == -1) {
3679     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680    
3681 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682    
3683     $self->{state} = DATA_STATE;
3684     $self->{s_kwd} = '';
3685     $self->{ct}->{quirks} = 1;
3686     } else {
3687    
3688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689     }
3690    
3691     ## Reconsume.
3692 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3693     redo A;
3694     } else {
3695    
3696 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698     length $self->{ct}->{pubid});
3699    
3700     ## Stay in the state
3701    
3702     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703     $self->{line_prev} = $self->{line};
3704     $self->{column_prev} = $self->{column};
3705     $self->{column}++;
3706     $self->{nc}
3707     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708     } else {
3709     $self->{set_nc}->($self);
3710     }
3711    
3712     redo A;
3713     }
3714     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715     if ($self->{nc} == 0x0027) { # '
3716    
3717     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718    
3719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720     $self->{line_prev} = $self->{line};
3721     $self->{column_prev} = $self->{column};
3722     $self->{column}++;
3723     $self->{nc}
3724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725     } else {
3726     $self->{set_nc}->($self);
3727     }
3728    
3729     redo A;
3730     } elsif ($self->{nc} == 0x003E) { # >
3731     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732    
3733 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734    
3735     $self->{state} = DATA_STATE;
3736     $self->{s_kwd} = '';
3737     $self->{ct}->{quirks} = 1;
3738     } else {
3739    
3740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741     }
3742    
3743 wakaba 1.1
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 wakaba 1.1 redo A;
3756     } elsif ($self->{nc} == -1) {
3757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758    
3759 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760    
3761     $self->{state} = DATA_STATE;
3762     $self->{s_kwd} = '';
3763     $self->{ct}->{quirks} = 1;
3764     } else {
3765    
3766     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767     }
3768    
3769 wakaba 1.1 ## reconsume
3770 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 wakaba 1.1 redo A;
3772     } else {
3773    
3774 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776     length $self->{ct}->{pubid});
3777    
3778     ## Stay in the state
3779    
3780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781     $self->{line_prev} = $self->{line};
3782     $self->{column_prev} = $self->{column};
3783     $self->{column}++;
3784     $self->{nc}
3785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786     } else {
3787     $self->{set_nc}->($self);
3788     }
3789    
3790     redo A;
3791     }
3792     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793     if ($is_space->{$self->{nc}}) {
3794    
3795     ## Stay in the state
3796    
3797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798     $self->{line_prev} = $self->{line};
3799     $self->{column_prev} = $self->{column};
3800     $self->{column}++;
3801     $self->{nc}
3802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803     } else {
3804     $self->{set_nc}->($self);
3805     }
3806    
3807     redo A;
3808     } elsif ($self->{nc} == 0x0022) { # "
3809    
3810 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812    
3813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814     $self->{line_prev} = $self->{line};
3815     $self->{column_prev} = $self->{column};
3816     $self->{column}++;
3817     $self->{nc}
3818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819     } else {
3820     $self->{set_nc}->($self);
3821     }
3822    
3823     redo A;
3824     } elsif ($self->{nc} == 0x0027) { # '
3825    
3826 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828    
3829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830     $self->{line_prev} = $self->{line};
3831     $self->{column_prev} = $self->{column};
3832     $self->{column}++;
3833     $self->{nc}
3834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835     } else {
3836     $self->{set_nc}->($self);
3837     }
3838    
3839     redo A;
3840     } elsif ($self->{nc} == 0x003E) { # >
3841 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842     if ($self->{is_xml}) {
3843    
3844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845     } else {
3846    
3847     }
3848     $self->{state} = DATA_STATE;
3849     $self->{s_kwd} = '';
3850 wakaba 1.12 } else {
3851 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852    
3853     } else {
3854    
3855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856     }
3857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 wakaba 1.12 }
3859 wakaba 1.16
3860 wakaba 1.1
3861     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862     $self->{line_prev} = $self->{line};
3863     $self->{column_prev} = $self->{column};
3864     $self->{column}++;
3865     $self->{nc}
3866     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867     } else {
3868     $self->{set_nc}->($self);
3869     }
3870    
3871 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 wakaba 1.1 redo A;
3873     } elsif ($self->{nc} == -1) {
3874 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875    
3876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877    
3878     $self->{state} = DATA_STATE;
3879     $self->{s_kwd} = '';
3880     $self->{ct}->{quirks} = 1;
3881     } else {
3882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884     }
3885 wakaba 1.1
3886     ## reconsume
3887 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 wakaba 1.1 redo A;
3889 wakaba 1.16 } elsif ($self->{is_xml} and
3890     $self->{ct}->{type} == DOCTYPE_TOKEN and
3891     $self->{nc} == 0x005B) { # [
3892 wakaba 1.12
3893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 wakaba 1.13 $self->{in_subset} = 1;
3897 wakaba 1.12
3898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899     $self->{line_prev} = $self->{line};
3900     $self->{column_prev} = $self->{column};
3901     $self->{column}++;
3902     $self->{nc}
3903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904     } else {
3905     $self->{set_nc}->($self);
3906     }
3907    
3908 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3909 wakaba 1.12 redo A;
3910 wakaba 1.1 } else {
3911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912    
3913 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914    
3915     $self->{ct}->{quirks} = 1;
3916     $self->{state} = BOGUS_DOCTYPE_STATE;
3917     } else {
3918    
3919     $self->{state} = BOGUS_MD_STATE;
3920     }
3921    
3922 wakaba 1.1
3923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924     $self->{line_prev} = $self->{line};
3925     $self->{column_prev} = $self->{column};
3926     $self->{column}++;
3927     $self->{nc}
3928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929     } else {
3930     $self->{set_nc}->($self);
3931     }
3932    
3933     redo A;
3934     }
3935     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936     if ($is_space->{$self->{nc}}) {
3937    
3938     ## Stay in the state
3939    
3940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941     $self->{line_prev} = $self->{line};
3942     $self->{column_prev} = $self->{column};
3943     $self->{column}++;
3944     $self->{nc}
3945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946     } else {
3947     $self->{set_nc}->($self);
3948     }
3949    
3950     redo A;
3951     } elsif ($self->{nc} == 0x0022) { # "
3952    
3953     $self->{ct}->{sysid} = ''; # DOCTYPE
3954     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955    
3956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957     $self->{line_prev} = $self->{line};
3958     $self->{column_prev} = $self->{column};
3959     $self->{column}++;
3960     $self->{nc}
3961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962     } else {
3963     $self->{set_nc}->($self);
3964     }
3965    
3966     redo A;
3967     } elsif ($self->{nc} == 0x0027) { # '
3968    
3969     $self->{ct}->{sysid} = ''; # DOCTYPE
3970     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971    
3972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973     $self->{line_prev} = $self->{line};
3974     $self->{column_prev} = $self->{column};
3975     $self->{column}++;
3976     $self->{nc}
3977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978     } else {
3979     $self->{set_nc}->($self);
3980     }
3981    
3982     redo A;
3983     } elsif ($self->{nc} == 0x003E) { # >
3984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
3986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987     $self->{line_prev} = $self->{line};
3988     $self->{column_prev} = $self->{column};
3989     $self->{column}++;
3990     $self->{nc}
3991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992     } else {
3993     $self->{set_nc}->($self);
3994     }
3995    
3996    
3997 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998    
3999     $self->{state} = DATA_STATE;
4000     $self->{s_kwd} = '';
4001     $self->{ct}->{quirks} = 1;
4002     } else {
4003    
4004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005     }
4006 wakaba 1.1
4007 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 wakaba 1.1 redo A;
4009     } elsif ($self->{nc} == -1) {
4010 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011    
4012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013     $self->{state} = DATA_STATE;
4014     $self->{s_kwd} = '';
4015     $self->{ct}->{quirks} = 1;
4016     } else {
4017    
4018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020     }
4021 wakaba 1.1
4022     ## reconsume
4023 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 wakaba 1.1 redo A;
4025 wakaba 1.16 } elsif ($self->{is_xml} and
4026     $self->{ct}->{type} == DOCTYPE_TOKEN and
4027     $self->{nc} == 0x005B) { # [
4028 wakaba 1.12
4029     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030    
4031     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 wakaba 1.13 $self->{in_subset} = 1;
4034 wakaba 1.12
4035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036     $self->{line_prev} = $self->{line};
4037     $self->{column_prev} = $self->{column};
4038     $self->{column}++;
4039     $self->{nc}
4040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041     } else {
4042     $self->{set_nc}->($self);
4043     }
4044    
4045 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4046 wakaba 1.12 redo A;
4047 wakaba 1.1 } else {
4048     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049    
4050 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051    
4052     $self->{ct}->{quirks} = 1;
4053     $self->{state} = BOGUS_DOCTYPE_STATE;
4054     } else {
4055    
4056     $self->{state} = BOGUS_MD_STATE;
4057     }
4058    
4059 wakaba 1.1
4060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061     $self->{line_prev} = $self->{line};
4062     $self->{column_prev} = $self->{column};
4063     $self->{column}++;
4064     $self->{nc}
4065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066     } else {
4067     $self->{set_nc}->($self);
4068     }
4069    
4070     redo A;
4071     }
4072     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073     if ($self->{nc} == 0x0022) { # "
4074    
4075     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076    
4077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078     $self->{line_prev} = $self->{line};
4079     $self->{column_prev} = $self->{column};
4080     $self->{column}++;
4081     $self->{nc}
4082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083     } else {
4084     $self->{set_nc}->($self);
4085     }
4086    
4087     redo A;
4088 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090    
4091 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092    
4093     $self->{state} = DATA_STATE;
4094     $self->{s_kwd} = '';
4095     $self->{ct}->{quirks} = 1;
4096     } else {
4097    
4098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099     }
4100    
4101 wakaba 1.1
4102     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103     $self->{line_prev} = $self->{line};
4104     $self->{column_prev} = $self->{column};
4105     $self->{column}++;
4106     $self->{nc}
4107     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108     } else {
4109     $self->{set_nc}->($self);
4110     }
4111    
4112 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 wakaba 1.1 redo A;
4114     } elsif ($self->{nc} == -1) {
4115     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116    
4117 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118    
4119     $self->{state} = DATA_STATE;
4120     $self->{s_kwd} = '';
4121     $self->{ct}->{quirks} = 1;
4122     } else {
4123    
4124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125     }
4126    
4127 wakaba 1.1 ## reconsume
4128 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 wakaba 1.1 redo A;
4130     } else {
4131    
4132 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134     length $self->{ct}->{sysid});
4135    
4136     ## Stay in the state
4137    
4138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139     $self->{line_prev} = $self->{line};
4140     $self->{column_prev} = $self->{column};
4141     $self->{column}++;
4142     $self->{nc}
4143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144     } else {
4145     $self->{set_nc}->($self);
4146     }
4147    
4148     redo A;
4149     }
4150     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151     if ($self->{nc} == 0x0027) { # '
4152    
4153     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154    
4155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156     $self->{line_prev} = $self->{line};
4157     $self->{column_prev} = $self->{column};
4158     $self->{column}++;
4159     $self->{nc}
4160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161     } else {
4162     $self->{set_nc}->($self);
4163     }
4164    
4165     redo A;
4166 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167 wakaba 1.1
4168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169    
4170     $self->{state} = DATA_STATE;
4171 wakaba 1.5 $self->{s_kwd} = '';
4172 wakaba 1.1
4173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174     $self->{line_prev} = $self->{line};
4175     $self->{column_prev} = $self->{column};
4176     $self->{column}++;
4177     $self->{nc}
4178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179     } else {
4180     $self->{set_nc}->($self);
4181     }
4182    
4183    
4184     $self->{ct}->{quirks} = 1;
4185     return ($self->{ct}); # DOCTYPE
4186    
4187     redo A;
4188     } elsif ($self->{nc} == -1) {
4189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190    
4191 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192    
4193     $self->{state} = DATA_STATE;
4194     $self->{s_kwd} = '';
4195     $self->{ct}->{quirks} = 1;
4196     } else {
4197    
4198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199     }
4200    
4201 wakaba 1.1 ## reconsume
4202 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 wakaba 1.1 redo A;
4204     } else {
4205    
4206 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208     length $self->{ct}->{sysid});
4209    
4210     ## Stay in the state
4211    
4212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213     $self->{line_prev} = $self->{line};
4214     $self->{column_prev} = $self->{column};
4215     $self->{column}++;
4216     $self->{nc}
4217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218     } else {
4219     $self->{set_nc}->($self);
4220     }
4221    
4222     redo A;
4223     }
4224     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225     if ($is_space->{$self->{nc}}) {
4226 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227    
4228     $self->{state} = BEFORE_NDATA_STATE;
4229     } else {
4230    
4231     ## Stay in the state
4232     }
4233 wakaba 1.1
4234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235     $self->{line_prev} = $self->{line};
4236     $self->{column_prev} = $self->{column};
4237     $self->{column}++;
4238     $self->{nc}
4239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240     } else {
4241     $self->{set_nc}->($self);
4242     }
4243    
4244     redo A;
4245     } elsif ($self->{nc} == 0x003E) { # >
4246 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247    
4248     $self->{state} = DATA_STATE;
4249     $self->{s_kwd} = '';
4250     } else {
4251    
4252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253     }
4254    
4255 wakaba 1.1
4256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257     $self->{line_prev} = $self->{line};
4258     $self->{column_prev} = $self->{column};
4259     $self->{column}++;
4260     $self->{nc}
4261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262     } else {
4263     $self->{set_nc}->($self);
4264     }
4265    
4266 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 wakaba 1.1 redo A;
4268 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269     ($self->{nc} == 0x004E or # N
4270     $self->{nc} == 0x006E)) { # n
4271    
4272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273     $self->{state} = NDATA_STATE;
4274     $self->{kwd} = chr $self->{nc};
4275    
4276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277     $self->{line_prev} = $self->{line};
4278     $self->{column_prev} = $self->{column};
4279     $self->{column}++;
4280     $self->{nc}
4281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282     } else {
4283     $self->{set_nc}->($self);
4284     }
4285    
4286     redo A;
4287 wakaba 1.1 } elsif ($self->{nc} == -1) {
4288 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289    
4290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291     $self->{state} = DATA_STATE;
4292     $self->{s_kwd} = '';
4293     $self->{ct}->{quirks} = 1;
4294     } else {
4295    
4296     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298     }
4299    
4300 wakaba 1.1 ## reconsume
4301 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 wakaba 1.1 redo A;
4303 wakaba 1.16 } elsif ($self->{is_xml} and
4304     $self->{ct}->{type} == DOCTYPE_TOKEN and
4305     $self->{nc} == 0x005B) { # [
4306 wakaba 1.12
4307     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 wakaba 1.13 $self->{in_subset} = 1;
4310 wakaba 1.12
4311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312     $self->{line_prev} = $self->{line};
4313     $self->{column_prev} = $self->{column};
4314     $self->{column}++;
4315     $self->{nc}
4316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317     } else {
4318     $self->{set_nc}->($self);
4319     }
4320    
4321 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4322 wakaba 1.12 redo A;
4323 wakaba 1.1 } else {
4324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325    
4326 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327    
4328     #$self->{ct}->{quirks} = 1;
4329     $self->{state} = BOGUS_DOCTYPE_STATE;
4330     } else {
4331    
4332     $self->{state} = BOGUS_MD_STATE;
4333     }
4334    
4335 wakaba 1.1
4336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337     $self->{line_prev} = $self->{line};
4338     $self->{column_prev} = $self->{column};
4339     $self->{column}++;
4340     $self->{nc}
4341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342     } else {
4343     $self->{set_nc}->($self);
4344     }
4345    
4346     redo A;
4347     }
4348 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349     if ($is_space->{$self->{nc}}) {
4350    
4351     ## Stay in the state.
4352    
4353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354     $self->{line_prev} = $self->{line};
4355     $self->{column_prev} = $self->{column};
4356     $self->{column}++;
4357     $self->{nc}
4358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359     } else {
4360     $self->{set_nc}->($self);
4361     }
4362    
4363     redo A;
4364     } elsif ($self->{nc} == 0x003E) { # >
4365    
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367    
4368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369     $self->{line_prev} = $self->{line};
4370     $self->{column_prev} = $self->{column};
4371     $self->{column}++;
4372     $self->{nc}
4373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374     } else {
4375     $self->{set_nc}->($self);
4376     }
4377    
4378     return ($self->{ct}); # ENTITY
4379     redo A;
4380     } elsif ($self->{nc} == 0x004E or # N
4381     $self->{nc} == 0x006E) { # n
4382    
4383     $self->{state} = NDATA_STATE;
4384     $self->{kwd} = chr $self->{nc};
4385    
4386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387     $self->{line_prev} = $self->{line};
4388     $self->{column_prev} = $self->{column};
4389     $self->{column}++;
4390     $self->{nc}
4391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392     } else {
4393     $self->{set_nc}->($self);
4394     }
4395    
4396     redo A;
4397     } elsif ($self->{nc} == -1) {
4398    
4399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401     ## reconsume
4402     return ($self->{ct}); # ENTITY
4403     redo A;
4404     } else {
4405    
4406     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407     $self->{state} = BOGUS_MD_STATE;
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     redo A;
4420     }
4421 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422     if ($self->{nc} == 0x003E) { # >
4423    
4424     $self->{state} = DATA_STATE;
4425 wakaba 1.5 $self->{s_kwd} = '';
4426 wakaba 1.1
4427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428     $self->{line_prev} = $self->{line};
4429     $self->{column_prev} = $self->{column};
4430     $self->{column}++;
4431     $self->{nc}
4432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433     } else {
4434     $self->{set_nc}->($self);
4435     }
4436    
4437    
4438     return ($self->{ct}); # DOCTYPE
4439    
4440     redo A;
4441 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442 wakaba 1.13
4443     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445     $self->{in_subset} = 1;
4446    
4447 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448     $self->{line_prev} = $self->{line};
4449     $self->{column_prev} = $self->{column};
4450     $self->{column}++;
4451     $self->{nc}
4452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453     } else {
4454     $self->{set_nc}->($self);
4455     }
4456    
4457 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4458     redo A;
4459 wakaba 1.1 } elsif ($self->{nc} == -1) {
4460    
4461     $self->{state} = DATA_STATE;
4462 wakaba 1.5 $self->{s_kwd} = '';
4463 wakaba 1.1 ## reconsume
4464    
4465     return ($self->{ct}); # DOCTYPE
4466    
4467     redo A;
4468     } else {
4469    
4470     my $s = '';
4471 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4472 wakaba 1.1
4473     ## Stay in the state
4474    
4475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476     $self->{line_prev} = $self->{line};
4477     $self->{column_prev} = $self->{column};
4478     $self->{column}++;
4479     $self->{nc}
4480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481     } else {
4482     $self->{set_nc}->($self);
4483     }
4484    
4485     redo A;
4486     }
4487     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488     ## NOTE: "CDATA section state" in the spec is jointly implemented
4489     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490     ## and |CDATA_SECTION_MSE2_STATE|.
4491 wakaba 1.10
4492     ## XML5: "CDATA state".
4493 wakaba 1.1
4494     if ($self->{nc} == 0x005D) { # ]
4495    
4496     $self->{state} = CDATA_SECTION_MSE1_STATE;
4497    
4498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499     $self->{line_prev} = $self->{line};
4500     $self->{column_prev} = $self->{column};
4501     $self->{column}++;
4502     $self->{nc}
4503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504     } else {
4505     $self->{set_nc}->($self);
4506     }
4507    
4508     redo A;
4509     } elsif ($self->{nc} == -1) {
4510 wakaba 1.6 if ($self->{is_xml}) {
4511 wakaba 1.8
4512 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 wakaba 1.8 } else {
4514    
4515 wakaba 1.6 }
4516    
4517 wakaba 1.1 $self->{state} = DATA_STATE;
4518 wakaba 1.5 $self->{s_kwd} = '';
4519 wakaba 1.10 ## Reconsume.
4520 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4521    
4522     return ($self->{ct}); # character
4523     } else {
4524    
4525     ## No token to emit. $self->{ct} is discarded.
4526     }
4527     redo A;
4528     } else {
4529    
4530     $self->{ct}->{data} .= chr $self->{nc};
4531     $self->{read_until}->($self->{ct}->{data},
4532     q<]>,
4533     length $self->{ct}->{data});
4534    
4535     ## Stay in the state.
4536    
4537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538     $self->{line_prev} = $self->{line};
4539     $self->{column_prev} = $self->{column};
4540     $self->{column}++;
4541     $self->{nc}
4542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543     } else {
4544     $self->{set_nc}->($self);
4545     }
4546    
4547     redo A;
4548     }
4549    
4550     ## ISSUE: "text tokens" in spec.
4551     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 wakaba 1.10 ## XML5: "CDATA bracket state".
4553    
4554 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4555    
4556     $self->{state} = CDATA_SECTION_MSE2_STATE;
4557    
4558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559     $self->{line_prev} = $self->{line};
4560     $self->{column_prev} = $self->{column};
4561     $self->{column}++;
4562     $self->{nc}
4563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564     } else {
4565     $self->{set_nc}->($self);
4566     }
4567    
4568     redo A;
4569     } else {
4570    
4571 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4572 wakaba 1.1 $self->{ct}->{data} .= ']';
4573 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 wakaba 1.1 ## Reconsume.
4575     redo A;
4576     }
4577     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 wakaba 1.10 ## XML5: "CDATA end state".
4579    
4580 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4581     $self->{state} = DATA_STATE;
4582 wakaba 1.5 $self->{s_kwd} = '';
4583 wakaba 1.1
4584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585     $self->{line_prev} = $self->{line};
4586     $self->{column_prev} = $self->{column};
4587     $self->{column}++;
4588     $self->{nc}
4589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590     } else {
4591     $self->{set_nc}->($self);
4592     }
4593    
4594     if (length $self->{ct}->{data}) { # character
4595    
4596     return ($self->{ct}); # character
4597     } else {
4598    
4599     ## No token to emit. $self->{ct} is discarded.
4600     }
4601     redo A;
4602     } elsif ($self->{nc} == 0x005D) { # ]
4603     # character
4604     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605     ## Stay in the state.
4606    
4607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608     $self->{line_prev} = $self->{line};
4609     $self->{column_prev} = $self->{column};
4610     $self->{column}++;
4611     $self->{nc}
4612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613     } else {
4614     $self->{set_nc}->($self);
4615     }
4616    
4617     redo A;
4618     } else {
4619    
4620     $self->{ct}->{data} .= ']]'; # character
4621     $self->{state} = CDATA_SECTION_STATE;
4622 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4623 wakaba 1.1 redo A;
4624     }
4625     } elsif ($self->{state} == ENTITY_STATE) {
4626     if ($is_space->{$self->{nc}} or
4627     {
4628     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4629     $self->{entity_add} => 1,
4630     }->{$self->{nc}}) {
4631 wakaba 1.22 if ($self->{is_xml}) {
4632    
4633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4634     line => $self->{line_prev},
4635     column => $self->{column_prev}
4636     + ($self->{nc} == -1 ? 1 : 0));
4637     } else {
4638    
4639     ## No error
4640     }
4641 wakaba 1.1 ## Don't consume
4642     ## Return nothing.
4643     #
4644     } elsif ($self->{nc} == 0x0023) { # #
4645    
4646     $self->{state} = ENTITY_HASH_STATE;
4647 wakaba 1.12 $self->{kwd} = '#';
4648 wakaba 1.1
4649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650     $self->{line_prev} = $self->{line};
4651     $self->{column_prev} = $self->{column};
4652     $self->{column}++;
4653     $self->{nc}
4654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655     } else {
4656     $self->{set_nc}->($self);
4657     }
4658    
4659     redo A;
4660 wakaba 1.22 } elsif ($self->{is_xml} or
4661     (0x0041 <= $self->{nc} and
4662 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4663     (0x0061 <= $self->{nc} and
4664     $self->{nc} <= 0x007A)) { # a..z
4665    
4666     require Whatpm::_NamedEntityList;
4667     $self->{state} = ENTITY_NAME_STATE;
4668 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4669     $self->{entity__value} = $self->{kwd};
4670 wakaba 1.1 $self->{entity__match} = 0;
4671    
4672     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673     $self->{line_prev} = $self->{line};
4674     $self->{column_prev} = $self->{column};
4675     $self->{column}++;
4676     $self->{nc}
4677     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678     } else {
4679     $self->{set_nc}->($self);
4680     }
4681    
4682     redo A;
4683     } else {
4684    
4685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4686     ## Return nothing.
4687     #
4688     }
4689    
4690     ## NOTE: No character is consumed by the "consume a character
4691     ## reference" algorithm. In other words, there is an "&" character
4692     ## that does not introduce a character reference, which will be
4693     ## appended to the parent element or the attribute value in the
4694     ## later processing of the tokenizer.
4695    
4696     if ($self->{prev_state} == DATA_STATE) {
4697    
4698     $self->{state} = $self->{prev_state};
4699 wakaba 1.5 $self->{s_kwd} = '';
4700 wakaba 1.1 ## Reconsume.
4701     return ({type => CHARACTER_TOKEN, data => '&',
4702     line => $self->{line_prev},
4703     column => $self->{column_prev},
4704     });
4705     redo A;
4706     } else {
4707    
4708     $self->{ca}->{value} .= '&';
4709     $self->{state} = $self->{prev_state};
4710 wakaba 1.5 $self->{s_kwd} = '';
4711 wakaba 1.1 ## Reconsume.
4712     redo A;
4713     }
4714     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4715 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4716 wakaba 1.1
4717     $self->{state} = HEXREF_X_STATE;
4718 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4719 wakaba 1.1
4720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4721     $self->{line_prev} = $self->{line};
4722     $self->{column_prev} = $self->{column};
4723     $self->{column}++;
4724     $self->{nc}
4725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4726     } else {
4727     $self->{set_nc}->($self);
4728     }
4729    
4730     redo A;
4731 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4732    
4733     if ($self->{is_xml}) {
4734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4735     }
4736     $self->{state} = HEXREF_X_STATE;
4737     $self->{kwd} .= chr $self->{nc};
4738    
4739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4740     $self->{line_prev} = $self->{line};
4741     $self->{column_prev} = $self->{column};
4742     $self->{column}++;
4743     $self->{nc}
4744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4745     } else {
4746     $self->{set_nc}->($self);
4747     }
4748    
4749     redo A;
4750 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4751     $self->{nc} <= 0x0039) { # 0..9
4752    
4753     $self->{state} = NCR_NUM_STATE;
4754 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4755 wakaba 1.1
4756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4757     $self->{line_prev} = $self->{line};
4758     $self->{column_prev} = $self->{column};
4759     $self->{column}++;
4760     $self->{nc}
4761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4762     } else {
4763     $self->{set_nc}->($self);
4764     }
4765    
4766     redo A;
4767     } else {
4768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4769     line => $self->{line_prev},
4770     column => $self->{column_prev} - 1);
4771    
4772     ## NOTE: According to the spec algorithm, nothing is returned,
4773     ## and then "&#" is appended to the parent element or the attribute
4774     ## value in the later processing.
4775    
4776     if ($self->{prev_state} == DATA_STATE) {
4777    
4778     $self->{state} = $self->{prev_state};
4779 wakaba 1.5 $self->{s_kwd} = '';
4780 wakaba 1.1 ## Reconsume.
4781     return ({type => CHARACTER_TOKEN,
4782     data => '&#',
4783     line => $self->{line_prev},
4784     column => $self->{column_prev} - 1,
4785     });
4786     redo A;
4787     } else {
4788    
4789     $self->{ca}->{value} .= '&#';
4790     $self->{state} = $self->{prev_state};
4791 wakaba 1.5 $self->{s_kwd} = '';
4792 wakaba 1.1 ## Reconsume.
4793     redo A;
4794     }
4795     }
4796     } elsif ($self->{state} == NCR_NUM_STATE) {
4797     if (0x0030 <= $self->{nc} and
4798     $self->{nc} <= 0x0039) { # 0..9
4799    
4800 wakaba 1.12 $self->{kwd} *= 10;
4801     $self->{kwd} += $self->{nc} - 0x0030;
4802 wakaba 1.1
4803     ## Stay in the state.
4804    
4805     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806     $self->{line_prev} = $self->{line};
4807     $self->{column_prev} = $self->{column};
4808     $self->{column}++;
4809     $self->{nc}
4810     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811     } else {
4812     $self->{set_nc}->($self);
4813     }
4814    
4815     redo A;
4816     } elsif ($self->{nc} == 0x003B) { # ;
4817    
4818    
4819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4820     $self->{line_prev} = $self->{line};
4821     $self->{column_prev} = $self->{column};
4822     $self->{column}++;
4823     $self->{nc}
4824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4825     } else {
4826     $self->{set_nc}->($self);
4827     }
4828    
4829     #
4830     } else {
4831    
4832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4833     ## Reconsume.
4834     #
4835     }
4836    
4837 wakaba 1.12 my $code = $self->{kwd};
4838 wakaba 1.1 my $l = $self->{line_prev};
4839     my $c = $self->{column_prev};
4840     if ($charref_map->{$code}) {
4841    
4842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4843     text => (sprintf 'U+%04X', $code),
4844     line => $l, column => $c);
4845     $code = $charref_map->{$code};
4846     } elsif ($code > 0x10FFFF) {
4847    
4848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4849     text => (sprintf 'U-%08X', $code),
4850     line => $l, column => $c);
4851     $code = 0xFFFD;
4852     }
4853    
4854     if ($self->{prev_state} == DATA_STATE) {
4855    
4856     $self->{state} = $self->{prev_state};
4857 wakaba 1.5 $self->{s_kwd} = '';
4858 wakaba 1.1 ## Reconsume.
4859     return ({type => CHARACTER_TOKEN, data => chr $code,
4860 wakaba 1.7 has_reference => 1,
4861 wakaba 1.1 line => $l, column => $c,
4862     });
4863     redo A;
4864     } else {
4865    
4866     $self->{ca}->{value} .= chr $code;
4867     $self->{ca}->{has_reference} = 1;
4868     $self->{state} = $self->{prev_state};
4869 wakaba 1.5 $self->{s_kwd} = '';
4870 wakaba 1.1 ## Reconsume.
4871     redo A;
4872     }
4873     } elsif ($self->{state} == HEXREF_X_STATE) {
4874     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4875     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4876     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4877     # 0..9, A..F, a..f
4878    
4879     $self->{state} = HEXREF_HEX_STATE;
4880 wakaba 1.12 $self->{kwd} = 0;
4881 wakaba 1.1 ## Reconsume.
4882     redo A;
4883     } else {
4884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4885     line => $self->{line_prev},
4886     column => $self->{column_prev} - 2);
4887    
4888     ## NOTE: According to the spec algorithm, nothing is returned,
4889     ## and then "&#" followed by "X" or "x" is appended to the parent
4890     ## element or the attribute value in the later processing.
4891    
4892     if ($self->{prev_state} == DATA_STATE) {
4893    
4894     $self->{state} = $self->{prev_state};
4895 wakaba 1.5 $self->{s_kwd} = '';
4896 wakaba 1.1 ## Reconsume.
4897     return ({type => CHARACTER_TOKEN,
4898 wakaba 1.12 data => '&' . $self->{kwd},
4899 wakaba 1.1 line => $self->{line_prev},
4900 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4901 wakaba 1.1 });
4902     redo A;
4903     } else {
4904    
4905 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4906 wakaba 1.1 $self->{state} = $self->{prev_state};
4907 wakaba 1.5 $self->{s_kwd} = '';
4908 wakaba 1.1 ## Reconsume.
4909     redo A;
4910     }
4911     }
4912     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4913     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4914     # 0..9
4915    
4916 wakaba 1.12 $self->{kwd} *= 0x10;
4917     $self->{kwd} += $self->{nc} - 0x0030;
4918 wakaba 1.1 ## Stay in the state.
4919    
4920     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4921     $self->{line_prev} = $self->{line};
4922     $self->{column_prev} = $self->{column};
4923     $self->{column}++;
4924     $self->{nc}
4925     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4926     } else {
4927     $self->{set_nc}->($self);
4928     }
4929    
4930     redo A;
4931     } elsif (0x0061 <= $self->{nc} and
4932     $self->{nc} <= 0x0066) { # a..f
4933    
4934 wakaba 1.12 $self->{kwd} *= 0x10;
4935     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4936 wakaba 1.1 ## Stay in the state.
4937    
4938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4939     $self->{line_prev} = $self->{line};
4940     $self->{column_prev} = $self->{column};
4941     $self->{column}++;
4942     $self->{nc}
4943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4944     } else {
4945     $self->{set_nc}->($self);
4946     }
4947    
4948     redo A;
4949     } elsif (0x0041 <= $self->{nc} and
4950     $self->{nc} <= 0x0046) { # A..F
4951    
4952 wakaba 1.12 $self->{kwd} *= 0x10;
4953     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4954 wakaba 1.1 ## Stay in the state.
4955    
4956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957     $self->{line_prev} = $self->{line};
4958     $self->{column_prev} = $self->{column};
4959     $self->{column}++;
4960     $self->{nc}
4961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962     } else {
4963     $self->{set_nc}->($self);
4964     }
4965    
4966     redo A;
4967     } elsif ($self->{nc} == 0x003B) { # ;
4968    
4969    
4970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4971     $self->{line_prev} = $self->{line};
4972     $self->{column_prev} = $self->{column};
4973     $self->{column}++;
4974     $self->{nc}
4975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4976     } else {
4977     $self->{set_nc}->($self);
4978     }
4979    
4980     #
4981     } else {
4982    
4983     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4984     line => $self->{line},
4985     column => $self->{column});
4986     ## Reconsume.
4987     #
4988     }
4989    
4990 wakaba 1.12 my $code = $self->{kwd};
4991 wakaba 1.1 my $l = $self->{line_prev};
4992     my $c = $self->{column_prev};
4993     if ($charref_map->{$code}) {
4994    
4995     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4996     text => (sprintf 'U+%04X', $code),
4997     line => $l, column => $c);
4998     $code = $charref_map->{$code};
4999     } elsif ($code > 0x10FFFF) {
5000    
5001     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5002     text => (sprintf 'U-%08X', $code),
5003     line => $l, column => $c);
5004     $code = 0xFFFD;
5005     }
5006    
5007     if ($self->{prev_state} == DATA_STATE) {
5008    
5009     $self->{state} = $self->{prev_state};
5010 wakaba 1.5 $self->{s_kwd} = '';
5011 wakaba 1.1 ## Reconsume.
5012     return ({type => CHARACTER_TOKEN, data => chr $code,
5013 wakaba 1.7 has_reference => 1,
5014 wakaba 1.1 line => $l, column => $c,
5015     });
5016     redo A;
5017     } else {
5018    
5019     $self->{ca}->{value} .= chr $code;
5020     $self->{ca}->{has_reference} = 1;
5021     $self->{state} = $self->{prev_state};
5022 wakaba 1.5 $self->{s_kwd} = '';
5023 wakaba 1.1 ## Reconsume.
5024     redo A;
5025     }
5026     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5027 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5028     $self->{nc} <= 0x005A) or # Z
5029     (0x0061 <= $self->{nc} and # a
5030     $self->{nc} <= 0x007A) or # z
5031     (0x0030 <= $self->{nc} and # 0
5032     $self->{nc} <= 0x0039) or # 9
5033 wakaba 1.22 $self->{nc} == 0x003B or # ;
5034     ($self->{is_xml} and
5035     not ($is_space->{$self->{nc}} or
5036     {
5037     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5038     $self->{entity_add} => 1,
5039     }->{$self->{nc}}))) {
5040 wakaba 1.1 our $EntityChar;
5041 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5042 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5043     $self->{ge}->{$self->{kwd}}) {
5044 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5045 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5046     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5047    
5048     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5049     } else {
5050     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5051    
5052     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5053     value => $self->{kwd});
5054     } else {
5055    
5056     }
5057     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5058     }
5059     } else {
5060     if ($self->{is_xml}) {
5061    
5062     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5063     value => $self->{kwd},
5064     level => {
5065     'amp;' => $self->{level}->{warn},
5066     'quot;' => $self->{level}->{warn},
5067     'lt;' => $self->{level}->{warn},
5068     'gt;' => $self->{level}->{warn},
5069     'apos;' => $self->{level}->{warn},
5070     }->{$self->{kwd}} ||
5071     $self->{level}->{must});
5072     } else {
5073    
5074     }
5075     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5076     }
5077 wakaba 1.1 $self->{entity__match} = 1;
5078    
5079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5080     $self->{line_prev} = $self->{line};
5081     $self->{column_prev} = $self->{column};
5082     $self->{column}++;
5083     $self->{nc}
5084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5085     } else {
5086     $self->{set_nc}->($self);
5087     }
5088    
5089     #
5090     } else {
5091    
5092 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5093 wakaba 1.1 $self->{entity__match} = -1;
5094     ## Stay in the state.
5095    
5096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5097     $self->{line_prev} = $self->{line};
5098     $self->{column_prev} = $self->{column};
5099     $self->{column}++;
5100     $self->{nc}
5101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5102     } else {
5103     $self->{set_nc}->($self);
5104     }
5105    
5106     redo A;
5107     }
5108     } else {
5109    
5110     $self->{entity__value} .= chr $self->{nc};
5111     $self->{entity__match} *= 2;
5112     ## Stay in the state.
5113    
5114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5115     $self->{line_prev} = $self->{line};
5116     $self->{column_prev} = $self->{column};
5117     $self->{column}++;
5118     $self->{nc}
5119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5120     } else {
5121     $self->{set_nc}->($self);
5122     }
5123    
5124     redo A;
5125     }
5126     }
5127    
5128     my $data;
5129     my $has_ref;
5130     if ($self->{entity__match} > 0) {
5131    
5132     $data = $self->{entity__value};
5133     $has_ref = 1;
5134     #
5135     } elsif ($self->{entity__match} < 0) {
5136     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5137     if ($self->{prev_state} != DATA_STATE and # in attribute
5138     $self->{entity__match} < -1) {
5139    
5140 wakaba 1.12 $data = '&' . $self->{kwd};
5141 wakaba 1.1 #
5142     } else {
5143    
5144     $data = $self->{entity__value};
5145     $has_ref = 1;
5146     #
5147     }
5148     } else {
5149    
5150     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5151     line => $self->{line_prev},
5152 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5153     $data = '&' . $self->{kwd};
5154 wakaba 1.1 #
5155     }
5156    
5157     ## NOTE: In these cases, when a character reference is found,
5158     ## it is consumed and a character token is returned, or, otherwise,
5159     ## nothing is consumed and returned, according to the spec algorithm.
5160     ## In this implementation, anything that has been examined by the
5161     ## tokenizer is appended to the parent element or the attribute value
5162     ## as string, either literal string when no character reference or
5163     ## entity-replaced string otherwise, in this stage, since any characters
5164     ## that would not be consumed are appended in the data state or in an
5165     ## appropriate attribute value state anyway.
5166    
5167     if ($self->{prev_state} == DATA_STATE) {
5168    
5169     $self->{state} = $self->{prev_state};
5170 wakaba 1.5 $self->{s_kwd} = '';
5171 wakaba 1.1 ## Reconsume.
5172     return ({type => CHARACTER_TOKEN,
5173     data => $data,
5174 wakaba 1.7 has_reference => $has_ref,
5175 wakaba 1.1 line => $self->{line_prev},
5176 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5177 wakaba 1.1 });
5178     redo A;
5179     } else {
5180    
5181     $self->{ca}->{value} .= $data;
5182     $self->{ca}->{has_reference} = 1 if $has_ref;
5183     $self->{state} = $self->{prev_state};
5184 wakaba 1.5 $self->{s_kwd} = '';
5185 wakaba 1.1 ## Reconsume.
5186     redo A;
5187     }
5188 wakaba 1.8
5189     ## XML-only states
5190    
5191     } elsif ($self->{state} == PI_STATE) {
5192 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5193    
5194 wakaba 1.8 if ($is_space->{$self->{nc}} or
5195 wakaba 1.14 $self->{nc} == 0x003F or # ?
5196 wakaba 1.8 $self->{nc} == -1) {
5197 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5198     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5199     ## "DOCTYPE pi state": Parse error, switch to the "data
5200     ## state".
5201 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5202     line => $self->{line_prev},
5203     column => $self->{column_prev}
5204     - 1 * ($self->{nc} != -1));
5205     $self->{state} = BOGUS_COMMENT_STATE;
5206     ## Reconsume.
5207     $self->{ct} = {type => COMMENT_TOKEN,
5208     data => '?',
5209     line => $self->{line_prev},
5210     column => $self->{column_prev}
5211     - 1 * ($self->{nc} != -1),
5212     };
5213     redo A;
5214     } else {
5215 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5216 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5217     target => chr $self->{nc},
5218     data => '',
5219     line => $self->{line_prev},
5220     column => $self->{column_prev} - 1,
5221     };
5222     $self->{state} = PI_TARGET_STATE;
5223    
5224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5225     $self->{line_prev} = $self->{line};
5226     $self->{column_prev} = $self->{column};
5227     $self->{column}++;
5228     $self->{nc}
5229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5230     } else {
5231     $self->{set_nc}->($self);
5232     }
5233    
5234     redo A;
5235     }
5236     } elsif ($self->{state} == PI_TARGET_STATE) {
5237     if ($is_space->{$self->{nc}}) {
5238     $self->{state} = PI_TARGET_AFTER_STATE;
5239    
5240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5241     $self->{line_prev} = $self->{line};
5242     $self->{column_prev} = $self->{column};
5243     $self->{column}++;
5244     $self->{nc}
5245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5246     } else {
5247     $self->{set_nc}->($self);
5248     }
5249    
5250     redo A;
5251     } elsif ($self->{nc} == -1) {
5252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5253 wakaba 1.13 if ($self->{in_subset}) {
5254     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5255     } else {
5256     $self->{state} = DATA_STATE;
5257     $self->{s_kwd} = '';
5258     }
5259 wakaba 1.8 ## Reconsume.
5260     return ($self->{ct}); # pi
5261     redo A;
5262     } elsif ($self->{nc} == 0x003F) { # ?
5263     $self->{state} = PI_AFTER_STATE;
5264    
5265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5266     $self->{line_prev} = $self->{line};
5267     $self->{column_prev} = $self->{column};
5268     $self->{column}++;
5269     $self->{nc}
5270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5271     } else {
5272     $self->{set_nc}->($self);
5273     }
5274    
5275     redo A;
5276     } else {
5277     ## XML5: typo ("tag name" -> "target")
5278     $self->{ct}->{target} .= chr $self->{nc}; # pi
5279    
5280     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281     $self->{line_prev} = $self->{line};
5282     $self->{column_prev} = $self->{column};
5283     $self->{column}++;
5284     $self->{nc}
5285     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286     } else {
5287     $self->{set_nc}->($self);
5288     }
5289    
5290     redo A;
5291     }
5292     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5293     if ($is_space->{$self->{nc}}) {
5294     ## Stay in the state.
5295    
5296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5297     $self->{line_prev} = $self->{line};
5298     $self->{column_prev} = $self->{column};
5299     $self->{column}++;
5300     $self->{nc}
5301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5302     } else {
5303     $self->{set_nc}->($self);
5304     }
5305    
5306     redo A;
5307     } else {
5308     $self->{state} = PI_DATA_STATE;
5309     ## Reprocess.
5310     redo A;
5311     }
5312     } elsif ($self->{state} == PI_DATA_STATE) {
5313     if ($self->{nc} == 0x003F) { # ?
5314     $self->{state} = PI_DATA_AFTER_STATE;
5315    
5316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5317     $self->{line_prev} = $self->{line};
5318     $self->{column_prev} = $self->{column};
5319     $self->{column}++;
5320     $self->{nc}
5321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5322     } else {
5323     $self->{set_nc}->($self);
5324     }
5325    
5326     redo A;
5327     } elsif ($self->{nc} == -1) {
5328     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5329 wakaba 1.13 if ($self->{in_subset}) {
5330 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5331 wakaba 1.13 } else {
5332     $self->{state} = DATA_STATE;
5333     $self->{s_kwd} = '';
5334     }
5335 wakaba 1.8 ## Reprocess.
5336     return ($self->{ct}); # pi
5337     redo A;
5338     } else {
5339     $self->{ct}->{data} .= chr $self->{nc}; # pi
5340     $self->{read_until}->($self->{ct}->{data}, q[?],
5341     length $self->{ct}->{data});
5342     ## Stay in the state.
5343    
5344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5345     $self->{line_prev} = $self->{line};
5346     $self->{column_prev} = $self->{column};
5347     $self->{column}++;
5348     $self->{nc}
5349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5350     } else {
5351     $self->{set_nc}->($self);
5352     }
5353    
5354     ## Reprocess.
5355     redo A;
5356     }
5357     } elsif ($self->{state} == PI_AFTER_STATE) {
5358 wakaba 1.14 ## XML5: Part of "Pi after state".
5359    
5360 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5361 wakaba 1.13 if ($self->{in_subset}) {
5362     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5363     } else {
5364     $self->{state} = DATA_STATE;
5365     $self->{s_kwd} = '';
5366     }
5367 wakaba 1.8
5368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5369     $self->{line_prev} = $self->{line};
5370     $self->{column_prev} = $self->{column};
5371     $self->{column}++;
5372     $self->{nc}
5373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5374     } else {
5375     $self->{set_nc}->($self);
5376     }
5377    
5378     return ($self->{ct}); # pi
5379     redo A;
5380     } elsif ($self->{nc} == 0x003F) { # ?
5381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5382     line => $self->{line_prev},
5383     column => $self->{column_prev}); ## XML5: no error
5384     $self->{ct}->{data} .= '?';
5385     $self->{state} = PI_DATA_AFTER_STATE;
5386    
5387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5388     $self->{line_prev} = $self->{line};
5389     $self->{column_prev} = $self->{column};
5390     $self->{column}++;
5391     $self->{nc}
5392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5393     } else {
5394     $self->{set_nc}->($self);
5395     }
5396    
5397     redo A;
5398     } else {
5399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5400     line => $self->{line_prev},
5401     column => $self->{column_prev}
5402     + 1 * ($self->{nc} == -1)); ## XML5: no error
5403     $self->{ct}->{data} .= '?'; ## XML5: not appended
5404     $self->{state} = PI_DATA_STATE;
5405     ## Reprocess.
5406     redo A;
5407     }
5408     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5409 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5410    
5411 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5412 wakaba 1.13 if ($self->{in_subset}) {
5413     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5414     } else {
5415     $self->{state} = DATA_STATE;
5416     $self->{s_kwd} = '';
5417     }
5418 wakaba 1.8
5419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5420     $self->{line_prev} = $self->{line};
5421     $self->{column_prev} = $self->{column};
5422     $self->{column}++;
5423     $self->{nc}
5424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5425     } else {
5426     $self->{set_nc}->($self);
5427     }
5428    
5429     return ($self->{ct}); # pi
5430     redo A;
5431     } elsif ($self->{nc} == 0x003F) { # ?
5432     $self->{ct}->{data} .= '?';
5433     ## Stay in the state.
5434    
5435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436     $self->{line_prev} = $self->{line};
5437     $self->{column_prev} = $self->{column};
5438     $self->{column}++;
5439     $self->{nc}
5440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441     } else {
5442     $self->{set_nc}->($self);
5443     }
5444    
5445     redo A;
5446     } else {
5447     $self->{ct}->{data} .= '?'; ## XML5: not appended
5448     $self->{state} = PI_DATA_STATE;
5449     ## Reprocess.
5450     redo A;
5451     }
5452 wakaba 1.12
5453     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5454     if ($self->{nc} == 0x003C) { # <
5455 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5456 wakaba 1.12
5457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458     $self->{line_prev} = $self->{line};
5459     $self->{column_prev} = $self->{column};
5460     $self->{column}++;
5461     $self->{nc}
5462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463     } else {
5464     $self->{set_nc}->($self);
5465     }
5466    
5467     redo A;
5468     } elsif ($self->{nc} == 0x0025) { # %
5469     ## XML5: Not defined yet.
5470    
5471     ## TODO:
5472    
5473     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474     $self->{line_prev} = $self->{line};
5475     $self->{column_prev} = $self->{column};
5476     $self->{column}++;
5477     $self->{nc}
5478     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479     } else {
5480     $self->{set_nc}->($self);
5481     }
5482    
5483     redo A;
5484     } elsif ($self->{nc} == 0x005D) { # ]
5485 wakaba 1.13 delete $self->{in_subset};
5486 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5487    
5488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5489     $self->{line_prev} = $self->{line};
5490     $self->{column_prev} = $self->{column};
5491     $self->{column}++;
5492     $self->{nc}
5493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5494     } else {
5495     $self->{set_nc}->($self);
5496     }
5497    
5498     redo A;
5499     } elsif ($is_space->{$self->{nc}}) {
5500     ## Stay in the state.
5501    
5502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503     $self->{line_prev} = $self->{line};
5504     $self->{column_prev} = $self->{column};
5505     $self->{column}++;
5506     $self->{nc}
5507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508     } else {
5509     $self->{set_nc}->($self);
5510     }
5511    
5512     redo A;
5513     } elsif ($self->{nc} == -1) {
5514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5515 wakaba 1.13 delete $self->{in_subset};
5516 wakaba 1.12 $self->{state} = DATA_STATE;
5517     $self->{s_kwd} = '';
5518     ## Reconsume.
5519 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5520 wakaba 1.12 redo A;
5521     } else {
5522     unless ($self->{internal_subset_tainted}) {
5523     ## XML5: No parse error.
5524     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5525     $self->{internal_subset_tainted} = 1;
5526     }
5527     ## Stay in the state.
5528    
5529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5530     $self->{line_prev} = $self->{line};
5531     $self->{column_prev} = $self->{column};
5532     $self->{column}++;
5533     $self->{nc}
5534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5535     } else {
5536     $self->{set_nc}->($self);
5537     }
5538    
5539     redo A;
5540     }
5541     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5542     if ($self->{nc} == 0x003E) { # >
5543     $self->{state} = DATA_STATE;
5544     $self->{s_kwd} = '';
5545    
5546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5547     $self->{line_prev} = $self->{line};
5548     $self->{column_prev} = $self->{column};
5549     $self->{column}++;
5550     $self->{nc}
5551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5552     } else {
5553     $self->{set_nc}->($self);
5554     }
5555    
5556 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5557 wakaba 1.12 redo A;
5558     } elsif ($self->{nc} == -1) {
5559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5560     $self->{state} = DATA_STATE;
5561     $self->{s_kwd} = '';
5562     ## Reconsume.
5563 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5564 wakaba 1.12 redo A;
5565     } else {
5566     ## XML5: No parse error and stay in the state.
5567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5568    
5569 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5570    
5571     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5572     $self->{line_prev} = $self->{line};
5573     $self->{column_prev} = $self->{column};
5574     $self->{column}++;
5575     $self->{nc}
5576     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5577     } else {
5578     $self->{set_nc}->($self);
5579     }
5580    
5581     redo A;
5582     }
5583     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5584     if ($self->{nc} == 0x003E) { # >
5585     $self->{state} = DATA_STATE;
5586     $self->{s_kwd} = '';
5587    
5588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5589     $self->{line_prev} = $self->{line};
5590     $self->{column_prev} = $self->{column};
5591     $self->{column}++;
5592     $self->{nc}
5593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5594     } else {
5595     $self->{set_nc}->($self);
5596     }
5597    
5598     return ({type => END_OF_DOCTYPE_TOKEN});
5599     redo A;
5600     } elsif ($self->{nc} == -1) {
5601     $self->{state} = DATA_STATE;
5602     $self->{s_kwd} = '';
5603     ## Reconsume.
5604     return ({type => END_OF_DOCTYPE_TOKEN});
5605     redo A;
5606     } else {
5607     ## Stay in the state.
5608    
5609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5610     $self->{line_prev} = $self->{line};
5611     $self->{column_prev} = $self->{column};
5612     $self->{column}++;
5613     $self->{nc}
5614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5615     } else {
5616     $self->{set_nc}->($self);
5617     }
5618    
5619     redo A;
5620     }
5621     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5622     if ($self->{nc} == 0x0021) { # !
5623 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5624 wakaba 1.13
5625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5626     $self->{line_prev} = $self->{line};
5627     $self->{column_prev} = $self->{column};
5628     $self->{column}++;
5629     $self->{nc}
5630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5631     } else {
5632     $self->{set_nc}->($self);
5633     }
5634    
5635     redo A;
5636     } elsif ($self->{nc} == 0x003F) { # ?
5637     $self->{state} = PI_STATE;
5638    
5639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5640     $self->{line_prev} = $self->{line};
5641     $self->{column_prev} = $self->{column};
5642     $self->{column}++;
5643     $self->{nc}
5644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5645     } else {
5646     $self->{set_nc}->($self);
5647     }
5648    
5649     redo A;
5650     } elsif ($self->{nc} == -1) {
5651     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5652     $self->{state} = DATA_STATE;
5653     $self->{s_kwd} = '';
5654     ## Reconsume.
5655     redo A;
5656     } else {
5657     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5658     line => $self->{line_prev},
5659     column => $self->{column_prev});
5660     $self->{state} = BOGUS_COMMENT_STATE;
5661     $self->{ct} = {type => COMMENT_TOKEN,
5662     data => '',
5663     }; ## NOTE: Will be discarded.
5664 wakaba 1.12
5665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5666     $self->{line_prev} = $self->{line};
5667     $self->{column_prev} = $self->{column};
5668     $self->{column}++;
5669     $self->{nc}
5670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5671     } else {
5672     $self->{set_nc}->($self);
5673     }
5674    
5675     redo A;
5676     }
5677 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5678     ## XML5: "DOCTYPE markup declaration state".
5679    
5680     if ($self->{nc} == 0x002D) { # -
5681     $self->{state} = MD_HYPHEN_STATE;
5682    
5683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5684     $self->{line_prev} = $self->{line};
5685     $self->{column_prev} = $self->{column};
5686     $self->{column}++;
5687     $self->{nc}
5688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5689     } else {
5690     $self->{set_nc}->($self);
5691     }
5692    
5693     redo A;
5694 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5695     $self->{nc} == 0x0065) { # e
5696 wakaba 1.14 $self->{state} = MD_E_STATE;
5697     $self->{kwd} = chr $self->{nc};
5698    
5699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5700     $self->{line_prev} = $self->{line};
5701     $self->{column_prev} = $self->{column};
5702     $self->{column}++;
5703     $self->{nc}
5704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5705     } else {
5706     $self->{set_nc}->($self);
5707     }
5708    
5709     redo A;
5710 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5711     $self->{nc} == 0x0061) { # a
5712 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5713     $self->{kwd} = chr $self->{nc};
5714    
5715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5716     $self->{line_prev} = $self->{line};
5717     $self->{column_prev} = $self->{column};
5718     $self->{column}++;
5719     $self->{nc}
5720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5721     } else {
5722     $self->{set_nc}->($self);
5723     }
5724    
5725     redo A;
5726 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5727     $self->{nc} == 0x006E) { # n
5728 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5729     $self->{kwd} = chr $self->{nc};
5730    
5731     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5732     $self->{line_prev} = $self->{line};
5733     $self->{column_prev} = $self->{column};
5734     $self->{column}++;
5735     $self->{nc}
5736     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5737     } else {
5738     $self->{set_nc}->($self);
5739     }
5740    
5741     redo A;
5742     } else {
5743     #
5744     }
5745    
5746     ## XML5: No parse error.
5747     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5748     line => $self->{line_prev},
5749     column => $self->{column_prev} - 1);
5750     ## Reconsume.
5751     $self->{state} = BOGUS_COMMENT_STATE;
5752     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5753     redo A;
5754     } elsif ($self->{state} == MD_E_STATE) {
5755 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5756     $self->{nc} == 0x006E) { # n
5757 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5758     $self->{kwd} .= chr $self->{nc};
5759    
5760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5761     $self->{line_prev} = $self->{line};
5762     $self->{column_prev} = $self->{column};
5763     $self->{column}++;
5764     $self->{nc}
5765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5766     } else {
5767     $self->{set_nc}->($self);
5768     }
5769    
5770     redo A;
5771 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5772     $self->{nc} == 0x006C) { # l
5773 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5774     $self->{state} = MD_ELEMENT_STATE;
5775     $self->{kwd} .= chr $self->{nc};
5776    
5777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778     $self->{line_prev} = $self->{line};
5779     $self->{column_prev} = $self->{column};
5780     $self->{column}++;
5781     $self->{nc}
5782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783     } else {
5784     $self->{set_nc}->($self);
5785     }
5786    
5787     redo A;
5788     } else {
5789     ## XML5: No parse error.
5790     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5791     line => $self->{line_prev},
5792     column => $self->{column_prev} - 2
5793     + 1 * ($self->{nc} == -1));
5794     ## Reconsume.
5795     $self->{state} = BOGUS_COMMENT_STATE;
5796     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5797     redo A;
5798     }
5799     } elsif ($self->{state} == MD_ENTITY_STATE) {
5800 wakaba 1.17 if ($self->{nc} == [
5801     undef,
5802     undef,
5803     0x0054, # T
5804     0x0049, # I
5805     0x0054, # T
5806     ]->[length $self->{kwd}] or
5807     $self->{nc} == [
5808     undef,
5809     undef,
5810     0x0074, # t
5811     0x0069, # i
5812     0x0074, # t
5813     ]->[length $self->{kwd}]) {
5814 wakaba 1.14 ## Stay in the state.
5815     $self->{kwd} .= chr $self->{nc};
5816    
5817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5818     $self->{line_prev} = $self->{line};
5819     $self->{column_prev} = $self->{column};
5820     $self->{column}++;
5821     $self->{nc}
5822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5823     } else {
5824     $self->{set_nc}->($self);
5825     }
5826    
5827     redo A;
5828 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5829     ($self->{nc} == 0x0059 or # Y
5830     $self->{nc} == 0x0079)) { # y
5831     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5833     text => 'ENTITY',
5834     line => $self->{line_prev},
5835     column => $self->{column_prev} - 4);
5836     }
5837     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5838 wakaba 1.14 line => $self->{line_prev},
5839     column => $self->{column_prev} - 6};
5840     $self->{state} = DOCTYPE_MD_STATE;
5841    
5842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5843     $self->{line_prev} = $self->{line};
5844     $self->{column_prev} = $self->{column};
5845     $self->{column}++;
5846     $self->{nc}
5847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5848     } else {
5849     $self->{set_nc}->($self);
5850     }
5851    
5852     redo A;
5853     } else {
5854     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5855     line => $self->{line_prev},
5856     column => $self->{column_prev} - 1
5857     - (length $self->{kwd})
5858     + 1 * ($self->{nc} == -1));
5859     $self->{state} = BOGUS_COMMENT_STATE;
5860     ## Reconsume.
5861     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5862     redo A;
5863     }
5864     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5865 wakaba 1.17 if ($self->{nc} == [
5866     undef,
5867     undef,
5868     0x0045, # E
5869     0x004D, # M
5870     0x0045, # E
5871     0x004E, # N
5872     ]->[length $self->{kwd}] or
5873     $self->{nc} == [
5874     undef,
5875     undef,
5876     0x0065, # e
5877     0x006D, # m
5878     0x0065, # e
5879     0x006E, # n
5880     ]->[length $self->{kwd}]) {
5881 wakaba 1.14 ## Stay in the state.
5882     $self->{kwd} .= chr $self->{nc};
5883    
5884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885     $self->{line_prev} = $self->{line};
5886     $self->{column_prev} = $self->{column};
5887     $self->{column}++;
5888     $self->{nc}
5889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890     } else {
5891     $self->{set_nc}->($self);
5892     }
5893    
5894     redo A;
5895 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5896     ($self->{nc} == 0x0054 or # T
5897     $self->{nc} == 0x0074)) { # t
5898     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5899     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5900     text => 'ELEMENT',
5901     line => $self->{line_prev},
5902     column => $self->{column_prev} - 5);
5903     }
5904 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '', ## |name| is appended to by the md name states.
5905     line => $self->{line_prev},
5906     column => $self->{column_prev} - 7}; ## Column of "<": 1 + length of |kwd| ("ELEMEN"), the same origin as the "bogus comment" error of this state.
5907     $self->{state} = DOCTYPE_MD_STATE;
5908    
5909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5910     $self->{line_prev} = $self->{line};
5911     $self->{column_prev} = $self->{column};
5912     $self->{column}++;
5913     $self->{nc}
5914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5915     } else {
5916     $self->{set_nc}->($self);
5917     }
5918    
5919     redo A;
5920     } else {
5921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5922     line => $self->{line_prev},
5923     column => $self->{column_prev} - 1
5924     - (length $self->{kwd})
5925     + 1 * ($self->{nc} == -1));
5926     $self->{state} = BOGUS_COMMENT_STATE;
5927     ## Reconsume.
5928     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5929     redo A;
5930     }
5931     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5932 wakaba 1.17 if ($self->{nc} == [
5933     undef,
5934     0x0054, # T
5935     0x0054, # T
5936     0x004C, # L
5937     0x0049, # I
5938     0x0053, # S
5939     ]->[length $self->{kwd}] or
5940     $self->{nc} == [
5941     undef,
5942     0x0074, # t
5943     0x0074, # t
5944     0x006C, # l
5945     0x0069, # i
5946     0x0073, # s
5947     ]->[length $self->{kwd}]) {
5948 wakaba 1.14 ## Stay in the state.
5949     $self->{kwd} .= chr $self->{nc};
5950    
5951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5952     $self->{line_prev} = $self->{line};
5953     $self->{column_prev} = $self->{column};
5954     $self->{column}++;
5955     $self->{nc}
5956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5957     } else {
5958     $self->{set_nc}->($self);
5959     }
5960    
5961     redo A;
5962 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5963     ($self->{nc} == 0x0054 or # T
5964     $self->{nc} == 0x0074)) { # t
5965     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5966     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5967     text => 'ATTLIST',
5968     line => $self->{line_prev},
5969     column => $self->{column_prev} - 5);
5970     }
5971 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '', ## |name| is appended to by the md name states.
5972 wakaba 1.15 attrdefs => [],
5973 wakaba 1.14 line => $self->{line_prev},
5974     column => $self->{column_prev} - 7}; ## Column of "<": 1 + length of |kwd| ("ATTLIS"), the same origin as the "bogus comment" error of this state.
5975     $self->{state} = DOCTYPE_MD_STATE;
5976    
5977     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5978     $self->{line_prev} = $self->{line};
5979     $self->{column_prev} = $self->{column};
5980     $self->{column}++;
5981     $self->{nc}
5982     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5983     } else {
5984     $self->{set_nc}->($self);
5985     }
5986    
5987     redo A;
5988     } else {
5989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5990     line => $self->{line_prev},
5991     column => $self->{column_prev} - 1
5992     - (length $self->{kwd})
5993     + 1 * ($self->{nc} == -1));
5994     $self->{state} = BOGUS_COMMENT_STATE;
5995     ## Reconsume.
5996     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5997     redo A;
5998     }
5999     } elsif ($self->{state} == MD_NOTATION_STATE) {
6000 wakaba 1.17 if ($self->{nc} == [
6001     undef,
6002     0x004F, # O
6003     0x0054, # T
6004     0x0041, # A
6005     0x0054, # T
6006     0x0049, # I
6007     0x004F, # O
6008     ]->[length $self->{kwd}] or
6009     $self->{nc} == [
6010     undef,
6011     0x006F, # o
6012     0x0074, # t
6013     0x0061, # a
6014     0x0074, # t
6015     0x0069, # i
6016     0x006F, # o
6017     ]->[length $self->{kwd}]) {
6018 wakaba 1.14 ## Stay in the state.
6019     $self->{kwd} .= chr $self->{nc};
6020    
6021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6022     $self->{line_prev} = $self->{line};
6023     $self->{column_prev} = $self->{column};
6024     $self->{column}++;
6025     $self->{nc}
6026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6027     } else {
6028     $self->{set_nc}->($self);
6029     }
6030    
6031     redo A;
6032 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6033     ($self->{nc} == 0x004E or # N
6034     $self->{nc} == 0x006E)) { # n
6035     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6037     text => 'NOTATION',
6038     line => $self->{line_prev},
6039     column => $self->{column_prev} - 6);
6040     }
6041 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '', ## |name| is appended to by the md name states.
6042     line => $self->{line_prev},
6043     column => $self->{column_prev} - 8}; ## Column of "<": 1 + length of |kwd| ("NOTATIO"), the same origin as the "bogus comment" error of this state.
6044     $self->{state} = DOCTYPE_MD_STATE;
6045    
6046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047     $self->{line_prev} = $self->{line};
6048     $self->{column_prev} = $self->{column};
6049     $self->{column}++;
6050     $self->{nc}
6051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052     } else {
6053     $self->{set_nc}->($self);
6054     }
6055    
6056     redo A;
6057     } else {
6058     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6059     line => $self->{line_prev},
6060     column => $self->{column_prev} - 1
6061     - (length $self->{kwd})
6062     + 1 * ($self->{nc} == -1));
6063     $self->{state} = BOGUS_COMMENT_STATE;
6064     ## Reconsume.
6065     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6066     redo A;
6067     }
6068     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6069     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6070     ## "DOCTYPE NOTATION state".
6071    
6072     if ($is_space->{$self->{nc}}) {
6073     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6074     $self->{state} = BEFORE_MD_NAME_STATE;
6075    
6076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6077     $self->{line_prev} = $self->{line};
6078     $self->{column_prev} = $self->{column};
6079     $self->{column}++;
6080     $self->{nc}
6081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6082     } else {
6083     $self->{set_nc}->($self);
6084     }
6085    
6086     redo A;
6087     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6088     $self->{nc} == 0x0025) { # %
6089     ## XML5: Switch to the "DOCTYPE bogus comment state".
6090     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6091     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6092    
6093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6094     $self->{line_prev} = $self->{line};
6095     $self->{column_prev} = $self->{column};
6096     $self->{column}++;
6097     $self->{nc}
6098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6099     } else {
6100     $self->{set_nc}->($self);
6101     }
6102    
6103     redo A;
6104     } elsif ($self->{nc} == -1) {
6105     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6106     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6107     ## Reconsume.
6108     redo A;
6109     } elsif ($self->{nc} == 0x003E) { # >
6110     ## XML5: Switch to the "DOCTYPE bogus comment state".
6111     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6112     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6113    
6114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6115     $self->{line_prev} = $self->{line};
6116     $self->{column_prev} = $self->{column};
6117     $self->{column}++;
6118     $self->{nc}
6119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6120     } else {
6121     $self->{set_nc}->($self);
6122     }
6123    
6124     redo A;
6125     } else {
6126     ## XML5: Switch to the "DOCTYPE bogus comment state".
6127     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6128     $self->{state} = BEFORE_MD_NAME_STATE;
6129     redo A;
6130     }
6131     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6132     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6133     ## before state", "DOCTYPE ATTLIST name before state".
6134    
6135     if ($is_space->{$self->{nc}}) {
6136     ## Stay in the state.
6137    
6138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139     $self->{line_prev} = $self->{line};
6140     $self->{column_prev} = $self->{column};
6141     $self->{column}++;
6142     $self->{nc}
6143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144     } else {
6145     $self->{set_nc}->($self);
6146     }
6147    
6148     redo A;
6149     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6150     $self->{nc} == 0x0025) { # %
6151     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6152    
6153     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6154     $self->{line_prev} = $self->{line};
6155     $self->{column_prev} = $self->{column};
6156     $self->{column}++;
6157     $self->{nc}
6158     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6159     } else {
6160     $self->{set_nc}->($self);
6161     }
6162    
6163     redo A;
6164     } elsif ($self->{nc} == 0x003E) { # >
6165     ## XML5: Same as "Anything else".
6166     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6167     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6168    
6169     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6170     $self->{line_prev} = $self->{line};
6171     $self->{column_prev} = $self->{column};
6172     $self->{column}++;
6173     $self->{nc}
6174     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6175     } else {
6176     $self->{set_nc}->($self);
6177     }
6178    
6179     redo A;
6180     } elsif ($self->{nc} == -1) {
6181     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6182     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6183     ## Reconsume.
6184     redo A;
6185     } else {
6186     ## XML5: [ATTLIST] Not defined yet.
6187     $self->{ct}->{name} .= chr $self->{nc};
6188     $self->{state} = MD_NAME_STATE;
6189    
6190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6191     $self->{line_prev} = $self->{line};
6192     $self->{column_prev} = $self->{column};
6193     $self->{column}++;
6194     $self->{nc}
6195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6196     } else {
6197     $self->{set_nc}->($self);
6198     }
6199    
6200     redo A;
6201     }
6202     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6203     if ($is_space->{$self->{nc}}) {
6204     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6205     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6206     $self->{state} = BEFORE_MD_NAME_STATE;
6207 wakaba 1.8
6208 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6209     $self->{line_prev} = $self->{line};
6210     $self->{column_prev} = $self->{column};
6211     $self->{column}++;
6212     $self->{nc}
6213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6214     } else {
6215     $self->{set_nc}->($self);
6216     }
6217    
6218     redo A;
6219     } elsif ($self->{nc} == 0x003E) { # >
6220     ## XML5: Same as "Anything else".
6221     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6222     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6223    
6224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6225     $self->{line_prev} = $self->{line};
6226     $self->{column_prev} = $self->{column};
6227     $self->{column}++;
6228     $self->{nc}
6229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6230     } else {
6231     $self->{set_nc}->($self);
6232     }
6233    
6234     redo A;
6235     } elsif ($self->{nc} == -1) {
6236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6237     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6238     ## Reconsume.
6239     redo A;
6240     } else {
6241     ## XML5: No parse error.
6242     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6243     $self->{state} = BOGUS_COMMENT_STATE;
6244     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6245     ## Reconsume.
6246     redo A;
6247     }
6248     } elsif ($self->{state} == MD_NAME_STATE) {
6249     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6250    
6251     if ($is_space->{$self->{nc}}) {
6252 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6253     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6254     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6255 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6256 wakaba 1.16 } else { # ENTITY/NOTATION
6257     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6258     }
6259 wakaba 1.14
6260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6261     $self->{line_prev} = $self->{line};
6262     $self->{column_prev} = $self->{column};
6263     $self->{column}++;
6264     $self->{nc}
6265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6266     } else {
6267     $self->{set_nc}->($self);
6268     }
6269    
6270     redo A;
6271     } elsif ($self->{nc} == 0x003E) { # >
6272     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6273     #
6274     } else {
6275 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6276 wakaba 1.14 }
6277     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6278    
6279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6280     $self->{line_prev} = $self->{line};
6281     $self->{column_prev} = $self->{column};
6282     $self->{column}++;
6283     $self->{nc}
6284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6285     } else {
6286     $self->{set_nc}->($self);
6287     }
6288    
6289     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6290     redo A;
6291     } elsif ($self->{nc} == -1) {
6292     ## XML5: [ATTLIST] No parse error.
6293     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6294     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6295     ## Reconsume.
6296     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6297     redo A;
6298     } else {
6299     ## XML5: [ATTLIST] Not defined yet.
6300     $self->{ct}->{name} .= chr $self->{nc};
6301     ## Stay in the state.
6302    
6303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304     $self->{line_prev} = $self->{line};
6305     $self->{column_prev} = $self->{column};
6306     $self->{column}++;
6307     $self->{nc}
6308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309     } else {
6310     $self->{set_nc}->($self);
6311     }
6312    
6313     redo A;
6314     }
6315     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6316     if ($is_space->{$self->{nc}}) {
6317     ## Stay in the state.
6318    
6319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320     $self->{line_prev} = $self->{line};
6321     $self->{column_prev} = $self->{column};
6322     $self->{column}++;
6323     $self->{nc}
6324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325     } else {
6326     $self->{set_nc}->($self);
6327     }
6328    
6329     redo A;
6330     } elsif ($self->{nc} == 0x003E) { # >
6331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6332    
6333     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6334     $self->{line_prev} = $self->{line};
6335     $self->{column_prev} = $self->{column};
6336     $self->{column}++;
6337     $self->{nc}
6338     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6339     } else {
6340     $self->{set_nc}->($self);
6341     }
6342    
6343     return ($self->{ct}); # ATTLIST
6344     redo A;
6345     } elsif ($self->{nc} == -1) {
6346     ## XML5: No parse error.
6347     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6348     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6349 wakaba 1.15 return ($self->{ct});
6350 wakaba 1.14 redo A;
6351     } else {
6352     ## XML5: Not defined yet.
6353 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6354     tokens => [],
6355     line => $self->{line}, column => $self->{column}};
6356     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6357    
6358     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6359     $self->{line_prev} = $self->{line};
6360     $self->{column_prev} = $self->{column};
6361     $self->{column}++;
6362     $self->{nc}
6363     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6364     } else {
6365     $self->{set_nc}->($self);
6366     }
6367    
6368     redo A;
6369     }
6370     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6371     if ($is_space->{$self->{nc}}) {
6372     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6373    
6374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6375     $self->{line_prev} = $self->{line};
6376     $self->{column_prev} = $self->{column};
6377     $self->{column}++;
6378     $self->{nc}
6379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6380     } else {
6381     $self->{set_nc}->($self);
6382     }
6383    
6384     redo A;
6385     } elsif ($self->{nc} == 0x003E) { # >
6386     ## XML5: Same as "anything else".
6387     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6388     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6389    
6390     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6391     $self->{line_prev} = $self->{line};
6392     $self->{column_prev} = $self->{column};
6393     $self->{column}++;
6394     $self->{nc}
6395     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6396     } else {
6397     $self->{set_nc}->($self);
6398     }
6399    
6400     return ($self->{ct}); # ATTLIST
6401     redo A;
6402     } elsif ($self->{nc} == 0x0028) { # (
6403     ## XML5: Same as "anything else".
6404     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6405     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6406    
6407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6408     $self->{line_prev} = $self->{line};
6409     $self->{column_prev} = $self->{column};
6410     $self->{column}++;
6411     $self->{nc}
6412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6413     } else {
6414     $self->{set_nc}->($self);
6415     }
6416    
6417     redo A;
6418     } elsif ($self->{nc} == -1) {
6419     ## XML5: No parse error.
6420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6421     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6422    
6423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6424     $self->{line_prev} = $self->{line};
6425     $self->{column_prev} = $self->{column};
6426     $self->{column}++;
6427     $self->{nc}
6428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6429     } else {
6430     $self->{set_nc}->($self);
6431     }
6432    
6433     return ($self->{ct}); # ATTLIST
6434     redo A;
6435     } else {
6436     ## XML5: Not defined yet.
6437     $self->{ca}->{name} .= chr $self->{nc};
6438     ## Stay in the state.
6439    
6440     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6441     $self->{line_prev} = $self->{line};
6442     $self->{column_prev} = $self->{column};
6443     $self->{column}++;
6444     $self->{nc}
6445     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6446     } else {
6447     $self->{set_nc}->($self);
6448     }
6449    
6450 wakaba 1.14 redo A;
6451     }
6452 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6453     if ($is_space->{$self->{nc}}) {
6454     ## Stay in the state.
6455    
6456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6457     $self->{line_prev} = $self->{line};
6458     $self->{column_prev} = $self->{column};
6459     $self->{column}++;
6460     $self->{nc}
6461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6462     } else {
6463     $self->{set_nc}->($self);
6464     }
6465    
6466     redo A;
6467     } elsif ($self->{nc} == 0x003E) { # >
6468     ## XML5: Same as "anything else".
6469     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6470     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6471    
6472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6473     $self->{line_prev} = $self->{line};
6474     $self->{column_prev} = $self->{column};
6475     $self->{column}++;
6476     $self->{nc}
6477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6478     } else {
6479     $self->{set_nc}->($self);
6480     }
6481    
6482     return ($self->{ct}); # ATTLIST
6483     redo A;
6484     } elsif ($self->{nc} == 0x0028) { # (
6485     ## XML5: Same as "anything else".
6486     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6487    
6488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6489     $self->{line_prev} = $self->{line};
6490     $self->{column_prev} = $self->{column};
6491     $self->{column}++;
6492     $self->{nc}
6493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6494     } else {
6495     $self->{set_nc}->($self);
6496     }
6497    
6498     redo A;
6499     } elsif ($self->{nc} == -1) {
6500     ## XML5: No parse error.
6501     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6502     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6503    
6504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6505     $self->{line_prev} = $self->{line};
6506     $self->{column_prev} = $self->{column};
6507     $self->{column}++;
6508     $self->{nc}
6509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6510     } else {
6511     $self->{set_nc}->($self);
6512     }
6513    
6514     return ($self->{ct});
6515     redo A;
6516     } else {
6517     ## XML5: Not defined yet.
6518     $self->{ca}->{type} = chr $self->{nc};
6519     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6520    
6521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6522     $self->{line_prev} = $self->{line};
6523     $self->{column_prev} = $self->{column};
6524     $self->{column}++;
6525     $self->{nc}
6526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6527     } else {
6528     $self->{set_nc}->($self);
6529     }
6530    
6531     redo A;
6532     }
6533     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6534     if ($is_space->{$self->{nc}}) {
6535     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6536    
6537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6538     $self->{line_prev} = $self->{line};
6539     $self->{column_prev} = $self->{column};
6540     $self->{column}++;
6541     $self->{nc}
6542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6543     } else {
6544     $self->{set_nc}->($self);
6545     }
6546    
6547     redo A;
6548     } elsif ($self->{nc} == 0x0023) { # #
6549     ## XML5: Same as "anything else".
6550     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6551     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6552    
6553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6554     $self->{line_prev} = $self->{line};
6555     $self->{column_prev} = $self->{column};
6556     $self->{column}++;
6557     $self->{nc}
6558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6559     } else {
6560     $self->{set_nc}->($self);
6561     }
6562    
6563     redo A;
6564     } elsif ($self->{nc} == 0x0022) { # "
6565     ## XML5: Same as "anything else".
6566     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6567     $self->{ca}->{value} = '';
6568     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6569    
6570     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6571     $self->{line_prev} = $self->{line};
6572     $self->{column_prev} = $self->{column};
6573     $self->{column}++;
6574     $self->{nc}
6575     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6576     } else {
6577     $self->{set_nc}->($self);
6578     }
6579    
6580     redo A;
6581     } elsif ($self->{nc} == 0x0027) { # '
6582     ## XML5: Same as "anything else".
6583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6584     $self->{ca}->{value} = '';
6585     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6586    
6587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6588     $self->{line_prev} = $self->{line};
6589     $self->{column_prev} = $self->{column};
6590     $self->{column}++;
6591     $self->{nc}
6592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6593     } else {
6594     $self->{set_nc}->($self);
6595     }
6596    
6597     redo A;
6598     } elsif ($self->{nc} == 0x003E) { # >
6599     ## XML5: Same as "anything else".
6600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6601     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6602    
6603     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6604     $self->{line_prev} = $self->{line};
6605     $self->{column_prev} = $self->{column};
6606     $self->{column}++;
6607     $self->{nc}
6608     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6609     } else {
6610     $self->{set_nc}->($self);
6611     }
6612    
6613     return ($self->{ct}); # ATTLIST
6614     redo A;
6615     } elsif ($self->{nc} == 0x0028) { # (
6616     ## XML5: Same as "anything else".
6617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6618     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6619    
6620     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6621     $self->{line_prev} = $self->{line};
6622     $self->{column_prev} = $self->{column};
6623     $self->{column}++;
6624     $self->{nc}
6625     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6626     } else {
6627     $self->{set_nc}->($self);
6628     }
6629    
6630     redo A;
6631     } elsif ($self->{nc} == -1) {
6632     ## XML5: No parse error.
6633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6634     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6635    
6636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6637     $self->{line_prev} = $self->{line};
6638     $self->{column_prev} = $self->{column};
6639     $self->{column}++;
6640     $self->{nc}
6641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6642     } else {
6643     $self->{set_nc}->($self);
6644     }
6645    
6646     return ($self->{ct});
6647     redo A;
6648     } else {
6649     ## XML5: Not defined yet.
6650     $self->{ca}->{type} .= chr $self->{nc};
6651     ## Stay in the state.
6652    
6653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6654     $self->{line_prev} = $self->{line};
6655     $self->{column_prev} = $self->{column};
6656     $self->{column}++;
6657     $self->{nc}
6658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6659     } else {
6660     $self->{set_nc}->($self);
6661     }
6662    
6663     redo A;
6664     }
6665     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6666     if ($is_space->{$self->{nc}}) {
6667     ## Stay in the state.
6668    
6669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6670     $self->{line_prev} = $self->{line};
6671     $self->{column_prev} = $self->{column};
6672     $self->{column}++;
6673     $self->{nc}
6674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6675     } else {
6676     $self->{set_nc}->($self);
6677     }
6678    
6679     redo A;
6680     } elsif ($self->{nc} == 0x0028) { # (
6681     ## XML5: Same as "anything else".
6682     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6683    
6684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6685     $self->{line_prev} = $self->{line};
6686     $self->{column_prev} = $self->{column};
6687     $self->{column}++;
6688     $self->{nc}
6689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6690     } else {
6691     $self->{set_nc}->($self);
6692     }
6693    
6694     redo A;
6695     } elsif ($self->{nc} == 0x0023) { # #
6696     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6697    
6698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699     $self->{line_prev} = $self->{line};
6700     $self->{column_prev} = $self->{column};
6701     $self->{column}++;
6702     $self->{nc}
6703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704     } else {
6705     $self->{set_nc}->($self);
6706     }
6707    
6708     redo A;
6709     } elsif ($self->{nc} == 0x0022) { # "
6710     ## XML5: Same as "anything else".
6711     $self->{ca}->{value} = '';
6712     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6713    
6714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6715     $self->{line_prev} = $self->{line};
6716     $self->{column_prev} = $self->{column};
6717     $self->{column}++;
6718     $self->{nc}
6719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6720     } else {
6721     $self->{set_nc}->($self);
6722     }
6723    
6724     redo A;
6725     } elsif ($self->{nc} == 0x0027) { # '
6726     ## XML5: Same as "anything else".
6727     $self->{ca}->{value} = '';
6728     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6729    
6730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731     $self->{line_prev} = $self->{line};
6732     $self->{column_prev} = $self->{column};
6733     $self->{column}++;
6734     $self->{nc}
6735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736     } else {
6737     $self->{set_nc}->($self);
6738     }
6739    
6740     redo A;
6741     } elsif ($self->{nc} == 0x003E) { # >
6742     ## XML5: Same as "anything else".
6743     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6744     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6745    
6746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6747     $self->{line_prev} = $self->{line};
6748     $self->{column_prev} = $self->{column};
6749     $self->{column}++;
6750     $self->{nc}
6751     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6752     } else {
6753     $self->{set_nc}->($self);
6754     }
6755    
6756     return ($self->{ct}); # ATTLIST
6757     redo A;
6758     } elsif ($self->{nc} == -1) {
6759     ## XML5: No parse error.
6760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6761     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6762    
6763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6764     $self->{line_prev} = $self->{line};
6765     $self->{column_prev} = $self->{column};
6766     $self->{column}++;
6767     $self->{nc}
6768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6769     } else {
6770     $self->{set_nc}->($self);
6771     }
6772    
6773     return ($self->{ct});
6774     redo A;
6775     } else {
6776     ## XML5: Switch to the "DOCTYPE bogus comment state".
6777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6778     $self->{ca}->{value} = '';
6779     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6780     ## Reconsume.
6781     redo A;
6782     }
6783     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6784     if ($is_space->{$self->{nc}}) {
6785     ## Stay in the state.
6786    
6787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6788     $self->{line_prev} = $self->{line};
6789     $self->{column_prev} = $self->{column};
6790     $self->{column}++;
6791     $self->{nc}
6792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6793     } else {
6794     $self->{set_nc}->($self);
6795     }
6796    
6797     redo A;
6798     } elsif ($self->{nc} == 0x007C) { # |
6799     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6800     ## Stay in the state.
6801    
6802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6803     $self->{line_prev} = $self->{line};
6804     $self->{column_prev} = $self->{column};
6805     $self->{column}++;
6806     $self->{nc}
6807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6808     } else {
6809     $self->{set_nc}->($self);
6810     }
6811    
6812     redo A;
6813     } elsif ($self->{nc} == 0x0029) { # )
6814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6815     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6816    
6817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6818     $self->{line_prev} = $self->{line};
6819     $self->{column_prev} = $self->{column};
6820     $self->{column}++;
6821     $self->{nc}
6822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6823     } else {
6824     $self->{set_nc}->($self);
6825     }
6826    
6827     redo A;
6828     } elsif ($self->{nc} == 0x003E) { # >
6829     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6830     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6831    
6832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6833     $self->{line_prev} = $self->{line};
6834     $self->{column_prev} = $self->{column};
6835     $self->{column}++;
6836     $self->{nc}
6837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6838     } else {
6839     $self->{set_nc}->($self);
6840     }
6841    
6842     return ($self->{ct}); # ATTLIST
6843     redo A;
6844     } elsif ($self->{nc} == -1) {
6845     ## XML5: No parse error.
6846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6847     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6848    
6849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6850     $self->{line_prev} = $self->{line};
6851     $self->{column_prev} = $self->{column};
6852     $self->{column}++;
6853     $self->{nc}
6854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6855     } else {
6856     $self->{set_nc}->($self);
6857     }
6858    
6859     return ($self->{ct});
6860     redo A;
6861     } else {
6862     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6863     $self->{state} = ALLOWED_TOKEN_STATE;
6864    
6865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6866     $self->{line_prev} = $self->{line};
6867     $self->{column_prev} = $self->{column};
6868     $self->{column}++;
6869     $self->{nc}
6870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6871     } else {
6872     $self->{set_nc}->($self);
6873     }
6874    
6875     redo A;
6876     }
6877     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6878     if ($is_space->{$self->{nc}}) {
6879     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6880    
6881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6882     $self->{line_prev} = $self->{line};
6883     $self->{column_prev} = $self->{column};
6884     $self->{column}++;
6885     $self->{nc}
6886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6887     } else {
6888     $self->{set_nc}->($self);
6889     }
6890    
6891     redo A;
6892     } elsif ($self->{nc} == 0x007C) { # |
6893     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6894    
6895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896     $self->{line_prev} = $self->{line};
6897     $self->{column_prev} = $self->{column};
6898     $self->{column}++;
6899     $self->{nc}
6900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901     } else {
6902     $self->{set_nc}->($self);
6903     }
6904    
6905     redo A;
6906     } elsif ($self->{nc} == 0x0029) { # )
6907     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6908    
6909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6910     $self->{line_prev} = $self->{line};
6911     $self->{column_prev} = $self->{column};
6912     $self->{column}++;
6913     $self->{nc}
6914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6915     } else {
6916     $self->{set_nc}->($self);
6917     }
6918    
6919     redo A;
6920     } elsif ($self->{nc} == 0x003E) { # >
6921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6922     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6923    
6924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6925     $self->{line_prev} = $self->{line};
6926     $self->{column_prev} = $self->{column};
6927     $self->{column}++;
6928     $self->{nc}
6929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6930     } else {
6931     $self->{set_nc}->($self);
6932     }
6933    
6934     return ($self->{ct}); # ATTLIST
6935     redo A;
6936     } elsif ($self->{nc} == -1) {
6937     ## XML5: No parse error.
6938     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6939     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6940    
6941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942     $self->{line_prev} = $self->{line};
6943     $self->{column_prev} = $self->{column};
6944     $self->{column}++;
6945     $self->{nc}
6946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947     } else {
6948     $self->{set_nc}->($self);
6949     }
6950    
6951     return ($self->{ct});
6952     redo A;
6953     } else {
6954     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6955     ## Stay in the state.
6956    
6957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6958     $self->{line_prev} = $self->{line};
6959     $self->{column_prev} = $self->{column};
6960     $self->{column}++;
6961     $self->{nc}
6962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6963     } else {
6964     $self->{set_nc}->($self);
6965     }
6966    
6967     redo A;
6968     }
6969     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6970     if ($is_space->{$self->{nc}}) {
6971     ## Stay in the state.
6972    
6973     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6974     $self->{line_prev} = $self->{line};
6975     $self->{column_prev} = $self->{column};
6976     $self->{column}++;
6977     $self->{nc}
6978     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6979     } else {
6980     $self->{set_nc}->($self);
6981     }
6982    
6983     redo A;
6984     } elsif ($self->{nc} == 0x007C) { # |
6985     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6986    
6987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6988     $self->{line_prev} = $self->{line};
6989     $self->{column_prev} = $self->{column};
6990     $self->{column}++;
6991     $self->{nc}
6992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6993     } else {
6994     $self->{set_nc}->($self);
6995     }
6996    
6997     redo A;
6998     } elsif ($self->{nc} == 0x0029) { # )
6999     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7000    
7001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002     $self->{line_prev} = $self->{line};
7003     $self->{column_prev} = $self->{column};
7004     $self->{column}++;
7005     $self->{nc}
7006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007     } else {
7008     $self->{set_nc}->($self);
7009     }
7010    
7011     redo A;
7012     } elsif ($self->{nc} == 0x003E) { # >
7013     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7014     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7015    
7016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7017     $self->{line_prev} = $self->{line};
7018     $self->{column_prev} = $self->{column};
7019     $self->{column}++;
7020     $self->{nc}
7021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7022     } else {
7023     $self->{set_nc}->($self);
7024     }
7025    
7026     return ($self->{ct}); # ATTLIST
7027     redo A;
7028     } elsif ($self->{nc} == -1) {
7029     ## XML5: No parse error.
7030     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7031     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7032    
7033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034     $self->{line_prev} = $self->{line};
7035     $self->{column_prev} = $self->{column};
7036     $self->{column}++;
7037     $self->{nc}
7038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039     } else {
7040     $self->{set_nc}->($self);
7041     }
7042    
7043     return ($self->{ct});
7044     redo A;
7045     } else {
7046     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7047     line => $self->{line_prev},
7048     column => $self->{column_prev});
7049     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7050     $self->{state} = ALLOWED_TOKEN_STATE;
7051    
7052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7053     $self->{line_prev} = $self->{line};
7054     $self->{column_prev} = $self->{column};
7055     $self->{column}++;
7056     $self->{nc}
7057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7058     } else {
7059     $self->{set_nc}->($self);
7060     }
7061    
7062     redo A;
7063     }
7064     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7065     if ($is_space->{$self->{nc}}) {
7066     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7067    
7068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069     $self->{line_prev} = $self->{line};
7070     $self->{column_prev} = $self->{column};
7071     $self->{column}++;
7072     $self->{nc}
7073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074     } else {
7075     $self->{set_nc}->($self);
7076     }
7077    
7078     redo A;
7079     } elsif ($self->{nc} == 0x0023) { # #
7080     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7081     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7082    
7083     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7084     $self->{line_prev} = $self->{line};
7085     $self->{column_prev} = $self->{column};
7086     $self->{column}++;
7087     $self->{nc}
7088     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7089     } else {
7090     $self->{set_nc}->($self);
7091     }
7092    
7093     redo A;
7094     } elsif ($self->{nc} == 0x0022) { # "
7095     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7096     $self->{ca}->{value} = '';
7097     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7098    
7099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7100     $self->{line_prev} = $self->{line};
7101     $self->{column_prev} = $self->{column};
7102     $self->{column}++;
7103     $self->{nc}
7104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7105     } else {
7106     $self->{set_nc}->($self);
7107     }
7108    
7109     redo A;
7110     } elsif ($self->{nc} == 0x0027) { # '
7111     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7112     $self->{ca}->{value} = '';
7113     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7114    
7115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7116     $self->{line_prev} = $self->{line};
7117     $self->{column_prev} = $self->{column};
7118     $self->{column}++;
7119     $self->{nc}
7120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7121     } else {
7122     $self->{set_nc}->($self);
7123     }
7124    
7125     redo A;
7126     } elsif ($self->{nc} == 0x003E) { # >
7127     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7128     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7129    
7130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7131     $self->{line_prev} = $self->{line};
7132     $self->{column_prev} = $self->{column};
7133     $self->{column}++;
7134     $self->{nc}
7135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7136     } else {
7137     $self->{set_nc}->($self);
7138     }
7139    
7140     return ($self->{ct}); # ATTLIST
7141     redo A;
7142     } elsif ($self->{nc} == -1) {
7143     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7144     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7145    
7146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7147     $self->{line_prev} = $self->{line};
7148     $self->{column_prev} = $self->{column};
7149     $self->{column}++;
7150     $self->{nc}
7151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7152     } else {
7153     $self->{set_nc}->($self);
7154     }
7155    
7156     return ($self->{ct});
7157     redo A;
7158     } else {
7159     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7160     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7161     ## Reconsume.
7162     redo A;
7163     }
7164     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7165     if ($is_space->{$self->{nc}}) {
7166     ## Stay in the state.
7167    
7168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7169     $self->{line_prev} = $self->{line};
7170     $self->{column_prev} = $self->{column};
7171     $self->{column}++;
7172     $self->{nc}
7173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7174     } else {
7175     $self->{set_nc}->($self);
7176     }
7177    
7178     redo A;
7179     } elsif ($self->{nc} == 0x0023) { # #
7180     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7181    
7182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7183     $self->{line_prev} = $self->{line};
7184     $self->{column_prev} = $self->{column};
7185     $self->{column}++;
7186     $self->{nc}
7187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7188     } else {
7189     $self->{set_nc}->($self);
7190     }
7191    
7192     redo A;
7193     } elsif ($self->{nc} == 0x0022) { # "
7194     $self->{ca}->{value} = '';
7195     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7196    
7197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7198     $self->{line_prev} = $self->{line};
7199     $self->{column_prev} = $self->{column};
7200     $self->{column}++;
7201     $self->{nc}
7202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7203     } else {
7204     $self->{set_nc}->($self);
7205     }
7206    
7207     redo A;
7208     } elsif ($self->{nc} == 0x0027) { # '
7209     $self->{ca}->{value} = '';
7210     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7211    
7212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213     $self->{line_prev} = $self->{line};
7214     $self->{column_prev} = $self->{column};
7215     $self->{column}++;
7216     $self->{nc}
7217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218     } else {
7219     $self->{set_nc}->($self);
7220     }
7221    
7222     redo A;
7223     } elsif ($self->{nc} == 0x003E) { # >
7224     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7226    
7227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228     $self->{line_prev} = $self->{line};
7229     $self->{column_prev} = $self->{column};
7230     $self->{column}++;
7231     $self->{nc}
7232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233     } else {
7234     $self->{set_nc}->($self);
7235     }
7236    
7237     return ($self->{ct}); # ATTLIST
7238     redo A;
7239     } elsif ($self->{nc} == -1) {
7240     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7241     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7242    
7243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244     $self->{line_prev} = $self->{line};
7245     $self->{column_prev} = $self->{column};
7246     $self->{column}++;
7247     $self->{nc}
7248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249     } else {
7250     $self->{set_nc}->($self);
7251     }
7252    
7253     return ($self->{ct});
7254     redo A;
7255     } else {
7256     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7257     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7258     ## Reconsume.
7259     redo A;
7260     }
7261     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7262     if ($is_space->{$self->{nc}}) {
7263     ## XML5: No parse error.
7264     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7265 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7266 wakaba 1.15 ## Reconsume.
7267     redo A;
7268     } elsif ($self->{nc} == 0x0022) { # "
7269     ## XML5: Same as "anything else".
7270     $self->{ca}->{value} = '';
7271     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7272    
7273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7274     $self->{line_prev} = $self->{line};
7275     $self->{column_prev} = $self->{column};
7276     $self->{column}++;
7277     $self->{nc}
7278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7279     } else {
7280     $self->{set_nc}->($self);
7281     }
7282    
7283     redo A;
7284     } elsif ($self->{nc} == 0x0027) { # '
7285     ## XML5: Same as "anything else".
7286     $self->{ca}->{value} = '';
7287     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7288    
7289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7290     $self->{line_prev} = $self->{line};
7291     $self->{column_prev} = $self->{column};
7292     $self->{column}++;
7293     $self->{nc}
7294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7295     } else {
7296     $self->{set_nc}->($self);
7297     }
7298    
7299     redo A;
7300     } elsif ($self->{nc} == 0x003E) { # >
7301     ## XML5: Same as "anything else".
7302     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7303     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7304    
7305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7306     $self->{line_prev} = $self->{line};
7307     $self->{column_prev} = $self->{column};
7308     $self->{column}++;
7309     $self->{nc}
7310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7311     } else {
7312     $self->{set_nc}->($self);
7313     }
7314    
7315     return ($self->{ct}); # ATTLIST
7316     redo A;
7317     } elsif ($self->{nc} == -1) {
7318     ## XML5: No parse error.
7319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7320     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7321    
7322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323     $self->{line_prev} = $self->{line};
7324     $self->{column_prev} = $self->{column};
7325     $self->{column}++;
7326     $self->{nc}
7327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328     } else {
7329     $self->{set_nc}->($self);
7330     }
7331    
7332     return ($self->{ct});
7333     redo A;
7334     } else {
7335     $self->{ca}->{default} = chr $self->{nc};
7336     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7337    
7338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339     $self->{line_prev} = $self->{line};
7340     $self->{column_prev} = $self->{column};
7341     $self->{column}++;
7342     $self->{nc}
7343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344     } else {
7345     $self->{set_nc}->($self);
7346     }
7347    
7348     redo A;
7349     }
7350     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7351     if ($is_space->{$self->{nc}}) {
7352     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7353    
7354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355     $self->{line_prev} = $self->{line};
7356     $self->{column_prev} = $self->{column};
7357     $self->{column}++;
7358     $self->{nc}
7359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360     } else {
7361     $self->{set_nc}->($self);
7362     }
7363    
7364     redo A;
7365     } elsif ($self->{nc} == 0x0022) { # "
7366     ## XML5: Same as "anything else".
7367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7368     $self->{ca}->{value} = '';
7369     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7370    
7371     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7372     $self->{line_prev} = $self->{line};
7373     $self->{column_prev} = $self->{column};
7374     $self->{column}++;
7375     $self->{nc}
7376     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7377     } else {
7378     $self->{set_nc}->($self);
7379     }
7380    
7381     redo A;
7382     } elsif ($self->{nc} == 0x0027) { # '
7383     ## XML5: Same as "anything else".
7384     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7385     $self->{ca}->{value} = '';
7386     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7387    
7388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7389     $self->{line_prev} = $self->{line};
7390     $self->{column_prev} = $self->{column};
7391     $self->{column}++;
7392     $self->{nc}
7393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7394     } else {
7395     $self->{set_nc}->($self);
7396     }
7397    
7398     redo A;
7399     } elsif ($self->{nc} == 0x003E) { # >
7400     ## XML5: Same as "anything else".
7401     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7402     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7403    
7404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7405     $self->{line_prev} = $self->{line};
7406     $self->{column_prev} = $self->{column};
7407     $self->{column}++;
7408     $self->{nc}
7409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7410     } else {
7411     $self->{set_nc}->($self);
7412     }
7413    
7414     return ($self->{ct}); # ATTLIST
7415     redo A;
7416     } elsif ($self->{nc} == -1) {
7417     ## XML5: No parse error.
7418     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7419     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7420     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7421    
7422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423     $self->{line_prev} = $self->{line};
7424     $self->{column_prev} = $self->{column};
7425     $self->{column}++;
7426     $self->{nc}
7427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428     } else {
7429     $self->{set_nc}->($self);
7430     }
7431    
7432     return ($self->{ct});
7433     redo A;
7434     } else {
7435     $self->{ca}->{default} .= chr $self->{nc};
7436     ## Stay in the state.
7437    
7438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7439     $self->{line_prev} = $self->{line};
7440     $self->{column_prev} = $self->{column};
7441     $self->{column}++;
7442     $self->{nc}
7443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7444     } else {
7445     $self->{set_nc}->($self);
7446     }
7447    
7448     redo A;
7449     }
7450     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7451     if ($is_space->{$self->{nc}}) {
7452     ## Stay in the state.
7453    
7454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7455     $self->{line_prev} = $self->{line};
7456     $self->{column_prev} = $self->{column};
7457     $self->{column}++;
7458     $self->{nc}
7459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7460     } else {
7461     $self->{set_nc}->($self);
7462     }
7463    
7464     redo A;
7465     } elsif ($self->{nc} == 0x0022) { # "
7466     $self->{ca}->{value} = '';
7467     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7468    
7469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7470     $self->{line_prev} = $self->{line};
7471     $self->{column_prev} = $self->{column};
7472     $self->{column}++;
7473     $self->{nc}
7474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7475     } else {
7476     $self->{set_nc}->($self);
7477     }
7478    
7479     redo A;
7480     } elsif ($self->{nc} == 0x0027) { # '
7481     $self->{ca}->{value} = '';
7482     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7483    
7484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485     $self->{line_prev} = $self->{line};
7486     $self->{column_prev} = $self->{column};
7487     $self->{column}++;
7488     $self->{nc}
7489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490     } else {
7491     $self->{set_nc}->($self);
7492     }
7493    
7494     redo A;
7495     } elsif ($self->{nc} == 0x003E) { # >
7496     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7497     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7498    
7499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7500     $self->{line_prev} = $self->{line};
7501     $self->{column_prev} = $self->{column};
7502     $self->{column}++;
7503     $self->{nc}
7504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7505     } else {
7506     $self->{set_nc}->($self);
7507     }
7508    
7509     return ($self->{ct}); # ATTLIST
7510     redo A;
7511     } elsif ($self->{nc} == -1) {
7512     ## XML5: No parse error.
7513     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7514     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7515     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7516    
7517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7518     $self->{line_prev} = $self->{line};
7519     $self->{column_prev} = $self->{column};
7520     $self->{column}++;
7521     $self->{nc}
7522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7523     } else {
7524     $self->{set_nc}->($self);
7525     }
7526    
7527     return ($self->{ct});
7528     redo A;
7529     } else {
7530     ## XML5: Not defined yet.
7531     if ($self->{ca}->{default} eq 'FIXED') {
7532     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7533     } else {
7534     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7535     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7536     }
7537     ## Reconsume.
7538     redo A;
7539     }
7540     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7541     if ($is_space->{$self->{nc}} or
7542     $self->{nc} == -1 or
7543     $self->{nc} == 0x003E) { # >
7544     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7545     ## Reconsume.
7546     redo A;
7547     } else {
7548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7549     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7550     ## Reconsume.
7551     redo A;
7552 wakaba 1.16 }
7553 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7554     ## ASCII case-insensitive
7555     if ($self->{nc} == [
7556     undef,
7557     0x0044, # D
7558     0x0041, # A
7559     0x0054, # T
7560     ]->[length $self->{kwd}] or
7561     $self->{nc} == [
7562     undef,
7563     0x0064, # d
7564     0x0061, # a
7565     0x0074, # t
7566     ]->[length $self->{kwd}]) {
7567    
7568     ## Stay in the state.
7569     $self->{kwd} .= chr $self->{nc};
7570    
7571     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572     $self->{line_prev} = $self->{line};
7573     $self->{column_prev} = $self->{column};
7574     $self->{column}++;
7575     $self->{nc}
7576     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577     } else {
7578     $self->{set_nc}->($self);
7579     }
7580    
7581     redo A;
7582     } elsif ((length $self->{kwd}) == 4 and
7583     ($self->{nc} == 0x0041 or # A
7584     $self->{nc} == 0x0061)) { # a
7585     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7586    
7587     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7588     text => 'NDATA',
7589     line => $self->{line_prev},
7590     column => $self->{column_prev} - 4);
7591     } else {
7592    
7593     }
7594     $self->{state} = AFTER_NDATA_STATE;
7595    
7596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7597     $self->{line_prev} = $self->{line};
7598     $self->{column_prev} = $self->{column};
7599     $self->{column}++;
7600     $self->{nc}
7601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7602     } else {
7603     $self->{set_nc}->($self);
7604     }
7605    
7606     redo A;
7607     } else {
7608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7609     line => $self->{line_prev},
7610     column => $self->{column_prev} + 1
7611     - length $self->{kwd});
7612    
7613     $self->{state} = BOGUS_MD_STATE;
7614     ## Reconsume.
7615     redo A;
7616     }
7617     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7618     if ($is_space->{$self->{nc}}) {
7619     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7620    
7621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7622     $self->{line_prev} = $self->{line};
7623     $self->{column_prev} = $self->{column};
7624     $self->{column}++;
7625     $self->{nc}
7626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7627     } else {
7628     $self->{set_nc}->($self);
7629     }
7630    
7631     redo A;
7632     } elsif ($self->{nc} == 0x003E) { # >
7633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7634     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7635    
7636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7637     $self->{line_prev} = $self->{line};
7638     $self->{column_prev} = $self->{column};
7639     $self->{column}++;
7640     $self->{nc}
7641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7642     } else {
7643     $self->{set_nc}->($self);
7644     }
7645    
7646     return ($self->{ct}); # ENTITY
7647     redo A;
7648     } elsif ($self->{nc} == -1) {
7649     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7650     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7651    
7652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7653     $self->{line_prev} = $self->{line};
7654     $self->{column_prev} = $self->{column};
7655     $self->{column}++;
7656     $self->{nc}
7657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7658     } else {
7659     $self->{set_nc}->($self);
7660     }
7661    
7662     return ($self->{ct}); # ENTITY
7663     redo A;
7664     } else {
7665     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7666     line => $self->{line_prev},
7667     column => $self->{column_prev} + 1
7668     - length $self->{kwd});
7669     $self->{state} = BOGUS_MD_STATE;
7670     ## Reconsume.
7671     redo A;
7672     }
7673     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7674     if ($is_space->{$self->{nc}}) {
7675     ## Stay in the state.
7676    
7677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678     $self->{line_prev} = $self->{line};
7679     $self->{column_prev} = $self->{column};
7680     $self->{column}++;
7681     $self->{nc}
7682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683     } else {
7684     $self->{set_nc}->($self);
7685     }
7686    
7687     redo A;
7688     } elsif ($self->{nc} == 0x003E) { # >
7689     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7690     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7691    
7692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693     $self->{line_prev} = $self->{line};
7694     $self->{column_prev} = $self->{column};
7695     $self->{column}++;
7696     $self->{nc}
7697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698     } else {
7699     $self->{set_nc}->($self);
7700     }
7701    
7702     return ($self->{ct}); # ENTITY
7703     redo A;
7704     } elsif ($self->{nc} == -1) {
7705     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7707    
7708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709     $self->{line_prev} = $self->{line};
7710     $self->{column_prev} = $self->{column};
7711     $self->{column}++;
7712     $self->{nc}
7713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714     } else {
7715     $self->{set_nc}->($self);
7716     }
7717    
7718     return ($self->{ct}); # ENTITY
7719     redo A;
7720     } else {
7721     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7722     $self->{state} = NOTATION_NAME_STATE;
7723    
7724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725     $self->{line_prev} = $self->{line};
7726     $self->{column_prev} = $self->{column};
7727     $self->{column}++;
7728     $self->{nc}
7729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730     } else {
7731     $self->{set_nc}->($self);
7732     }
7733    
7734     redo A;
7735     }
7736     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7737     if ($is_space->{$self->{nc}}) {
7738 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7739 wakaba 1.18
7740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7741     $self->{line_prev} = $self->{line};
7742     $self->{column_prev} = $self->{column};
7743     $self->{column}++;
7744     $self->{nc}
7745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7746     } else {
7747     $self->{set_nc}->($self);
7748     }
7749    
7750     redo A;
7751     } elsif ($self->{nc} == 0x003E) { # >
7752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753    
7754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755     $self->{line_prev} = $self->{line};
7756     $self->{column_prev} = $self->{column};
7757     $self->{column}++;
7758     $self->{nc}
7759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760     } else {
7761     $self->{set_nc}->($self);
7762     }
7763    
7764     return ($self->{ct}); # ENTITY
7765     redo A;
7766     } elsif ($self->{nc} == -1) {
7767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7768     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7769    
7770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771     $self->{line_prev} = $self->{line};
7772     $self->{column_prev} = $self->{column};
7773     $self->{column}++;
7774     $self->{nc}
7775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776     } else {
7777     $self->{set_nc}->($self);
7778     }
7779    
7780     return ($self->{ct}); # ENTITY
7781     redo A;
7782     } else {
7783     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7784     ## Stay in the state.
7785    
7786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787     $self->{line_prev} = $self->{line};
7788     $self->{column_prev} = $self->{column};
7789     $self->{column}++;
7790     $self->{nc}
7791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792     } else {
7793     $self->{set_nc}->($self);
7794     }
7795    
7796     redo A;
7797     }
7798 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7799     if ($self->{nc} == 0x0022) { # "
7800 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7801 wakaba 1.19
7802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7803     $self->{line_prev} = $self->{line};
7804     $self->{column_prev} = $self->{column};
7805     $self->{column}++;
7806     $self->{nc}
7807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7808     } else {
7809     $self->{set_nc}->($self);
7810     }
7811    
7812     redo A;
7813     } elsif ($self->{nc} == 0x0026) { # &
7814     $self->{prev_state} = $self->{state};
7815     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7816     $self->{entity_add} = 0x0022; # "
7817    
7818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7819     $self->{line_prev} = $self->{line};
7820     $self->{column_prev} = $self->{column};
7821     $self->{column}++;
7822     $self->{nc}
7823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7824     } else {
7825     $self->{set_nc}->($self);
7826     }
7827    
7828     redo A;
7829     ## TODO: %
7830     } elsif ($self->{nc} == -1) {
7831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7833     ## Reconsume.
7834     return ($self->{ct}); # ENTITY
7835     redo A;
7836     } else {
7837     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7838    
7839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7840     $self->{line_prev} = $self->{line};
7841     $self->{column_prev} = $self->{column};
7842     $self->{column}++;
7843     $self->{nc}
7844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7845     } else {
7846     $self->{set_nc}->($self);
7847     }
7848    
7849     redo A;
7850     }
7851     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7852     if ($self->{nc} == 0x0027) { # '
7853 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7854 wakaba 1.19
7855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7856     $self->{line_prev} = $self->{line};
7857     $self->{column_prev} = $self->{column};
7858     $self->{column}++;
7859     $self->{nc}
7860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7861     } else {
7862     $self->{set_nc}->($self);
7863     }
7864    
7865     redo A;
7866     } elsif ($self->{nc} == 0x0026) { # &
7867     $self->{prev_state} = $self->{state};
7868     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7869     $self->{entity_add} = 0x0027; # '
7870    
7871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7872     $self->{line_prev} = $self->{line};
7873     $self->{column_prev} = $self->{column};
7874     $self->{column}++;
7875     $self->{nc}
7876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7877     } else {
7878     $self->{set_nc}->($self);
7879     }
7880    
7881     redo A;
7882     ## TODO: %
7883     } elsif ($self->{nc} == -1) {
7884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7885     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7886     ## Reconsume.
7887     return ($self->{ct}); # ENTITY
7888     redo A;
7889     } else {
7890     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7891    
7892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7893     $self->{line_prev} = $self->{line};
7894     $self->{column_prev} = $self->{column};
7895     $self->{column}++;
7896     $self->{nc}
7897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7898     } else {
7899     $self->{set_nc}->($self);
7900     }
7901    
7902     redo A;
7903     }
7904     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7905     if ($is_space->{$self->{nc}} or
7906     {
7907     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7908     $self->{entity_add} => 1,
7909     }->{$self->{nc}}) {
7910 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7911     line => $self->{line_prev},
7912     column => $self->{column_prev}
7913     + ($self->{nc} == -1 ? 1 : 0));
7914 wakaba 1.19 ## Don't consume
7915     ## Return nothing.
7916     #
7917     } elsif ($self->{nc} == 0x0023) { # #
7918     $self->{ca} = $self->{ct};
7919     $self->{state} = ENTITY_HASH_STATE;
7920     $self->{kwd} = '#';
7921    
7922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7923     $self->{line_prev} = $self->{line};
7924     $self->{column_prev} = $self->{column};
7925     $self->{column}++;
7926     $self->{nc}
7927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7928     } else {
7929     $self->{set_nc}->($self);
7930     }
7931    
7932     redo A;
7933     } else {
7934     #
7935     }
7936    
7937     $self->{ct}->{value} .= '&';
7938     $self->{state} = $self->{prev_state};
7939     ## Reconsume.
7940     redo A;
7941 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7942     if ($is_space->{$self->{nc}}) {
7943     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7944    
7945     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7946     $self->{line_prev} = $self->{line};
7947     $self->{column_prev} = $self->{column};
7948     $self->{column}++;
7949     $self->{nc}
7950     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7951     } else {
7952     $self->{set_nc}->($self);
7953     }
7954    
7955     redo A;
7956     } elsif ($self->{nc} == 0x0028) { # (
7957     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7958     $self->{ct}->{content} = ['('];
7959     $self->{group_depth} = 1;
7960    
7961     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7962     $self->{line_prev} = $self->{line};
7963     $self->{column_prev} = $self->{column};
7964     $self->{column}++;
7965     $self->{nc}
7966     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7967     } else {
7968     $self->{set_nc}->($self);
7969     }
7970    
7971     redo A;
7972     } elsif ($self->{nc} == 0x003E) { # >
7973     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7974     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7975    
7976     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7977     $self->{line_prev} = $self->{line};
7978     $self->{column_prev} = $self->{column};
7979     $self->{column}++;
7980     $self->{nc}
7981     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7982     } else {
7983     $self->{set_nc}->($self);
7984     }
7985    
7986     return ($self->{ct}); # ELEMENT
7987     redo A;
7988     } elsif ($self->{nc} == -1) {
7989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7990     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7991    
7992     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7993     $self->{line_prev} = $self->{line};
7994     $self->{column_prev} = $self->{column};
7995     $self->{column}++;
7996     $self->{nc}
7997     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7998     } else {
7999     $self->{set_nc}->($self);
8000     }
8001    
8002     return ($self->{ct}); # ELEMENT
8003     redo A;
8004     } else {
8005     $self->{ct}->{content} = [chr $self->{nc}];
8006     $self->{state} = CONTENT_KEYWORD_STATE;
8007    
8008     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009     $self->{line_prev} = $self->{line};
8010     $self->{column_prev} = $self->{column};
8011     $self->{column}++;
8012     $self->{nc}
8013     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014     } else {
8015     $self->{set_nc}->($self);
8016     }
8017    
8018     redo A;
8019     }
8020     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8021     if ($is_space->{$self->{nc}}) {
8022     $self->{state} = AFTER_MD_DEF_STATE;
8023    
8024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8025     $self->{line_prev} = $self->{line};
8026     $self->{column_prev} = $self->{column};
8027     $self->{column}++;
8028     $self->{nc}
8029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8030     } else {
8031     $self->{set_nc}->($self);
8032     }
8033    
8034     redo A;
8035     } elsif ($self->{nc} == 0x003E) { # >
8036     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8037    
8038     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039     $self->{line_prev} = $self->{line};
8040     $self->{column_prev} = $self->{column};
8041     $self->{column}++;
8042     $self->{nc}
8043     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044     } else {
8045     $self->{set_nc}->($self);
8046     }
8047    
8048     return ($self->{ct}); # ELEMENT
8049     redo A;
8050     } elsif ($self->{nc} == -1) {
8051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8052     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8053    
8054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055     $self->{line_prev} = $self->{line};
8056     $self->{column_prev} = $self->{column};
8057     $self->{column}++;
8058     $self->{nc}
8059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060     } else {
8061     $self->{set_nc}->($self);
8062     }
8063    
8064     return ($self->{ct}); # ELEMENT
8065     redo A;
8066     } else {
8067     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8068     ## Stay in the state.
8069    
8070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071     $self->{line_prev} = $self->{line};
8072     $self->{column_prev} = $self->{column};
8073     $self->{column}++;
8074     $self->{nc}
8075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076     } else {
8077     $self->{set_nc}->($self);
8078     }
8079    
8080     redo A;
8081     }
8082     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8083     if ($is_space->{$self->{nc}}) {
8084     ## Stay in the state.
8085    
8086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8087     $self->{line_prev} = $self->{line};
8088     $self->{column_prev} = $self->{column};
8089     $self->{column}++;
8090     $self->{nc}
8091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8092     } else {
8093     $self->{set_nc}->($self);
8094     }
8095    
8096     redo A;
8097     } elsif ($self->{nc} == 0x0028) { # (
8098     $self->{group_depth}++;
8099     push @{$self->{ct}->{content}}, chr $self->{nc};
8100     ## Stay in the state.
8101    
8102     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8103     $self->{line_prev} = $self->{line};
8104     $self->{column_prev} = $self->{column};
8105     $self->{column}++;
8106     $self->{nc}
8107     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8108     } else {
8109     $self->{set_nc}->($self);
8110     }
8111    
8112     redo A;
8113     } elsif ($self->{nc} == 0x007C or # |
8114     $self->{nc} == 0x002C) { # ,
8115     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8116     ## Stay in the state.
8117    
8118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8119     $self->{line_prev} = $self->{line};
8120     $self->{column_prev} = $self->{column};
8121     $self->{column}++;
8122     $self->{nc}
8123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8124     } else {
8125     $self->{set_nc}->($self);
8126     }
8127    
8128     redo A;
8129     } elsif ($self->{nc} == 0x0029) { # )
8130     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8131     push @{$self->{ct}->{content}}, chr $self->{nc};
8132     $self->{group_depth}--;
8133     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8134    
8135     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8136     $self->{line_prev} = $self->{line};
8137     $self->{column_prev} = $self->{column};
8138     $self->{column}++;
8139     $self->{nc}
8140     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8141     } else {
8142     $self->{set_nc}->($self);
8143     }
8144    
8145     redo A;
8146     } elsif ($self->{nc} == 0x003E) { # >
8147     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8148     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8149     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8150    
8151     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8152     $self->{line_prev} = $self->{line};
8153     $self->{column_prev} = $self->{column};
8154     $self->{column}++;
8155     $self->{nc}
8156     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8157     } else {
8158     $self->{set_nc}->($self);
8159     }
8160    
8161     return ($self->{ct}); # ELEMENT
8162     redo A;
8163     } elsif ($self->{nc} == -1) {
8164     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8165     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8166     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8167    
8168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8169     $self->{line_prev} = $self->{line};
8170     $self->{column_prev} = $self->{column};
8171     $self->{column}++;
8172     $self->{nc}
8173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8174     } else {
8175     $self->{set_nc}->($self);
8176     }
8177    
8178     return ($self->{ct}); # ELEMENT
8179     redo A;
8180     } else {
8181     push @{$self->{ct}->{content}}, chr $self->{nc};
8182     $self->{state} = CM_ELEMENT_NAME_STATE;
8183    
8184     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8185     $self->{line_prev} = $self->{line};
8186     $self->{column_prev} = $self->{column};
8187     $self->{column}++;
8188     $self->{nc}
8189     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8190     } else {
8191     $self->{set_nc}->($self);
8192     }
8193    
8194     redo A;
8195     }
8196     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8197     if ($is_space->{$self->{nc}}) {
8198     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8199    
8200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8201     $self->{line_prev} = $self->{line};
8202     $self->{column_prev} = $self->{column};
8203     $self->{column}++;
8204     $self->{nc}
8205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8206     } else {
8207     $self->{set_nc}->($self);
8208     }
8209    
8210     redo A;
8211     } elsif ($self->{nc} == 0x002A or # *
8212     $self->{nc} == 0x002B or # +
8213     $self->{nc} == 0x003F) { # ?
8214     push @{$self->{ct}->{content}}, chr $self->{nc};
8215     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8216    
8217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8218     $self->{line_prev} = $self->{line};
8219     $self->{column_prev} = $self->{column};
8220     $self->{column}++;
8221     $self->{nc}
8222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8223     } else {
8224     $self->{set_nc}->($self);
8225     }
8226    
8227     redo A;
8228     } elsif ($self->{nc} == 0x007C or # |
8229     $self->{nc} == 0x002C) { # ,
8230     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8231     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8232    
8233     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8234     $self->{line_prev} = $self->{line};
8235     $self->{column_prev} = $self->{column};
8236     $self->{column}++;
8237     $self->{nc}
8238     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8239     } else {
8240     $self->{set_nc}->($self);
8241     }
8242    
8243     redo A;
8244     } elsif ($self->{nc} == 0x0029) { # )
8245     $self->{group_depth}--;
8246     push @{$self->{ct}->{content}}, chr $self->{nc};
8247     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8248    
8249     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8250     $self->{line_prev} = $self->{line};
8251     $self->{column_prev} = $self->{column};
8252     $self->{column}++;
8253     $self->{nc}
8254     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8255     } else {
8256     $self->{set_nc}->($self);
8257     }
8258    
8259     redo A;
8260     } elsif ($self->{nc} == 0x003E) { # >
8261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8262     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8263     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8264    
8265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8266     $self->{line_prev} = $self->{line};
8267     $self->{column_prev} = $self->{column};
8268     $self->{column}++;
8269     $self->{nc}
8270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8271     } else {
8272     $self->{set_nc}->($self);
8273     }
8274    
8275     return ($self->{ct}); # ELEMENT
8276     redo A;
8277     } elsif ($self->{nc} == -1) {
8278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8279     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8280     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8281    
8282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8283     $self->{line_prev} = $self->{line};
8284     $self->{column_prev} = $self->{column};
8285     $self->{column}++;
8286     $self->{nc}
8287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8288     } else {
8289     $self->{set_nc}->($self);
8290     }
8291    
8292     return ($self->{ct}); # ELEMENT
8293     redo A;
8294     } else {
8295     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8296     ## Stay in the state.
8297    
8298     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8299     $self->{line_prev} = $self->{line};
8300     $self->{column_prev} = $self->{column};
8301     $self->{column}++;
8302     $self->{nc}
8303     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8304     } else {
8305     $self->{set_nc}->($self);
8306     }
8307    
8308     redo A;
8309     }
8310     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8311     if ($is_space->{$self->{nc}}) {
8312     ## Stay in the state.
8313    
8314     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8315     $self->{line_prev} = $self->{line};
8316     $self->{column_prev} = $self->{column};
8317     $self->{column}++;
8318     $self->{nc}
8319     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8320     } else {
8321     $self->{set_nc}->($self);
8322     }
8323    
8324     redo A;
8325     } elsif ($self->{nc} == 0x007C or # |
8326     $self->{nc} == 0x002C) { # ,
8327     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8328     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8329    
8330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8331     $self->{line_prev} = $self->{line};
8332     $self->{column_prev} = $self->{column};
8333     $self->{column}++;
8334     $self->{nc}
8335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8336     } else {
8337     $self->{set_nc}->($self);
8338     }
8339    
8340     redo A;
8341     } elsif ($self->{nc} == 0x0029) { # )
8342     $self->{group_depth}--;
8343     push @{$self->{ct}->{content}}, chr $self->{nc};
8344     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8345    
8346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8347     $self->{line_prev} = $self->{line};
8348     $self->{column_prev} = $self->{column};
8349     $self->{column}++;
8350     $self->{nc}
8351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8352     } else {
8353     $self->{set_nc}->($self);
8354     }
8355    
8356     redo A;
8357     } elsif ($self->{nc} == 0x003E) { # >
8358     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8359     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8360     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8361    
8362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8363     $self->{line_prev} = $self->{line};
8364     $self->{column_prev} = $self->{column};
8365     $self->{column}++;
8366     $self->{nc}
8367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8368     } else {
8369     $self->{set_nc}->($self);
8370     }
8371    
8372     return ($self->{ct}); # ELEMENT
8373     redo A;
8374     } elsif ($self->{nc} == -1) {
8375     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8376     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8377     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8378    
8379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8380     $self->{line_prev} = $self->{line};
8381     $self->{column_prev} = $self->{column};
8382     $self->{column}++;
8383     $self->{nc}
8384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8385     } else {
8386     $self->{set_nc}->($self);
8387     }
8388    
8389     return ($self->{ct}); # ELEMENT
8390     redo A;
8391     } else {
8392     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8393     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8394     $self->{state} = BOGUS_MD_STATE;
8395    
8396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8397     $self->{line_prev} = $self->{line};
8398     $self->{column_prev} = $self->{column};
8399     $self->{column}++;
8400     $self->{nc}
8401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8402     } else {
8403     $self->{set_nc}->($self);
8404     }
8405    
8406     redo A;
8407     }
8408     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8409     if ($is_space->{$self->{nc}}) {
8410     if ($self->{group_depth}) {
8411     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8412     } else {
8413     $self->{state} = AFTER_MD_DEF_STATE;
8414     }
8415    
8416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8417     $self->{line_prev} = $self->{line};
8418     $self->{column_prev} = $self->{column};
8419     $self->{column}++;
8420     $self->{nc}
8421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8422     } else {
8423     $self->{set_nc}->($self);
8424     }
8425    
8426     redo A;
8427     } elsif ($self->{nc} == 0x002A or # *
8428     $self->{nc} == 0x002B or # +
8429     $self->{nc} == 0x003F) { # ?
8430     push @{$self->{ct}->{content}}, chr $self->{nc};
8431     if ($self->{group_depth}) {
8432     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8433     } else {
8434     $self->{state} = AFTER_MD_DEF_STATE;
8435     }
8436    
8437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8438     $self->{line_prev} = $self->{line};
8439     $self->{column_prev} = $self->{column};
8440     $self->{column}++;
8441     $self->{nc}
8442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8443     } else {
8444     $self->{set_nc}->($self);
8445     }
8446    
8447     redo A;
8448     } elsif ($self->{nc} == 0x0029) { # )
8449     if ($self->{group_depth}) {
8450     $self->{group_depth}--;
8451     push @{$self->{ct}->{content}}, chr $self->{nc};
8452     ## Stay in the state.
8453    
8454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455     $self->{line_prev} = $self->{line};
8456     $self->{column_prev} = $self->{column};
8457     $self->{column}++;
8458     $self->{nc}
8459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460     } else {
8461     $self->{set_nc}->($self);
8462     }
8463    
8464     redo A;
8465     } else {
8466     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8467     $self->{state} = BOGUS_MD_STATE;
8468     ## Reconsume.
8469     redo A;
8470     }
8471     } elsif ($self->{nc} == 0x003E) { # >
8472     if ($self->{group_depth}) {
8473     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8474     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8475     }
8476     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8477    
8478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8479     $self->{line_prev} = $self->{line};
8480     $self->{column_prev} = $self->{column};
8481     $self->{column}++;
8482     $self->{nc}
8483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8484     } else {
8485     $self->{set_nc}->($self);
8486     }
8487    
8488     return ($self->{ct}); # ELEMENT
8489     redo A;
8490     } elsif ($self->{nc} == -1) {
8491     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8492     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8493     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8494    
8495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8496     $self->{line_prev} = $self->{line};
8497     $self->{column_prev} = $self->{column};
8498     $self->{column}++;
8499     $self->{nc}
8500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8501     } else {
8502     $self->{set_nc}->($self);
8503     }
8504    
8505     return ($self->{ct}); # ELEMENT
8506     redo A;
8507     } else {
8508     if ($self->{group_depth}) {
8509     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8510     } else {
8511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8512     $self->{state} = BOGUS_MD_STATE;
8513     }
8514     ## Reconsume.
8515     redo A;
8516     }
8517     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8518 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8519     ## Stay in the state.
8520    
8521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8522     $self->{line_prev} = $self->{line};
8523     $self->{column_prev} = $self->{column};
8524     $self->{column}++;
8525     $self->{nc}
8526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8527     } else {
8528     $self->{set_nc}->($self);
8529     }
8530    
8531     redo A;
8532     } elsif ($self->{nc} == 0x003E) { # >
8533     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8534    
8535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8536     $self->{line_prev} = $self->{line};
8537     $self->{column_prev} = $self->{column};
8538     $self->{column}++;
8539     $self->{nc}
8540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8541     } else {
8542     $self->{set_nc}->($self);
8543     }
8544    
8545 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8546 wakaba 1.18 redo A;
8547     } elsif ($self->{nc} == -1) {
8548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8550    
8551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8552     $self->{line_prev} = $self->{line};
8553     $self->{column_prev} = $self->{column};
8554     $self->{column}++;
8555     $self->{nc}
8556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8557     } else {
8558     $self->{set_nc}->($self);
8559     }
8560    
8561 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8562 wakaba 1.18 redo A;
8563     } else {
8564 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8565 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8566     ## Reconsume.
8567     redo A;
8568     }
8569 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8570     if ($self->{nc} == 0x003E) { # >
8571     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8572    
8573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8574     $self->{line_prev} = $self->{line};
8575     $self->{column_prev} = $self->{column};
8576     $self->{column}++;
8577     $self->{nc}
8578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8579     } else {
8580     $self->{set_nc}->($self);
8581     }
8582    
8583     return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8584     redo A;
8585     } elsif ($self->{nc} == -1) {
8586     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8587     ## Reconsume.
8588     return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8589     redo A;
8590     } else {
8591     ## Stay in the state.
8592    
8593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8594     $self->{line_prev} = $self->{line};
8595     $self->{column_prev} = $self->{column};
8596     $self->{column}++;
8597     $self->{nc}
8598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8599     } else {
8600     $self->{set_nc}->($self);
8601     }
8602    
8603     redo A;
8604     }
8605 wakaba 1.1 } else {
8606     die "$0: $self->{state}: Unknown state";
8607     }
8608     } # A
8609    
8610     die "$0: _get_next_token: unexpected case";
8611     } # _get_next_token
8612    
8613     1;
8614 wakaba 1.22 ## $Date: 2008/10/19 09:25:21 $
8615 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24