/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.28 - (hide annotations) (download)
Sun Jul 5 04:38:45 2009 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.27: +13 -3 lines
++ whatpm/t/ChangeLog	5 Jul 2009 04:38:11 -0000
2009-07-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Updated the result (c.f. HTML5 revision
	3121).

++ whatpm/Whatpm/HTML/ChangeLog	5 Jul 2009 04:38:33 -0000
2009-07-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Reduced the number of parse errors on broken
	DOCTYPE (HTML5 revision 3121).

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.28 our $VERSION=do{my @r=(q$Revision: 1.27 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Exporter setup.  This runs in a BEGIN block so that the export lists
## are already populated when |Whatpm::HTML::Tokenizer->import (':token')|
## is called from a later BEGIN block in this file.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag imports all fourteen token type constants at once.
## NOTE: This list duplicates |@EXPORT_OK| above; the two lists have to
## be kept in sync by hand when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Each token type is a constant sub (empty prototype) returning a
## distinct integer from 1 to 14.  The tokenizer stores one of these
## values in the |type| field of every token it returns.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
## From here on, definitions go into the |Whatpm::HTML| package rather
## than |Whatpm::HTML::Tokenizer|.
73     package Whatpm::HTML;
74    
## Import the token type constants at compile time so that they are
## available to the code compiled below.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The three |CM_*| constants are single-bit flags.  The four
## |*_CONTENT_MODEL| constants are the combinations of those bits that
## are assigned to |$self->{content_model}|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
## PLAINTEXT: neither "&" nor "<" is markup.  CDATA: limited "<" only.
## RCDATA: "&" and limited "<".  PCDATA: "&" and any "<".
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Each tokenizer state is a constant sub returning a distinct integer.
## |_get_next_token| compares |$self->{state}| against these values
## with |==|.  The values 1 and 12 are unassigned because the states
## that used them are commented out below.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
## |PCDATA_STATE| is not one of the six character reference states above.
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 }
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## NOTE(review): Both literals below evaluate to the same number, 2048
## (a 1 followed by eleven 0 bits); the "_" in the second is only a
## digit separator.  Presumably they are flags in two different bit
## fields (insertion mode vs. element category) -- confirm against
## Whatpm::HTML.
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205    
## |$charref_map| maps a code point to the code point that replaces it.
## Presumably it is consulted when a numeric character reference is
## resolved; the code that reads it is not visible in this part of the
## file.
##
## The literal entries map CR to LF and map 0x80..0x9F to other code
## points (these look like the Windows-1252 interpretations of those
## bytes -- TODO confirm), with 0xFFFD (REPLACEMENT CHARACTER) for
## 0x81, 0x8D, 0x8F, 0x90, and 0x9D.
206     my $charref_map = {
207     0x0D => 0x000A,
208     0x80 => 0x20AC,
209     0x81 => 0xFFFD,
210     0x82 => 0x201A,
211     0x83 => 0x0192,
212     0x84 => 0x201E,
213     0x85 => 0x2026,
214     0x86 => 0x2020,
215     0x87 => 0x2021,
216     0x88 => 0x02C6,
217     0x89 => 0x2030,
218     0x8A => 0x0160,
219     0x8B => 0x2039,
220     0x8C => 0x0152,
221     0x8D => 0xFFFD,
222     0x8E => 0x017D,
223     0x8F => 0xFFFD,
224     0x90 => 0xFFFD,
225     0x91 => 0x2018,
226     0x92 => 0x2019,
227     0x93 => 0x201C,
228     0x94 => 0x201D,
229     0x95 => 0x2022,
230     0x96 => 0x2013,
231     0x97 => 0x2014,
232     0x98 => 0x02DC,
233     0x99 => 0x2122,
234     0x9A => 0x0161,
235     0x9B => 0x203A,
236     0x9C => 0x0153,
237     0x9D => 0xFFFD,
238     0x9E => 0x017E,
239     0x9F => 0x0178,
240     }; # $charref_map
## Additionally map the following code points to 0xFFFD: 0x00..0x08,
## 0x0B, 0x0E..0x1F, 0x7F, 0xD800..0xDFFF, 0xFDD0..0xFDDF, and the last
## two code points (xFFFE and xFFFF) of each plane from 0 to 0x10.
## The range starting at 0xFDD0 stops at 0xFDDF here; the ISSUE note
## below flags 0xFDEF as the open question for its upper bound.
241     $charref_map->{$_} = 0xFFFD
242     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
244     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Resets the tokenizer fields of |$self| to their initial values and
## reads the first input character.
##
## Argument: |$self|, the parser object.  The fields listed in the
## first NOTE below are expected to have been set already; in
## particular |$self->{set_nc}| is called here.
252     sub _initialize_tokenizer ($) {
253     my $self = shift;
254    
255     ## NOTE: Fields set by |new| constructor:
256     #$self->{level}
257     #$self->{set_nc}
258     #$self->{parse_error}
259 wakaba 1.3 #$self->{is_xml} (if XML)
260 wakaba 1.1
261     $self->{state} = DATA_STATE; # MUST
262 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
263     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 wakaba 1.1 #$self->{entity__value}; # initialized when used
265     #$self->{entity__match}; # initialized when used
266     $self->{content_model} = PCDATA_CONTENT_MODEL; # start in the PCDATA content model
267     undef $self->{ct}; # current token
268     undef $self->{ca}; # current attribute
269     undef $self->{last_stag_name}; # last emitted start tag name
270     #$self->{prev_state}; # initialized when used
271     delete $self->{self_closing};
## Start with an empty character buffer.  -1 in |nc| is the value that
## |_get_next_token| treats as end of file.
272     $self->{char_buffer} = '';
273     $self->{char_buffer_pos} = 0;
274     $self->{nc} = -1; # next input character
275     #$self->{next_nc}
276    
## Read the first input character.  This is the same read-next-character
## sequence that is repeated throughout |_get_next_token|.  Because the
## buffer was emptied just above, the condition is false here and
## |$self->{set_nc}| is always the branch taken.
277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
278     $self->{line_prev} = $self->{line};
279     $self->{column_prev} = $self->{column};
280     $self->{column}++;
281     $self->{nc}
282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
283     } else {
284     $self->{set_nc}->($self);
285     }
286    
## Queue of pending tokens; |_get_next_token| returns tokens from the
## front of this array before it tokenizes any further input.
287     $self->{token} = [];
288     # $self->{escape}
289     } # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
311     ## A token's |->{self_closing}| is used to save the value of
312     ## |$self->{self_closing}| while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## Lookup table keyed by code point: true for the characters treated as
## space characters (HT, LF, FF, SP).  The VT and CR entries are
## commented out, so looking them up yields undef (false).  The table is
## used as |$is_space->{$self->{nc}}|.
325     my $is_space = {
326     0x0009 => 1, # CHARACTER TABULATION (HT)
327     0x000A => 1, # LINE FEED (LF)
328     #0x000B => 0, # LINE TABULATION (VT)
329 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
331     0x0020 => 1, # SPACE (SP)
332     };
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not
926     ## explicitly state that the next input character is to be
927     ## reconsumed, it will be included in the |data| of the comment
928     ## token generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
1281 wakaba 1.1 my $before_leave = sub {
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
1287     } else {
1288    
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1744 wakaba 1.1
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819 wakaba 1.25 } elsif ($self->{is_xml} and
1820     $is_space->{$self->{nc}}) {
1821    
1822     $self->{ca}->{value} .= ' ';
1823     ## Stay in the state.
1824    
1825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826     $self->{line_prev} = $self->{line};
1827     $self->{column_prev} = $self->{column};
1828     $self->{column}++;
1829     $self->{nc}
1830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831     } else {
1832     $self->{set_nc}->($self);
1833     }
1834    
1835     redo A;
1836 wakaba 1.1 } elsif ($self->{nc} == -1) {
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839    
1840     $self->{last_stag_name} = $self->{ct}->{tag_name};
1841 wakaba 1.15
1842     $self->{state} = DATA_STATE;
1843     $self->{s_kwd} = '';
1844     ## reconsume
1845     return ($self->{ct}); # start tag
1846     redo A;
1847 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849     if ($self->{ct}->{attributes}) {
1850    
1851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1852     } else {
1853     ## NOTE: This state should never be reached.
1854    
1855     }
1856 wakaba 1.15
1857     $self->{state} = DATA_STATE;
1858     $self->{s_kwd} = '';
1859     ## reconsume
1860     return ($self->{ct}); # end tag
1861     redo A;
1862     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863     ## XML5: No parse error above; not defined yet.
1864     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866     ## Reconsume.
1867     return ($self->{ct}); # ATTLIST
1868     redo A;
1869 wakaba 1.1 } else {
1870     die "$0: $self->{ct}->{type}: Unknown token type";
1871     }
1872     } else {
1873 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1874 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875    
1876     ## XML5: Not a parse error.
1877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878     } else {
1879    
1880     }
1881 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1882     $self->{read_until}->($self->{ca}->{value},
1883 wakaba 1.25 qq["&<\x09\x0C\x20],
1884 wakaba 1.1 length $self->{ca}->{value});
1885    
1886     ## Stay in the state
1887    
1888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1889     $self->{line_prev} = $self->{line};
1890     $self->{column_prev} = $self->{column};
1891     $self->{column}++;
1892     $self->{nc}
1893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1894     } else {
1895     $self->{set_nc}->($self);
1896     }
1897    
1898     redo A;
1899     }
1900     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902     ## ATTLIST attribute value single quoted state".
1903 wakaba 1.11
1904 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1905 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906    
1907     ## XML5: "DOCTYPE ATTLIST name after state".
1908     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910     } else {
1911    
1912     ## XML5: "Before attribute name state" (sic).
1913     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914     }
1915 wakaba 1.1
1916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917     $self->{line_prev} = $self->{line};
1918     $self->{column_prev} = $self->{column};
1919     $self->{column}++;
1920     $self->{nc}
1921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1922     } else {
1923     $self->{set_nc}->($self);
1924     }
1925    
1926     redo A;
1927     } elsif ($self->{nc} == 0x0026) { # &
1928    
1929 wakaba 1.11 ## XML5: Not defined yet.
1930    
1931 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1932     ## "entity in attribute value state". In this implementation, the
1933     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1934     ## implementation of the "consume a character reference" algorithm.
1935     $self->{entity_add} = 0x0027; # '
1936     $self->{prev_state} = $self->{state};
1937     $self->{state} = ENTITY_STATE;
1938    
1939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940     $self->{line_prev} = $self->{line};
1941     $self->{column_prev} = $self->{column};
1942     $self->{column}++;
1943     $self->{nc}
1944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945     } else {
1946     $self->{set_nc}->($self);
1947     }
1948    
1949     redo A;
1950 wakaba 1.25 } elsif ($self->{is_xml} and
1951     $is_space->{$self->{nc}}) {
1952    
1953     $self->{ca}->{value} .= ' ';
1954     ## Stay in the state.
1955    
1956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957     $self->{line_prev} = $self->{line};
1958     $self->{column_prev} = $self->{column};
1959     $self->{column}++;
1960     $self->{nc}
1961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962     } else {
1963     $self->{set_nc}->($self);
1964     }
1965    
1966     redo A;
1967 wakaba 1.1 } elsif ($self->{nc} == -1) {
1968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970    
1971     $self->{last_stag_name} = $self->{ct}->{tag_name};
1972 wakaba 1.15
1973     $self->{state} = DATA_STATE;
1974     $self->{s_kwd} = '';
1975     ## reconsume
1976     return ($self->{ct}); # start tag
1977     redo A;
1978 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980     if ($self->{ct}->{attributes}) {
1981    
1982     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1983     } else {
1984     ## NOTE: This state should never be reached.
1985    
1986     }
1987 wakaba 1.15
1988     $self->{state} = DATA_STATE;
1989     $self->{s_kwd} = '';
1990     ## reconsume
1991     return ($self->{ct}); # end tag
1992     redo A;
1993     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994     ## XML5: No parse error above; not defined yet.
1995     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997     ## Reconsume.
1998     return ($self->{ct}); # ATTLIST
1999     redo A;
2000 wakaba 1.1 } else {
2001     die "$0: $self->{ct}->{type}: Unknown token type";
2002     }
2003     } else {
2004 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2005 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006    
2007     ## XML5: Not a parse error.
2008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009     } else {
2010    
2011     }
2012 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2013     $self->{read_until}->($self->{ca}->{value},
2014 wakaba 1.25 qq['&<\x09\x0C\x20],
2015 wakaba 1.1 length $self->{ca}->{value});
2016    
2017     ## Stay in the state
2018    
2019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020     $self->{line_prev} = $self->{line};
2021     $self->{column_prev} = $self->{column};
2022     $self->{column}++;
2023     $self->{nc}
2024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025     } else {
2026     $self->{set_nc}->($self);
2027     }
2028    
2029     redo A;
2030     }
2031     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2033    
2034 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2035 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036    
2037     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039     } else {
2040    
2041     ## XML5: "Tag attribute name before state".
2042     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043     }
2044 wakaba 1.1
2045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046     $self->{line_prev} = $self->{line};
2047     $self->{column_prev} = $self->{column};
2048     $self->{column}++;
2049     $self->{nc}
2050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051     } else {
2052     $self->{set_nc}->($self);
2053     }
2054    
2055     redo A;
2056     } elsif ($self->{nc} == 0x0026) { # &
2057    
2058 wakaba 1.11
2059     ## XML5: Not defined yet.
2060    
2061 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2062     ## "entity in attribute value state". In this implementation, the
2063     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2064     ## implementation of the "consume a character reference" algorithm.
2065     $self->{entity_add} = -1;
2066     $self->{prev_state} = $self->{state};
2067     $self->{state} = ENTITY_STATE;
2068    
2069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070     $self->{line_prev} = $self->{line};
2071     $self->{column_prev} = $self->{column};
2072     $self->{column}++;
2073     $self->{nc}
2074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075     } else {
2076     $self->{set_nc}->($self);
2077     }
2078    
2079     redo A;
2080     } elsif ($self->{nc} == 0x003E) { # >
2081     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082    
2083     $self->{last_stag_name} = $self->{ct}->{tag_name};
2084 wakaba 1.15
2085     $self->{state} = DATA_STATE;
2086     $self->{s_kwd} = '';
2087    
2088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089     $self->{line_prev} = $self->{line};
2090     $self->{column_prev} = $self->{column};
2091     $self->{column}++;
2092     $self->{nc}
2093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094     } else {
2095     $self->{set_nc}->($self);
2096     }
2097    
2098     return ($self->{ct}); # start tag
2099     redo A;
2100 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102     if ($self->{ct}->{attributes}) {
2103    
2104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2105     } else {
2106     ## NOTE: This state should never be reached.
2107    
2108     }
2109 wakaba 1.15
2110     $self->{state} = DATA_STATE;
2111     $self->{s_kwd} = '';
2112    
2113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114     $self->{line_prev} = $self->{line};
2115     $self->{column_prev} = $self->{column};
2116     $self->{column}++;
2117     $self->{nc}
2118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2119     } else {
2120     $self->{set_nc}->($self);
2121     }
2122    
2123     return ($self->{ct}); # end tag
2124     redo A;
2125     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128    
2129 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130     $self->{line_prev} = $self->{line};
2131     $self->{column_prev} = $self->{column};
2132     $self->{column}++;
2133     $self->{nc}
2134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135     } else {
2136     $self->{set_nc}->($self);
2137     }
2138    
2139 wakaba 1.15 return ($self->{ct}); # ATTLIST
2140     redo A;
2141     } else {
2142     die "$0: $self->{ct}->{type}: Unknown token type";
2143     }
2144 wakaba 1.1 } elsif ($self->{nc} == -1) {
2145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146    
2147 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2149 wakaba 1.15
2150     $self->{state} = DATA_STATE;
2151     $self->{s_kwd} = '';
2152     ## reconsume
2153     return ($self->{ct}); # start tag
2154     redo A;
2155 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158     if ($self->{ct}->{attributes}) {
2159    
2160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2161     } else {
2162     ## NOTE: This state should never be reached.
2163    
2164     }
2165 wakaba 1.15
2166     $self->{state} = DATA_STATE;
2167     $self->{s_kwd} = '';
2168     ## reconsume
2169     return ($self->{ct}); # end tag
2170     redo A;
2171     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175     ## Reconsume.
2176     return ($self->{ct}); # ATTLIST
2177     redo A;
2178 wakaba 1.1 } else {
2179     die "$0: $self->{ct}->{type}: Unknown token type";
2180     }
2181     } else {
2182     if ({
2183     0x0022 => 1, # "
2184     0x0027 => 1, # '
2185     0x003D => 1, # =
2186 wakaba 1.26 0x003C => 1, # <
2187 wakaba 1.1 }->{$self->{nc}}) {
2188    
2189 wakaba 1.11 ## XML5: Not a parse error.
2190 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2191     } else {
2192    
2193     }
2194     $self->{ca}->{value} .= chr ($self->{nc});
2195     $self->{read_until}->($self->{ca}->{value},
2196 wakaba 1.25 qq["'=& \x09\x0C>],
2197 wakaba 1.1 length $self->{ca}->{value});
2198    
2199     ## Stay in the state
2200    
2201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2202     $self->{line_prev} = $self->{line};
2203     $self->{column_prev} = $self->{column};
2204     $self->{column}++;
2205     $self->{nc}
2206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2207     } else {
2208     $self->{set_nc}->($self);
2209     }
2210    
2211     redo A;
2212     }
2213     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2214     if ($is_space->{$self->{nc}}) {
2215    
2216     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2217    
2218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2219     $self->{line_prev} = $self->{line};
2220     $self->{column_prev} = $self->{column};
2221     $self->{column}++;
2222     $self->{nc}
2223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2224     } else {
2225     $self->{set_nc}->($self);
2226     }
2227    
2228     redo A;
2229     } elsif ($self->{nc} == 0x003E) { # >
2230     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2231    
2232     $self->{last_stag_name} = $self->{ct}->{tag_name};
2233     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2234     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2235     if ($self->{ct}->{attributes}) {
2236    
2237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2238     } else {
2239     ## NOTE: This state should never be reached.
2240    
2241     }
2242     } else {
2243     die "$0: $self->{ct}->{type}: Unknown token type";
2244     }
2245     $self->{state} = DATA_STATE;
2246 wakaba 1.5 $self->{s_kwd} = '';
2247 wakaba 1.1
2248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249     $self->{line_prev} = $self->{line};
2250     $self->{column_prev} = $self->{column};
2251     $self->{column}++;
2252     $self->{nc}
2253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254     } else {
2255     $self->{set_nc}->($self);
2256     }
2257    
2258    
2259     return ($self->{ct}); # start tag or end tag
2260    
2261     redo A;
2262     } elsif ($self->{nc} == 0x002F) { # /
2263    
2264     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2265    
2266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2267     $self->{line_prev} = $self->{line};
2268     $self->{column_prev} = $self->{column};
2269     $self->{column}++;
2270     $self->{nc}
2271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2272     } else {
2273     $self->{set_nc}->($self);
2274     }
2275    
2276     redo A;
2277     } elsif ($self->{nc} == -1) {
2278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2279     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2280    
2281     $self->{last_stag_name} = $self->{ct}->{tag_name};
2282     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2283     if ($self->{ct}->{attributes}) {
2284    
2285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2286     } else {
2287     ## NOTE: This state should never be reached.
2288    
2289     }
2290     } else {
2291     die "$0: $self->{ct}->{type}: Unknown token type";
2292     }
2293     $self->{state} = DATA_STATE;
2294 wakaba 1.5 $self->{s_kwd} = '';
2295 wakaba 1.1 ## Reconsume.
2296     return ($self->{ct}); # start tag or end tag
2297     redo A;
2298     } else {
2299    
2300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2301     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2302     ## reconsume
2303     redo A;
2304     }
2305     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2306 wakaba 1.11 ## XML5: "Empty tag state".
2307    
2308 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2309     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2310    
2311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2312     ## TODO: Different type than slash in start tag
2313     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318    
2319     }
2320     ## TODO: Test |<title></title/>|
2321     } else {
2322    
2323     $self->{self_closing} = 1;
2324     }
2325    
2326     $self->{state} = DATA_STATE;
2327 wakaba 1.5 $self->{s_kwd} = '';
2328 wakaba 1.1
2329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2330     $self->{line_prev} = $self->{line};
2331     $self->{column_prev} = $self->{column};
2332     $self->{column}++;
2333     $self->{nc}
2334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2335     } else {
2336     $self->{set_nc}->($self);
2337     }
2338    
2339    
2340     return ($self->{ct}); # start tag or end tag
2341    
2342     redo A;
2343     } elsif ($self->{nc} == -1) {
2344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2345     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2346    
2347     $self->{last_stag_name} = $self->{ct}->{tag_name};
2348     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2349     if ($self->{ct}->{attributes}) {
2350    
2351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2352     } else {
2353     ## NOTE: This state should never be reached.
2354    
2355     }
2356     } else {
2357     die "$0: $self->{ct}->{type}: Unknown token type";
2358     }
2359 wakaba 1.11 ## XML5: "Tag attribute name before state".
2360 wakaba 1.1 $self->{state} = DATA_STATE;
2361 wakaba 1.5 $self->{s_kwd} = '';
2362 wakaba 1.1 ## Reconsume.
2363     return ($self->{ct}); # start tag or end tag
2364     redo A;
2365     } else {
2366    
2367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2368     ## TODO: This error type is wrong.
2369     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2370     ## Reconsume.
2371     redo A;
2372     }
2373     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2374 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2375    
2376 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2377     ## consumes characters on a one-by-one basis.
2378    
2379     if ($self->{nc} == 0x003E) { # >
2380 wakaba 1.13 if ($self->{in_subset}) {
2381    
2382     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2383     } else {
2384    
2385     $self->{state} = DATA_STATE;
2386     $self->{s_kwd} = '';
2387     }
2388 wakaba 1.1
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399    
2400     return ($self->{ct}); # comment
2401     redo A;
2402     } elsif ($self->{nc} == -1) {
2403 wakaba 1.13 if ($self->{in_subset}) {
2404    
2405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2406     } else {
2407    
2408     $self->{state} = DATA_STATE;
2409     $self->{s_kwd} = '';
2410     }
2411 wakaba 1.1 ## reconsume
2412    
2413     return ($self->{ct}); # comment
2414     redo A;
2415     } else {
2416    
2417     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2418     $self->{read_until}->($self->{ct}->{data},
2419     q[>],
2420     length $self->{ct}->{data});
2421    
2422     ## Stay in the state.
2423    
2424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2425     $self->{line_prev} = $self->{line};
2426     $self->{column_prev} = $self->{column};
2427     $self->{column}++;
2428     $self->{nc}
2429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2430     } else {
2431     $self->{set_nc}->($self);
2432     }
2433    
2434     redo A;
2435     }
2436     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2437 wakaba 1.14 ## XML5: "Markup declaration state".
2438 wakaba 1.1
2439     if ($self->{nc} == 0x002D) { # -
2440    
2441     $self->{state} = MD_HYPHEN_STATE;
2442    
2443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2444     $self->{line_prev} = $self->{line};
2445     $self->{column_prev} = $self->{column};
2446     $self->{column}++;
2447     $self->{nc}
2448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2449     } else {
2450     $self->{set_nc}->($self);
2451     }
2452    
2453     redo A;
2454     } elsif ($self->{nc} == 0x0044 or # D
2455     $self->{nc} == 0x0064) { # d
2456     ## ASCII case-insensitive.
2457    
2458     $self->{state} = MD_DOCTYPE_STATE;
2459 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2460 wakaba 1.1
2461     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2462     $self->{line_prev} = $self->{line};
2463     $self->{column_prev} = $self->{column};
2464     $self->{column}++;
2465     $self->{nc}
2466     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2467     } else {
2468     $self->{set_nc}->($self);
2469     }
2470    
2471     redo A;
2472 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2473     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2474     $self->{is_xml}) and
2475 wakaba 1.1 $self->{nc} == 0x005B) { # [
2476    
2477     $self->{state} = MD_CDATA_STATE;
2478 wakaba 1.12 $self->{kwd} = '[';
2479 wakaba 1.1
2480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481     $self->{line_prev} = $self->{line};
2482     $self->{column_prev} = $self->{column};
2483     $self->{column}++;
2484     $self->{nc}
2485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2486     } else {
2487     $self->{set_nc}->($self);
2488     }
2489    
2490     redo A;
2491     } else {
2492    
2493     }
2494    
2495     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2496     line => $self->{line_prev},
2497     column => $self->{column_prev} - 1);
2498     ## Reconsume.
2499     $self->{state} = BOGUS_COMMENT_STATE;
2500     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2501     line => $self->{line_prev},
2502     column => $self->{column_prev} - 1,
2503     };
2504     redo A;
2505     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2506     if ($self->{nc} == 0x002D) { # -
2507    
2508     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2509     line => $self->{line_prev},
2510     column => $self->{column_prev} - 2,
2511     };
2512 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2513 wakaba 1.1
2514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515     $self->{line_prev} = $self->{line};
2516     $self->{column_prev} = $self->{column};
2517     $self->{column}++;
2518     $self->{nc}
2519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520     } else {
2521     $self->{set_nc}->($self);
2522     }
2523    
2524     redo A;
2525     } else {
2526    
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2528     line => $self->{line_prev},
2529     column => $self->{column_prev} - 2);
2530     $self->{state} = BOGUS_COMMENT_STATE;
2531     ## Reconsume.
2532     $self->{ct} = {type => COMMENT_TOKEN,
2533     data => '-',
2534     line => $self->{line_prev},
2535     column => $self->{column_prev} - 2,
2536     };
2537     redo A;
2538     }
2539     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2540     ## ASCII case-insensitive.
2541     if ($self->{nc} == [
2542     undef,
2543     0x004F, # O
2544     0x0043, # C
2545     0x0054, # T
2546     0x0059, # Y
2547     0x0050, # P
2548 wakaba 1.12 ]->[length $self->{kwd}] or
2549 wakaba 1.1 $self->{nc} == [
2550     undef,
2551     0x006F, # o
2552     0x0063, # c
2553     0x0074, # t
2554     0x0079, # y
2555     0x0070, # p
2556 wakaba 1.12 ]->[length $self->{kwd}]) {
2557 wakaba 1.1
2558     ## Stay in the state.
2559 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2560 wakaba 1.1
2561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562     $self->{line_prev} = $self->{line};
2563     $self->{column_prev} = $self->{column};
2564     $self->{column}++;
2565     $self->{nc}
2566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2567     } else {
2568     $self->{set_nc}->($self);
2569     }
2570    
2571     redo A;
2572 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2573 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2574     $self->{nc} == 0x0065)) { # e
2575 wakaba 1.12 if ($self->{is_xml} and
2576     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2577 wakaba 1.10
2578     ## XML5: case-sensitive.
2579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2580     text => 'DOCTYPE',
2581     line => $self->{line_prev},
2582     column => $self->{column_prev} - 5);
2583     } else {
2584    
2585     }
2586 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2587     $self->{ct} = {type => DOCTYPE_TOKEN,
2588     quirks => 1,
2589     line => $self->{line_prev},
2590     column => $self->{column_prev} - 7,
2591     };
2592    
2593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2594     $self->{line_prev} = $self->{line};
2595     $self->{column_prev} = $self->{column};
2596     $self->{column}++;
2597     $self->{nc}
2598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2599     } else {
2600     $self->{set_nc}->($self);
2601     }
2602    
2603     redo A;
2604     } else {
2605    
2606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2607     line => $self->{line_prev},
2608 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2609 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2610     ## Reconsume.
2611     $self->{ct} = {type => COMMENT_TOKEN,
2612 wakaba 1.12 data => $self->{kwd},
2613 wakaba 1.1 line => $self->{line_prev},
2614 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2615 wakaba 1.1 };
2616     redo A;
2617     }
2618     } elsif ($self->{state} == MD_CDATA_STATE) {
2619     if ($self->{nc} == {
2620     '[' => 0x0043, # C
2621     '[C' => 0x0044, # D
2622     '[CD' => 0x0041, # A
2623     '[CDA' => 0x0054, # T
2624     '[CDAT' => 0x0041, # A
2625 wakaba 1.12 }->{$self->{kwd}}) {
2626 wakaba 1.1
2627     ## Stay in the state.
2628 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2629 wakaba 1.1
2630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2631     $self->{line_prev} = $self->{line};
2632     $self->{column_prev} = $self->{column};
2633     $self->{column}++;
2634     $self->{nc}
2635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2636     } else {
2637     $self->{set_nc}->($self);
2638     }
2639    
2640     redo A;
2641 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2642 wakaba 1.1 $self->{nc} == 0x005B) { # [
2643 wakaba 1.6 if ($self->{is_xml} and
2644     not $self->{tainted} and
2645     @{$self->{open_elements} or []} == 0) {
2646 wakaba 1.8
2647 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2648     line => $self->{line_prev},
2649     column => $self->{column_prev} - 7);
2650     $self->{tainted} = 1;
2651 wakaba 1.8 } else {
2652    
2653 wakaba 1.6 }
2654    
2655 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2656     data => '',
2657     line => $self->{line_prev},
2658     column => $self->{column_prev} - 7};
2659     $self->{state} = CDATA_SECTION_STATE;
2660    
2661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662     $self->{line_prev} = $self->{line};
2663     $self->{column_prev} = $self->{column};
2664     $self->{column}++;
2665     $self->{nc}
2666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667     } else {
2668     $self->{set_nc}->($self);
2669     }
2670    
2671     redo A;
2672     } else {
2673    
2674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2675     line => $self->{line_prev},
2676 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2677 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2678     ## Reconsume.
2679     $self->{ct} = {type => COMMENT_TOKEN,
2680 wakaba 1.12 data => $self->{kwd},
2681 wakaba 1.1 line => $self->{line_prev},
2682 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2683 wakaba 1.1 };
2684     redo A;
2685     }
2686     } elsif ($self->{state} == COMMENT_START_STATE) {
2687     if ($self->{nc} == 0x002D) { # -
2688    
2689     $self->{state} = COMMENT_START_DASH_STATE;
2690    
2691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2692     $self->{line_prev} = $self->{line};
2693     $self->{column_prev} = $self->{column};
2694     $self->{column}++;
2695     $self->{nc}
2696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2697     } else {
2698     $self->{set_nc}->($self);
2699     }
2700    
2701     redo A;
2702     } elsif ($self->{nc} == 0x003E) { # >
2703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2704 wakaba 1.13 if ($self->{in_subset}) {
2705    
2706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707     } else {
2708    
2709     $self->{state} = DATA_STATE;
2710     $self->{s_kwd} = '';
2711     }
2712 wakaba 1.1
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723    
2724     return ($self->{ct}); # comment
2725    
2726     redo A;
2727     } elsif ($self->{nc} == -1) {
2728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729 wakaba 1.13 if ($self->{in_subset}) {
2730    
2731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732     } else {
2733    
2734     $self->{state} = DATA_STATE;
2735     $self->{s_kwd} = '';
2736     }
2737 wakaba 1.1 ## reconsume
2738    
2739     return ($self->{ct}); # comment
2740    
2741     redo A;
2742     } else {
2743    
2744     $self->{ct}->{data} # comment
2745     .= chr ($self->{nc});
2746     $self->{state} = COMMENT_STATE;
2747    
2748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2749     $self->{line_prev} = $self->{line};
2750     $self->{column_prev} = $self->{column};
2751     $self->{column}++;
2752     $self->{nc}
2753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2754     } else {
2755     $self->{set_nc}->($self);
2756     }
2757    
2758     redo A;
2759     }
2760     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2761     if ($self->{nc} == 0x002D) { # -
2762    
2763     $self->{state} = COMMENT_END_STATE;
2764    
2765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2766     $self->{line_prev} = $self->{line};
2767     $self->{column_prev} = $self->{column};
2768     $self->{column}++;
2769     $self->{nc}
2770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2771     } else {
2772     $self->{set_nc}->($self);
2773     }
2774    
2775     redo A;
2776     } elsif ($self->{nc} == 0x003E) { # >
2777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2778 wakaba 1.13 if ($self->{in_subset}) {
2779    
2780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781     } else {
2782    
2783     $self->{state} = DATA_STATE;
2784     $self->{s_kwd} = '';
2785     }
2786 wakaba 1.1
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797    
2798     return ($self->{ct}); # comment
2799    
2800     redo A;
2801     } elsif ($self->{nc} == -1) {
2802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803 wakaba 1.13 if ($self->{in_subset}) {
2804    
2805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806     } else {
2807    
2808     $self->{state} = DATA_STATE;
2809     $self->{s_kwd} = '';
2810     }
2811 wakaba 1.1 ## reconsume
2812    
2813     return ($self->{ct}); # comment
2814    
2815     redo A;
2816     } else {
2817    
2818     $self->{ct}->{data} # comment
2819     .= '-' . chr ($self->{nc});
2820     $self->{state} = COMMENT_STATE;
2821    
2822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2823     $self->{line_prev} = $self->{line};
2824     $self->{column_prev} = $self->{column};
2825     $self->{column}++;
2826     $self->{nc}
2827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2828     } else {
2829     $self->{set_nc}->($self);
2830     }
2831    
2832     redo A;
2833     }
2834     } elsif ($self->{state} == COMMENT_STATE) {
2835 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2836    
2837 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2838    
2839     $self->{state} = COMMENT_END_DASH_STATE;
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     } elsif ($self->{nc} == -1) {
2853     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2854 wakaba 1.13 if ($self->{in_subset}) {
2855    
2856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2857     } else {
2858    
2859     $self->{state} = DATA_STATE;
2860     $self->{s_kwd} = '';
2861     }
2862 wakaba 1.1 ## reconsume
2863    
2864     return ($self->{ct}); # comment
2865    
2866     redo A;
2867     } else {
2868    
2869     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2870     $self->{read_until}->($self->{ct}->{data},
2871     q[-],
2872     length $self->{ct}->{data});
2873    
2874     ## Stay in the state
2875    
2876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877     $self->{line_prev} = $self->{line};
2878     $self->{column_prev} = $self->{column};
2879     $self->{column}++;
2880     $self->{nc}
2881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882     } else {
2883     $self->{set_nc}->($self);
2884     }
2885    
2886     redo A;
2887     }
2888     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2889 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2890 wakaba 1.10
2891 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2892    
2893     $self->{state} = COMMENT_END_STATE;
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } elsif ($self->{nc} == -1) {
2907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2908 wakaba 1.13 if ($self->{in_subset}) {
2909    
2910     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2911     } else {
2912    
2913     $self->{state} = DATA_STATE;
2914     $self->{s_kwd} = '';
2915     }
2916 wakaba 1.1 ## reconsume
2917    
2918     return ($self->{ct}); # comment
2919    
2920     redo A;
2921     } else {
2922    
2923     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2924     $self->{state} = COMMENT_STATE;
2925    
2926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2927     $self->{line_prev} = $self->{line};
2928     $self->{column_prev} = $self->{column};
2929     $self->{column}++;
2930     $self->{nc}
2931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2932     } else {
2933     $self->{set_nc}->($self);
2934     }
2935    
2936     redo A;
2937     }
2938     } elsif ($self->{state} == COMMENT_END_STATE) {
2939 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2940    
2941 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2942 wakaba 1.13 if ($self->{in_subset}) {
2943    
2944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945     } else {
2946    
2947     $self->{state} = DATA_STATE;
2948     $self->{s_kwd} = '';
2949     }
2950 wakaba 1.1
2951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952     $self->{line_prev} = $self->{line};
2953     $self->{column_prev} = $self->{column};
2954     $self->{column}++;
2955     $self->{nc}
2956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2957     } else {
2958     $self->{set_nc}->($self);
2959     }
2960    
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } elsif ($self->{nc} == 0x002D) { # -
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '-'; # comment
2972     ## Stay in the state
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     } elsif ($self->{nc} == -1) {
2986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2987 wakaba 1.13 if ($self->{in_subset}) {
2988    
2989     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2990     } else {
2991    
2992     $self->{state} = DATA_STATE;
2993     $self->{s_kwd} = '';
2994     }
2995 wakaba 1.1 ## reconsume
2996    
2997     return ($self->{ct}); # comment
2998    
2999     redo A;
3000     } else {
3001    
3002     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3003     $self->{state} = COMMENT_STATE;
3004    
3005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3006     $self->{line_prev} = $self->{line};
3007     $self->{column_prev} = $self->{column};
3008     $self->{column}++;
3009     $self->{nc}
3010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3011     } else {
3012     $self->{set_nc}->($self);
3013     }
3014    
3015     redo A;
3016     }
3017     } elsif ($self->{state} == DOCTYPE_STATE) {
3018     if ($is_space->{$self->{nc}}) {
3019    
3020     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3021    
3022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3023     $self->{line_prev} = $self->{line};
3024     $self->{column_prev} = $self->{column};
3025     $self->{column}++;
3026     $self->{nc}
3027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3028     } else {
3029     $self->{set_nc}->($self);
3030     }
3031    
3032     redo A;
3033 wakaba 1.28 } elsif ($self->{nc} == -1) {
3034    
3035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3036     $self->{ct}->{quirks} = 1;
3037    
3038     $self->{state} = DATA_STATE;
3039     ## Reconsume.
3040     return ($self->{ct}); # DOCTYPE (quirks)
3041    
3042     redo A;
3043 wakaba 1.1 } else {
3044    
3045 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3046 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3047     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3048     ## reconsume
3049     redo A;
3050     }
3051     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3052 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3053    
3054 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3055    
3056     ## Stay in the state
3057    
3058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3059     $self->{line_prev} = $self->{line};
3060     $self->{column_prev} = $self->{column};
3061     $self->{column}++;
3062     $self->{nc}
3063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3064     } else {
3065     $self->{set_nc}->($self);
3066     }
3067    
3068     redo A;
3069     } elsif ($self->{nc} == 0x003E) { # >
3070    
3071 wakaba 1.12 ## XML5: No parse error.
3072 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3073     $self->{state} = DATA_STATE;
3074 wakaba 1.5 $self->{s_kwd} = '';
3075 wakaba 1.1
3076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3077     $self->{line_prev} = $self->{line};
3078     $self->{column_prev} = $self->{column};
3079     $self->{column}++;
3080     $self->{nc}
3081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3082     } else {
3083     $self->{set_nc}->($self);
3084     }
3085    
3086    
3087     return ($self->{ct}); # DOCTYPE (quirks)
3088    
3089     redo A;
3090     } elsif ($self->{nc} == -1) {
3091    
3092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3093     $self->{state} = DATA_STATE;
3094 wakaba 1.5 $self->{s_kwd} = '';
3095 wakaba 1.1 ## reconsume
3096    
3097     return ($self->{ct}); # DOCTYPE (quirks)
3098    
3099     redo A;
3100 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3101    
3102     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3103     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3104 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3105     $self->{in_subset} = 1;
3106 wakaba 1.12
3107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3108     $self->{line_prev} = $self->{line};
3109     $self->{column_prev} = $self->{column};
3110     $self->{column}++;
3111     $self->{nc}
3112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3113     } else {
3114     $self->{set_nc}->($self);
3115     }
3116    
3117 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3118 wakaba 1.12 redo A;
3119 wakaba 1.1 } else {
3120    
3121     $self->{ct}->{name} = chr $self->{nc};
3122     delete $self->{ct}->{quirks};
3123     $self->{state} = DOCTYPE_NAME_STATE;
3124    
3125     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3126     $self->{line_prev} = $self->{line};
3127     $self->{column_prev} = $self->{column};
3128     $self->{column}++;
3129     $self->{nc}
3130     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3131     } else {
3132     $self->{set_nc}->($self);
3133     }
3134    
3135     redo A;
3136     }
3137     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3138 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3139    
3140     ## ISSUE: Redundant "First," in the spec.
3141    
3142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3143    
3144     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3145    
3146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3147     $self->{line_prev} = $self->{line};
3148     $self->{column_prev} = $self->{column};
3149     $self->{column}++;
3150     $self->{nc}
3151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3152     } else {
3153     $self->{set_nc}->($self);
3154     }
3155    
3156     redo A;
3157     } elsif ($self->{nc} == 0x003E) { # >
3158    
3159     $self->{state} = DATA_STATE;
3160 wakaba 1.5 $self->{s_kwd} = '';
3161 wakaba 1.1
3162     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3163     $self->{line_prev} = $self->{line};
3164     $self->{column_prev} = $self->{column};
3165     $self->{column}++;
3166     $self->{nc}
3167     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3168     } else {
3169     $self->{set_nc}->($self);
3170     }
3171    
3172    
3173     return ($self->{ct}); # DOCTYPE
3174    
3175     redo A;
3176     } elsif ($self->{nc} == -1) {
3177    
3178     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3179     $self->{state} = DATA_STATE;
3180 wakaba 1.5 $self->{s_kwd} = '';
3181 wakaba 1.1 ## reconsume
3182    
3183     $self->{ct}->{quirks} = 1;
3184     return ($self->{ct}); # DOCTYPE
3185    
3186     redo A;
3187 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3188    
3189     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3190 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3191     $self->{in_subset} = 1;
3192 wakaba 1.12
3193     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3194     $self->{line_prev} = $self->{line};
3195     $self->{column_prev} = $self->{column};
3196     $self->{column}++;
3197     $self->{nc}
3198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3199     } else {
3200     $self->{set_nc}->($self);
3201     }
3202    
3203 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3204 wakaba 1.12 redo A;
3205 wakaba 1.1 } else {
3206    
3207     $self->{ct}->{name}
3208     .= chr ($self->{nc}); # DOCTYPE
3209     ## Stay in the state
3210    
3211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3212     $self->{line_prev} = $self->{line};
3213     $self->{column_prev} = $self->{column};
3214     $self->{column}++;
3215     $self->{nc}
3216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3217     } else {
3218     $self->{set_nc}->($self);
3219     }
3220    
3221     redo A;
3222     }
3223     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3224 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3225     ## state", but implemented differently.
3226    
3227 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3228    
3229     ## Stay in the state
3230    
3231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3232     $self->{line_prev} = $self->{line};
3233     $self->{column_prev} = $self->{column};
3234     $self->{column}++;
3235     $self->{nc}
3236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3237     } else {
3238     $self->{set_nc}->($self);
3239     }
3240    
3241     redo A;
3242     } elsif ($self->{nc} == 0x003E) { # >
3243 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3244    
3245     $self->{state} = DATA_STATE;
3246     $self->{s_kwd} = '';
3247     } else {
3248    
3249     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3250     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3251     }
3252 wakaba 1.1
3253    
3254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3255     $self->{line_prev} = $self->{line};
3256     $self->{column_prev} = $self->{column};
3257     $self->{column}++;
3258     $self->{nc}
3259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3260     } else {
3261     $self->{set_nc}->($self);
3262     }
3263    
3264 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3265 wakaba 1.1 redo A;
3266     } elsif ($self->{nc} == -1) {
3267 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3268    
3269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3270     $self->{state} = DATA_STATE;
3271     $self->{s_kwd} = '';
3272     $self->{ct}->{quirks} = 1;
3273     } else {
3274    
3275     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3276     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3277     }
3278 wakaba 1.1
3279 wakaba 1.16 ## Reconsume.
3280     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3281 wakaba 1.1 redo A;
3282     } elsif ($self->{nc} == 0x0050 or # P
3283     $self->{nc} == 0x0070) { # p
3284 wakaba 1.12
3285 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3286 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3287 wakaba 1.1
3288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3289     $self->{line_prev} = $self->{line};
3290     $self->{column_prev} = $self->{column};
3291     $self->{column}++;
3292     $self->{nc}
3293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3294     } else {
3295     $self->{set_nc}->($self);
3296     }
3297    
3298     redo A;
3299     } elsif ($self->{nc} == 0x0053 or # S
3300     $self->{nc} == 0x0073) { # s
3301 wakaba 1.12
3302 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3303 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3304    
3305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3306     $self->{line_prev} = $self->{line};
3307     $self->{column_prev} = $self->{column};
3308     $self->{column}++;
3309     $self->{nc}
3310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3311     } else {
3312     $self->{set_nc}->($self);
3313     }
3314    
3315     redo A;
3316 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3317     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3318     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3319    
3320     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3321     $self->{ct}->{value} = ''; # ENTITY
3322    
3323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3324     $self->{line_prev} = $self->{line};
3325     $self->{column_prev} = $self->{column};
3326     $self->{column}++;
3327     $self->{nc}
3328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3329     } else {
3330     $self->{set_nc}->($self);
3331     }
3332    
3333     redo A;
3334     } elsif ($self->{nc} == 0x0027 and # '
3335     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3336     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3337    
3338     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3339     $self->{ct}->{value} = ''; # ENTITY
3340    
3341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3342     $self->{line_prev} = $self->{line};
3343     $self->{column_prev} = $self->{column};
3344     $self->{column}++;
3345     $self->{nc}
3346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3347     } else {
3348     $self->{set_nc}->($self);
3349     }
3350    
3351     redo A;
3352 wakaba 1.16 } elsif ($self->{is_xml} and
3353     $self->{ct}->{type} == DOCTYPE_TOKEN and
3354     $self->{nc} == 0x005B) { # [
3355 wakaba 1.12
3356     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3357     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3358 wakaba 1.13 $self->{in_subset} = 1;
3359 wakaba 1.1
3360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3361     $self->{line_prev} = $self->{line};
3362     $self->{column_prev} = $self->{column};
3363     $self->{column}++;
3364     $self->{nc}
3365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3366     } else {
3367     $self->{set_nc}->($self);
3368     }
3369    
3370 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3371 wakaba 1.1 redo A;
3372     } else {
3373 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3374    
3375     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3376    
3377     $self->{ct}->{quirks} = 1;
3378     $self->{state} = BOGUS_DOCTYPE_STATE;
3379     } else {
3380    
3381     $self->{state} = BOGUS_MD_STATE;
3382     }
3383 wakaba 1.1
3384    
3385     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3386     $self->{line_prev} = $self->{line};
3387     $self->{column_prev} = $self->{column};
3388     $self->{column}++;
3389     $self->{nc}
3390     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3391     } else {
3392     $self->{set_nc}->($self);
3393     }
3394    
3395     redo A;
3396     }
3397     } elsif ($self->{state} == PUBLIC_STATE) {
3398     ## ASCII case-insensitive
3399     if ($self->{nc} == [
3400     undef,
3401     0x0055, # U
3402     0x0042, # B
3403     0x004C, # L
3404     0x0049, # I
3405 wakaba 1.12 ]->[length $self->{kwd}] or
3406 wakaba 1.1 $self->{nc} == [
3407     undef,
3408     0x0075, # u
3409     0x0062, # b
3410     0x006C, # l
3411     0x0069, # i
3412 wakaba 1.12 ]->[length $self->{kwd}]) {
3413 wakaba 1.1
3414     ## Stay in the state.
3415 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3416 wakaba 1.1
3417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3418     $self->{line_prev} = $self->{line};
3419     $self->{column_prev} = $self->{column};
3420     $self->{column}++;
3421     $self->{nc}
3422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3423     } else {
3424     $self->{set_nc}->($self);
3425     }
3426    
3427     redo A;
3428 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3429 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3430     $self->{nc} == 0x0063)) { # c
3431 wakaba 1.12 if ($self->{is_xml} and
3432     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3433    
3434     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3435     text => 'PUBLIC',
3436     line => $self->{line_prev},
3437     column => $self->{column_prev} - 4);
3438     } else {
3439    
3440     }
3441 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3442    
3443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3444     $self->{line_prev} = $self->{line};
3445     $self->{column_prev} = $self->{column};
3446     $self->{column}++;
3447     $self->{nc}
3448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3449     } else {
3450     $self->{set_nc}->($self);
3451     }
3452    
3453     redo A;
3454     } else {
3455 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3456 wakaba 1.1 line => $self->{line_prev},
3457 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3458 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3459    
3460     $self->{ct}->{quirks} = 1;
3461     $self->{state} = BOGUS_DOCTYPE_STATE;
3462     } else {
3463    
3464     $self->{state} = BOGUS_MD_STATE;
3465     }
3466 wakaba 1.1 ## Reconsume.
3467     redo A;
3468     }
3469     } elsif ($self->{state} == SYSTEM_STATE) {
3470     ## ASCII case-insensitive
3471     if ($self->{nc} == [
3472     undef,
3473     0x0059, # Y
3474     0x0053, # S
3475     0x0054, # T
3476     0x0045, # E
3477 wakaba 1.12 ]->[length $self->{kwd}] or
3478 wakaba 1.1 $self->{nc} == [
3479     undef,
3480     0x0079, # y
3481     0x0073, # s
3482     0x0074, # t
3483     0x0065, # e
3484 wakaba 1.12 ]->[length $self->{kwd}]) {
3485 wakaba 1.1
3486     ## Stay in the state.
3487 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3488 wakaba 1.1
3489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490     $self->{line_prev} = $self->{line};
3491     $self->{column_prev} = $self->{column};
3492     $self->{column}++;
3493     $self->{nc}
3494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495     } else {
3496     $self->{set_nc}->($self);
3497     }
3498    
3499     redo A;
3500 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3501 wakaba 1.1 ($self->{nc} == 0x004D or # M
3502     $self->{nc} == 0x006D)) { # m
3503 wakaba 1.12 if ($self->{is_xml} and
3504     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3505    
3506     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3507     text => 'SYSTEM',
3508     line => $self->{line_prev},
3509     column => $self->{column_prev} - 4);
3510     } else {
3511    
3512     }
3513 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3514    
3515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3516     $self->{line_prev} = $self->{line};
3517     $self->{column_prev} = $self->{column};
3518     $self->{column}++;
3519     $self->{nc}
3520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3521     } else {
3522     $self->{set_nc}->($self);
3523     }
3524    
3525     redo A;
3526     } else {
3527 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3528 wakaba 1.1 line => $self->{line_prev},
3529 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3530 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3531    
3532     $self->{ct}->{quirks} = 1;
3533     $self->{state} = BOGUS_DOCTYPE_STATE;
3534     } else {
3535    
3536     $self->{state} = BOGUS_MD_STATE;
3537     }
3538 wakaba 1.1 ## Reconsume.
3539     redo A;
3540     }
3541     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3542     if ($is_space->{$self->{nc}}) {
3543    
3544     ## Stay in the state
3545    
3546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3547     $self->{line_prev} = $self->{line};
3548     $self->{column_prev} = $self->{column};
3549     $self->{column}++;
3550     $self->{nc}
3551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3552     } else {
3553     $self->{set_nc}->($self);
3554     }
3555    
3556     redo A;
3557     } elsif ($self->{nc} eq 0x0022) { # "
3558    
3559     $self->{ct}->{pubid} = ''; # DOCTYPE
3560     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3561    
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572     redo A;
3573     } elsif ($self->{nc} eq 0x0027) { # '
3574    
3575     $self->{ct}->{pubid} = ''; # DOCTYPE
3576     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3577    
3578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3579     $self->{line_prev} = $self->{line};
3580     $self->{column_prev} = $self->{column};
3581     $self->{column}++;
3582     $self->{nc}
3583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3584     } else {
3585     $self->{set_nc}->($self);
3586     }
3587    
3588     redo A;
3589     } elsif ($self->{nc} eq 0x003E) { # >
3590 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3591    
3592     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3593    
3594     $self->{state} = DATA_STATE;
3595     $self->{s_kwd} = '';
3596     $self->{ct}->{quirks} = 1;
3597     } else {
3598    
3599     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3600     }
3601 wakaba 1.1
3602    
3603     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3604     $self->{line_prev} = $self->{line};
3605     $self->{column_prev} = $self->{column};
3606     $self->{column}++;
3607     $self->{nc}
3608     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3609     } else {
3610     $self->{set_nc}->($self);
3611     }
3612    
3613 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3614 wakaba 1.1 redo A;
3615     } elsif ($self->{nc} == -1) {
3616 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3617    
3618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3619     $self->{state} = DATA_STATE;
3620     $self->{s_kwd} = '';
3621     $self->{ct}->{quirks} = 1;
3622     } else {
3623    
3624     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3625     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3626     }
3627 wakaba 1.1
3628     ## reconsume
3629     return ($self->{ct}); # DOCTYPE
3630     redo A;
3631 wakaba 1.16 } elsif ($self->{is_xml} and
3632     $self->{ct}->{type} == DOCTYPE_TOKEN and
3633     $self->{nc} == 0x005B) { # [
3634 wakaba 1.12
3635     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3636     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3637     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3638 wakaba 1.13 $self->{in_subset} = 1;
3639 wakaba 1.12
3640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3641     $self->{line_prev} = $self->{line};
3642     $self->{column_prev} = $self->{column};
3643     $self->{column}++;
3644     $self->{nc}
3645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3646     } else {
3647     $self->{set_nc}->($self);
3648     }
3649    
3650 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3651 wakaba 1.12 redo A;
3652 wakaba 1.1 } else {
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3654    
3655 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656    
3657     $self->{ct}->{quirks} = 1;
3658     $self->{state} = BOGUS_DOCTYPE_STATE;
3659     } else {
3660    
3661     $self->{state} = BOGUS_MD_STATE;
3662     }
3663    
3664 wakaba 1.1
3665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3666     $self->{line_prev} = $self->{line};
3667     $self->{column_prev} = $self->{column};
3668     $self->{column}++;
3669     $self->{nc}
3670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3671     } else {
3672     $self->{set_nc}->($self);
3673     }
3674    
3675     redo A;
3676     }
3677     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3678     if ($self->{nc} == 0x0022) { # "
3679    
3680     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3681    
3682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3683     $self->{line_prev} = $self->{line};
3684     $self->{column_prev} = $self->{column};
3685     $self->{column}++;
3686     $self->{nc}
3687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3688     } else {
3689     $self->{set_nc}->($self);
3690     }
3691    
3692     redo A;
3693     } elsif ($self->{nc} == 0x003E) { # >
3694     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3695    
3696 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3697    
3698     $self->{state} = DATA_STATE;
3699     $self->{s_kwd} = '';
3700     $self->{ct}->{quirks} = 1;
3701     } else {
3702    
3703     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3704     }
3705    
3706 wakaba 1.1
3707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3708     $self->{line_prev} = $self->{line};
3709     $self->{column_prev} = $self->{column};
3710     $self->{column}++;
3711     $self->{nc}
3712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3713     } else {
3714     $self->{set_nc}->($self);
3715     }
3716    
3717 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3718 wakaba 1.1 redo A;
3719     } elsif ($self->{nc} == -1) {
3720     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3721    
3722 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3723    
3724     $self->{state} = DATA_STATE;
3725     $self->{s_kwd} = '';
3726     $self->{ct}->{quirks} = 1;
3727     } else {
3728    
3729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3730     }
3731    
3732     ## Reconsume.
3733 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3734     redo A;
3735     } else {
3736    
3737 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3738 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3739     length $self->{ct}->{pubid});
3740    
3741     ## Stay in the state
3742    
3743     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3744     $self->{line_prev} = $self->{line};
3745     $self->{column_prev} = $self->{column};
3746     $self->{column}++;
3747     $self->{nc}
3748     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3749     } else {
3750     $self->{set_nc}->($self);
3751     }
3752    
3753     redo A;
3754     }
3755     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3756     if ($self->{nc} == 0x0027) { # '
3757    
3758     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3759    
3760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3761     $self->{line_prev} = $self->{line};
3762     $self->{column_prev} = $self->{column};
3763     $self->{column}++;
3764     $self->{nc}
3765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3766     } else {
3767     $self->{set_nc}->($self);
3768     }
3769    
3770     redo A;
3771     } elsif ($self->{nc} == 0x003E) { # >
3772     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3773    
3774 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3775    
3776     $self->{state} = DATA_STATE;
3777     $self->{s_kwd} = '';
3778     $self->{ct}->{quirks} = 1;
3779     } else {
3780    
3781     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3782     }
3783    
3784 wakaba 1.1
3785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786     $self->{line_prev} = $self->{line};
3787     $self->{column_prev} = $self->{column};
3788     $self->{column}++;
3789     $self->{nc}
3790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791     } else {
3792     $self->{set_nc}->($self);
3793     }
3794    
3795 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3796 wakaba 1.1 redo A;
3797     } elsif ($self->{nc} == -1) {
3798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3799    
3800 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3801    
3802     $self->{state} = DATA_STATE;
3803     $self->{s_kwd} = '';
3804     $self->{ct}->{quirks} = 1;
3805     } else {
3806    
3807     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3808     }
3809    
3810 wakaba 1.1 ## reconsume
3811 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3812 wakaba 1.1 redo A;
3813     } else {
3814    
3815 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3816 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3817     length $self->{ct}->{pubid});
3818    
3819     ## Stay in the state
3820    
3821     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3822     $self->{line_prev} = $self->{line};
3823     $self->{column_prev} = $self->{column};
3824     $self->{column}++;
3825     $self->{nc}
3826     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3827     } else {
3828     $self->{set_nc}->($self);
3829     }
3830    
3831     redo A;
3832     }
3833     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3834     if ($is_space->{$self->{nc}}) {
3835    
3836     ## Stay in the state
3837    
3838     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3839     $self->{line_prev} = $self->{line};
3840     $self->{column_prev} = $self->{column};
3841     $self->{column}++;
3842     $self->{nc}
3843     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3844     } else {
3845     $self->{set_nc}->($self);
3846     }
3847    
3848     redo A;
3849     } elsif ($self->{nc} == 0x0022) { # "
3850    
3851 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3852 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3853    
3854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3855     $self->{line_prev} = $self->{line};
3856     $self->{column_prev} = $self->{column};
3857     $self->{column}++;
3858     $self->{nc}
3859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3860     } else {
3861     $self->{set_nc}->($self);
3862     }
3863    
3864     redo A;
3865     } elsif ($self->{nc} == 0x0027) { # '
3866    
3867 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3868 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3869    
3870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3871     $self->{line_prev} = $self->{line};
3872     $self->{column_prev} = $self->{column};
3873     $self->{column}++;
3874     $self->{nc}
3875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3876     } else {
3877     $self->{set_nc}->($self);
3878     }
3879    
3880     redo A;
3881     } elsif ($self->{nc} == 0x003E) { # >
3882 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3883     if ($self->{is_xml}) {
3884    
3885     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3886     } else {
3887    
3888     }
3889     $self->{state} = DATA_STATE;
3890     $self->{s_kwd} = '';
3891 wakaba 1.12 } else {
3892 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3893    
3894     } else {
3895    
3896     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3897     }
3898     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3899 wakaba 1.12 }
3900 wakaba 1.16
3901 wakaba 1.1
3902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3903     $self->{line_prev} = $self->{line};
3904     $self->{column_prev} = $self->{column};
3905     $self->{column}++;
3906     $self->{nc}
3907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3908     } else {
3909     $self->{set_nc}->($self);
3910     }
3911    
3912 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3913 wakaba 1.1 redo A;
3914     } elsif ($self->{nc} == -1) {
3915 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3916    
3917     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3918    
3919     $self->{state} = DATA_STATE;
3920     $self->{s_kwd} = '';
3921     $self->{ct}->{quirks} = 1;
3922     } else {
3923     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3924     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3925     }
3926 wakaba 1.1
3927     ## reconsume
3928 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3929 wakaba 1.1 redo A;
3930 wakaba 1.16 } elsif ($self->{is_xml} and
3931     $self->{ct}->{type} == DOCTYPE_TOKEN and
3932     $self->{nc} == 0x005B) { # [
3933 wakaba 1.12
3934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3935     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3936     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3937 wakaba 1.13 $self->{in_subset} = 1;
3938 wakaba 1.12
3939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3940     $self->{line_prev} = $self->{line};
3941     $self->{column_prev} = $self->{column};
3942     $self->{column}++;
3943     $self->{nc}
3944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3945     } else {
3946     $self->{set_nc}->($self);
3947     }
3948    
3949 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3950 wakaba 1.12 redo A;
3951 wakaba 1.1 } else {
3952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3953    
3954 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3955    
3956     $self->{ct}->{quirks} = 1;
3957     $self->{state} = BOGUS_DOCTYPE_STATE;
3958     } else {
3959    
3960     $self->{state} = BOGUS_MD_STATE;
3961     }
3962    
3963 wakaba 1.1
3964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3965     $self->{line_prev} = $self->{line};
3966     $self->{column_prev} = $self->{column};
3967     $self->{column}++;
3968     $self->{nc}
3969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3970     } else {
3971     $self->{set_nc}->($self);
3972     }
3973    
3974     redo A;
3975     }
3976     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3977     if ($is_space->{$self->{nc}}) {
3978    
3979     ## Stay in the state
3980    
3981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3982     $self->{line_prev} = $self->{line};
3983     $self->{column_prev} = $self->{column};
3984     $self->{column}++;
3985     $self->{nc}
3986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3987     } else {
3988     $self->{set_nc}->($self);
3989     }
3990    
3991     redo A;
3992     } elsif ($self->{nc} == 0x0022) { # "
3993    
3994     $self->{ct}->{sysid} = ''; # DOCTYPE
3995     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3996    
3997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3998     $self->{line_prev} = $self->{line};
3999     $self->{column_prev} = $self->{column};
4000     $self->{column}++;
4001     $self->{nc}
4002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4003     } else {
4004     $self->{set_nc}->($self);
4005     }
4006    
4007     redo A;
4008     } elsif ($self->{nc} == 0x0027) { # '
4009    
4010     $self->{ct}->{sysid} = ''; # DOCTYPE
4011     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4012    
4013     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4014     $self->{line_prev} = $self->{line};
4015     $self->{column_prev} = $self->{column};
4016     $self->{column}++;
4017     $self->{nc}
4018     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4019     } else {
4020     $self->{set_nc}->($self);
4021     }
4022    
4023     redo A;
4024     } elsif ($self->{nc} == 0x003E) { # >
4025     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4026    
4027     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4028     $self->{line_prev} = $self->{line};
4029     $self->{column_prev} = $self->{column};
4030     $self->{column}++;
4031     $self->{nc}
4032     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4033     } else {
4034     $self->{set_nc}->($self);
4035     }
4036    
4037    
4038 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4039    
4040     $self->{state} = DATA_STATE;
4041     $self->{s_kwd} = '';
4042     $self->{ct}->{quirks} = 1;
4043     } else {
4044    
4045     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4046     }
4047 wakaba 1.1
4048 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4049 wakaba 1.1 redo A;
4050     } elsif ($self->{nc} == -1) {
4051 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4052    
4053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4054     $self->{state} = DATA_STATE;
4055     $self->{s_kwd} = '';
4056     $self->{ct}->{quirks} = 1;
4057     } else {
4058    
4059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4061     }
4062 wakaba 1.1
4063     ## reconsume
4064 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4065 wakaba 1.1 redo A;
4066 wakaba 1.16 } elsif ($self->{is_xml} and
4067     $self->{ct}->{type} == DOCTYPE_TOKEN and
4068     $self->{nc} == 0x005B) { # [
4069 wakaba 1.12
4070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4071    
4072     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4073     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4074 wakaba 1.13 $self->{in_subset} = 1;
4075 wakaba 1.12
4076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4077     $self->{line_prev} = $self->{line};
4078     $self->{column_prev} = $self->{column};
4079     $self->{column}++;
4080     $self->{nc}
4081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4082     } else {
4083     $self->{set_nc}->($self);
4084     }
4085    
4086 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4087 wakaba 1.12 redo A;
4088 wakaba 1.1 } else {
4089     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4090    
4091 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092    
4093     $self->{ct}->{quirks} = 1;
4094     $self->{state} = BOGUS_DOCTYPE_STATE;
4095     } else {
4096    
4097     $self->{state} = BOGUS_MD_STATE;
4098     }
4099    
4100 wakaba 1.1
4101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4102     $self->{line_prev} = $self->{line};
4103     $self->{column_prev} = $self->{column};
4104     $self->{column}++;
4105     $self->{nc}
4106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4107     } else {
4108     $self->{set_nc}->($self);
4109     }
4110    
4111     redo A;
4112     }
4113     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4114     if ($self->{nc} == 0x0022) { # "
4115    
4116     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4117    
4118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119     $self->{line_prev} = $self->{line};
4120     $self->{column_prev} = $self->{column};
4121     $self->{column}++;
4122     $self->{nc}
4123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124     } else {
4125     $self->{set_nc}->($self);
4126     }
4127    
4128     redo A;
4129 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4130 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4131    
4132 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4133    
4134     $self->{state} = DATA_STATE;
4135     $self->{s_kwd} = '';
4136     $self->{ct}->{quirks} = 1;
4137     } else {
4138    
4139     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4140     }
4141    
4142 wakaba 1.1
4143     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4144     $self->{line_prev} = $self->{line};
4145     $self->{column_prev} = $self->{column};
4146     $self->{column}++;
4147     $self->{nc}
4148     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4149     } else {
4150     $self->{set_nc}->($self);
4151     }
4152    
4153 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4154 wakaba 1.1 redo A;
4155     } elsif ($self->{nc} == -1) {
4156     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4157    
4158 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4159    
4160     $self->{state} = DATA_STATE;
4161     $self->{s_kwd} = '';
4162     $self->{ct}->{quirks} = 1;
4163     } else {
4164    
4165     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4166     }
4167    
4168 wakaba 1.1 ## reconsume
4169 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4170 wakaba 1.1 redo A;
4171     } else {
4172    
4173 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4174 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4175     length $self->{ct}->{sysid});
4176    
4177     ## Stay in the state
4178    
4179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4180     $self->{line_prev} = $self->{line};
4181     $self->{column_prev} = $self->{column};
4182     $self->{column}++;
4183     $self->{nc}
4184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4185     } else {
4186     $self->{set_nc}->($self);
4187     }
4188    
4189     redo A;
4190     }
4191     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4192     if ($self->{nc} == 0x0027) { # '
4193    
4194     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4195    
4196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4197     $self->{line_prev} = $self->{line};
4198     $self->{column_prev} = $self->{column};
4199     $self->{column}++;
4200     $self->{nc}
4201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4202     } else {
4203     $self->{set_nc}->($self);
4204     }
4205    
4206     redo A;
4207 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4208 wakaba 1.1
4209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4210    
4211     $self->{state} = DATA_STATE;
4212 wakaba 1.5 $self->{s_kwd} = '';
4213 wakaba 1.1
4214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4215     $self->{line_prev} = $self->{line};
4216     $self->{column_prev} = $self->{column};
4217     $self->{column}++;
4218     $self->{nc}
4219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4220     } else {
4221     $self->{set_nc}->($self);
4222     }
4223    
4224    
4225     $self->{ct}->{quirks} = 1;
4226     return ($self->{ct}); # DOCTYPE
4227    
4228     redo A;
4229     } elsif ($self->{nc} == -1) {
4230     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4231    
4232 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4233    
4234     $self->{state} = DATA_STATE;
4235     $self->{s_kwd} = '';
4236     $self->{ct}->{quirks} = 1;
4237     } else {
4238    
4239     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4240     }
4241    
4242 wakaba 1.1 ## reconsume
4243 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4244 wakaba 1.1 redo A;
4245     } else {
4246    
4247 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4248 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4249     length $self->{ct}->{sysid});
4250    
4251     ## Stay in the state
4252    
4253     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4254     $self->{line_prev} = $self->{line};
4255     $self->{column_prev} = $self->{column};
4256     $self->{column}++;
4257     $self->{nc}
4258     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4259     } else {
4260     $self->{set_nc}->($self);
4261     }
4262    
4263     redo A;
4264     }
4265     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4266     if ($is_space->{$self->{nc}}) {
4267 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4268    
4269     $self->{state} = BEFORE_NDATA_STATE;
4270     } else {
4271    
4272     ## Stay in the state
4273     }
4274 wakaba 1.1
4275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4276     $self->{line_prev} = $self->{line};
4277     $self->{column_prev} = $self->{column};
4278     $self->{column}++;
4279     $self->{nc}
4280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4281     } else {
4282     $self->{set_nc}->($self);
4283     }
4284    
4285     redo A;
4286     } elsif ($self->{nc} == 0x003E) { # >
4287 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4288    
4289     $self->{state} = DATA_STATE;
4290     $self->{s_kwd} = '';
4291     } else {
4292    
4293     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4294     }
4295    
4296 wakaba 1.1
4297     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4298     $self->{line_prev} = $self->{line};
4299     $self->{column_prev} = $self->{column};
4300     $self->{column}++;
4301     $self->{nc}
4302     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4303     } else {
4304     $self->{set_nc}->($self);
4305     }
4306    
4307 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4308 wakaba 1.1 redo A;
4309 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4310     ($self->{nc} == 0x004E or # N
4311     $self->{nc} == 0x006E)) { # n
4312    
4313     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4314     $self->{state} = NDATA_STATE;
4315     $self->{kwd} = chr $self->{nc};
4316    
4317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4318     $self->{line_prev} = $self->{line};
4319     $self->{column_prev} = $self->{column};
4320     $self->{column}++;
4321     $self->{nc}
4322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4323     } else {
4324     $self->{set_nc}->($self);
4325     }
4326    
4327     redo A;
4328 wakaba 1.1 } elsif ($self->{nc} == -1) {
4329 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4330    
4331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4332     $self->{state} = DATA_STATE;
4333     $self->{s_kwd} = '';
4334     $self->{ct}->{quirks} = 1;
4335     } else {
4336    
4337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4338     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4339     }
4340    
4341 wakaba 1.1 ## reconsume
4342 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4343 wakaba 1.1 redo A;
4344 wakaba 1.16 } elsif ($self->{is_xml} and
4345     $self->{ct}->{type} == DOCTYPE_TOKEN and
4346     $self->{nc} == 0x005B) { # [
4347 wakaba 1.12
4348     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4349     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4350 wakaba 1.13 $self->{in_subset} = 1;
4351 wakaba 1.12
4352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4353     $self->{line_prev} = $self->{line};
4354     $self->{column_prev} = $self->{column};
4355     $self->{column}++;
4356     $self->{nc}
4357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4358     } else {
4359     $self->{set_nc}->($self);
4360     }
4361    
4362 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4363 wakaba 1.12 redo A;
4364 wakaba 1.1 } else {
4365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4366    
4367 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4368    
4369     #$self->{ct}->{quirks} = 1;
4370     $self->{state} = BOGUS_DOCTYPE_STATE;
4371     } else {
4372    
4373     $self->{state} = BOGUS_MD_STATE;
4374     }
4375    
4376 wakaba 1.1
4377     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4378     $self->{line_prev} = $self->{line};
4379     $self->{column_prev} = $self->{column};
4380     $self->{column}++;
4381     $self->{nc}
4382     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4383     } else {
4384     $self->{set_nc}->($self);
4385     }
4386    
4387     redo A;
4388     }
4389 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4390     if ($is_space->{$self->{nc}}) {
4391    
4392     ## Stay in the state.
4393    
4394     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4395     $self->{line_prev} = $self->{line};
4396     $self->{column_prev} = $self->{column};
4397     $self->{column}++;
4398     $self->{nc}
4399     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4400     } else {
4401     $self->{set_nc}->($self);
4402     }
4403    
4404     redo A;
4405     } elsif ($self->{nc} == 0x003E) { # >
4406    
4407     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     return ($self->{ct}); # ENTITY
4420     redo A;
4421     } elsif ($self->{nc} == 0x004E or # N
4422     $self->{nc} == 0x006E) { # n
4423    
4424     $self->{state} = NDATA_STATE;
4425     $self->{kwd} = chr $self->{nc};
4426    
4427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428     $self->{line_prev} = $self->{line};
4429     $self->{column_prev} = $self->{column};
4430     $self->{column}++;
4431     $self->{nc}
4432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433     } else {
4434     $self->{set_nc}->($self);
4435     }
4436    
4437     redo A;
4438     } elsif ($self->{nc} == -1) {
4439    
4440     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4441     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4442     ## reconsume
4443     return ($self->{ct}); # ENTITY
4444     redo A;
4445     } else {
4446    
4447     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4448     $self->{state} = BOGUS_MD_STATE;
4449    
4450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4451     $self->{line_prev} = $self->{line};
4452     $self->{column_prev} = $self->{column};
4453     $self->{column}++;
4454     $self->{nc}
4455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4456     } else {
4457     $self->{set_nc}->($self);
4458     }
4459    
4460     redo A;
4461     }
4462 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4463     if ($self->{nc} == 0x003E) { # >
4464    
4465     $self->{state} = DATA_STATE;
4466 wakaba 1.5 $self->{s_kwd} = '';
4467 wakaba 1.1
4468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4469     $self->{line_prev} = $self->{line};
4470     $self->{column_prev} = $self->{column};
4471     $self->{column}++;
4472     $self->{nc}
4473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4474     } else {
4475     $self->{set_nc}->($self);
4476     }
4477    
4478    
4479     return ($self->{ct}); # DOCTYPE
4480    
4481     redo A;
4482 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4483 wakaba 1.13
4484     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4485     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4486     $self->{in_subset} = 1;
4487    
4488 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4489     $self->{line_prev} = $self->{line};
4490     $self->{column_prev} = $self->{column};
4491     $self->{column}++;
4492     $self->{nc}
4493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4494     } else {
4495     $self->{set_nc}->($self);
4496     }
4497    
4498 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4499     redo A;
4500 wakaba 1.1 } elsif ($self->{nc} == -1) {
4501    
4502     $self->{state} = DATA_STATE;
4503 wakaba 1.5 $self->{s_kwd} = '';
4504 wakaba 1.1 ## reconsume
4505    
4506     return ($self->{ct}); # DOCTYPE
4507    
4508     redo A;
4509     } else {
4510    
4511     my $s = '';
4512 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4513 wakaba 1.1
4514     ## Stay in the state
4515    
4516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4517     $self->{line_prev} = $self->{line};
4518     $self->{column_prev} = $self->{column};
4519     $self->{column}++;
4520     $self->{nc}
4521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4522     } else {
4523     $self->{set_nc}->($self);
4524     }
4525    
4526     redo A;
4527     }
4528     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4529     ## NOTE: "CDATA section state" in the spec is jointly implemented
4530     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4531     ## and |CDATA_SECTION_MSE2_STATE|.
4532 wakaba 1.10
4533     ## XML5: "CDATA state".
4534 wakaba 1.1
4535     if ($self->{nc} == 0x005D) { # ]
4536    
4537     $self->{state} = CDATA_SECTION_MSE1_STATE;
4538    
4539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540     $self->{line_prev} = $self->{line};
4541     $self->{column_prev} = $self->{column};
4542     $self->{column}++;
4543     $self->{nc}
4544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545     } else {
4546     $self->{set_nc}->($self);
4547     }
4548    
4549     redo A;
4550     } elsif ($self->{nc} == -1) {
4551 wakaba 1.6 if ($self->{is_xml}) {
4552 wakaba 1.8
4553 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4554 wakaba 1.8 } else {
4555    
4556 wakaba 1.6 }
4557    
4558 wakaba 1.1 $self->{state} = DATA_STATE;
4559 wakaba 1.5 $self->{s_kwd} = '';
4560 wakaba 1.10 ## Reconsume.
4561 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4562    
4563     return ($self->{ct}); # character
4564     } else {
4565    
4566     ## No token to emit. $self->{ct} is discarded.
4567     }
4568     redo A;
4569     } else {
4570    
4571     $self->{ct}->{data} .= chr $self->{nc};
4572     $self->{read_until}->($self->{ct}->{data},
4573     q<]>,
4574     length $self->{ct}->{data});
4575    
4576     ## Stay in the state.
4577    
4578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4579     $self->{line_prev} = $self->{line};
4580     $self->{column_prev} = $self->{column};
4581     $self->{column}++;
4582     $self->{nc}
4583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4584     } else {
4585     $self->{set_nc}->($self);
4586     }
4587    
4588     redo A;
4589     }
4590    
4591     ## ISSUE: "text tokens" in spec.
4592     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4593 wakaba 1.10 ## XML5: "CDATA bracket state".
4594    
4595 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4596    
4597     $self->{state} = CDATA_SECTION_MSE2_STATE;
4598    
4599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4600     $self->{line_prev} = $self->{line};
4601     $self->{column_prev} = $self->{column};
4602     $self->{column}++;
4603     $self->{nc}
4604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4605     } else {
4606     $self->{set_nc}->($self);
4607     }
4608    
4609     redo A;
4610     } else {
4611    
4612 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4613 wakaba 1.1 $self->{ct}->{data} .= ']';
4614 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4615 wakaba 1.1 ## Reconsume.
4616     redo A;
4617     }
4618     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4619 wakaba 1.10 ## XML5: "CDATA end state".
4620    
4621 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4622     $self->{state} = DATA_STATE;
4623 wakaba 1.5 $self->{s_kwd} = '';
4624 wakaba 1.1
4625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4626     $self->{line_prev} = $self->{line};
4627     $self->{column_prev} = $self->{column};
4628     $self->{column}++;
4629     $self->{nc}
4630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4631     } else {
4632     $self->{set_nc}->($self);
4633     }
4634    
4635     if (length $self->{ct}->{data}) { # character
4636    
4637     return ($self->{ct}); # character
4638     } else {
4639    
4640     ## No token to emit. $self->{ct} is discarded.
4641     }
4642     redo A;
4643     } elsif ($self->{nc} == 0x005D) { # ]
4644     # character
4645     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4646     ## Stay in the state.
4647    
4648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4649     $self->{line_prev} = $self->{line};
4650     $self->{column_prev} = $self->{column};
4651     $self->{column}++;
4652     $self->{nc}
4653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4654     } else {
4655     $self->{set_nc}->($self);
4656     }
4657    
4658     redo A;
4659     } else {
4660    
4661     $self->{ct}->{data} .= ']]'; # character
4662     $self->{state} = CDATA_SECTION_STATE;
4663 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4664 wakaba 1.1 redo A;
4665     }
4666     } elsif ($self->{state} == ENTITY_STATE) {
4667     if ($is_space->{$self->{nc}} or
4668     {
4669     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4670     $self->{entity_add} => 1,
4671     }->{$self->{nc}}) {
4672 wakaba 1.22 if ($self->{is_xml}) {
4673    
4674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4675     line => $self->{line_prev},
4676     column => $self->{column_prev}
4677     + ($self->{nc} == -1 ? 1 : 0));
4678     } else {
4679    
4680     ## No error
4681     }
4682 wakaba 1.1 ## Don't consume
4683     ## Return nothing.
4684     #
4685     } elsif ($self->{nc} == 0x0023) { # #
4686    
4687     $self->{state} = ENTITY_HASH_STATE;
4688 wakaba 1.12 $self->{kwd} = '#';
4689 wakaba 1.1
4690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4691     $self->{line_prev} = $self->{line};
4692     $self->{column_prev} = $self->{column};
4693     $self->{column}++;
4694     $self->{nc}
4695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4696     } else {
4697     $self->{set_nc}->($self);
4698     }
4699    
4700     redo A;
4701 wakaba 1.22 } elsif ($self->{is_xml} or
4702     (0x0041 <= $self->{nc} and
4703 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4704     (0x0061 <= $self->{nc} and
4705     $self->{nc} <= 0x007A)) { # a..z
4706    
4707     require Whatpm::_NamedEntityList;
4708     $self->{state} = ENTITY_NAME_STATE;
4709 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4710     $self->{entity__value} = $self->{kwd};
4711 wakaba 1.1 $self->{entity__match} = 0;
4712    
4713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4714     $self->{line_prev} = $self->{line};
4715     $self->{column_prev} = $self->{column};
4716     $self->{column}++;
4717     $self->{nc}
4718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4719     } else {
4720     $self->{set_nc}->($self);
4721     }
4722    
4723     redo A;
4724     } else {
4725    
4726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4727     ## Return nothing.
4728     #
4729     }
4730    
4731     ## NOTE: No character is consumed by the "consume a character
4732     ## reference" algorithm. In other words, there is an "&" character
4733     ## that does not introduce a character reference, which would be
4734     ## appended to the parent element or the attribute value in later
4735     ## process of the tokenizer.
4736    
4737     if ($self->{prev_state} == DATA_STATE) {
4738    
4739     $self->{state} = $self->{prev_state};
4740 wakaba 1.5 $self->{s_kwd} = '';
4741 wakaba 1.1 ## Reconsume.
4742     return ({type => CHARACTER_TOKEN, data => '&',
4743     line => $self->{line_prev},
4744     column => $self->{column_prev},
4745     });
4746     redo A;
4747     } else {
4748    
4749     $self->{ca}->{value} .= '&';
4750     $self->{state} = $self->{prev_state};
4751 wakaba 1.5 $self->{s_kwd} = '';
4752 wakaba 1.1 ## Reconsume.
4753     redo A;
4754     }
4755     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4756 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4757 wakaba 1.1
4758     $self->{state} = HEXREF_X_STATE;
4759 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4760 wakaba 1.1
4761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4762     $self->{line_prev} = $self->{line};
4763     $self->{column_prev} = $self->{column};
4764     $self->{column}++;
4765     $self->{nc}
4766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4767     } else {
4768     $self->{set_nc}->($self);
4769     }
4770    
4771     redo A;
4772 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4773    
4774     if ($self->{is_xml}) {
4775     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4776     }
4777     $self->{state} = HEXREF_X_STATE;
4778     $self->{kwd} .= chr $self->{nc};
4779    
4780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4781     $self->{line_prev} = $self->{line};
4782     $self->{column_prev} = $self->{column};
4783     $self->{column}++;
4784     $self->{nc}
4785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4786     } else {
4787     $self->{set_nc}->($self);
4788     }
4789    
4790     redo A;
4791 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4792     $self->{nc} <= 0x0039) { # 0..9
4793    
4794     $self->{state} = NCR_NUM_STATE;
4795 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4796 wakaba 1.1
4797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4798     $self->{line_prev} = $self->{line};
4799     $self->{column_prev} = $self->{column};
4800     $self->{column}++;
4801     $self->{nc}
4802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4803     } else {
4804     $self->{set_nc}->($self);
4805     }
4806    
4807     redo A;
4808     } else {
4809     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4810     line => $self->{line_prev},
4811     column => $self->{column_prev} - 1);
4812    
4813     ## NOTE: According to the spec algorithm, nothing is returned,
4814     ## and then "&#" is appended to the parent element or the attribute
4815     ## value in the later processing.
4816    
4817     if ($self->{prev_state} == DATA_STATE) {
4818    
4819     $self->{state} = $self->{prev_state};
4820 wakaba 1.5 $self->{s_kwd} = '';
4821 wakaba 1.1 ## Reconsume.
4822     return ({type => CHARACTER_TOKEN,
4823     data => '&#',
4824     line => $self->{line_prev},
4825     column => $self->{column_prev} - 1,
4826     });
4827     redo A;
4828     } else {
4829    
4830     $self->{ca}->{value} .= '&#';
4831     $self->{state} = $self->{prev_state};
4832 wakaba 1.5 $self->{s_kwd} = '';
4833 wakaba 1.1 ## Reconsume.
4834     redo A;
4835     }
4836     }
4837     } elsif ($self->{state} == NCR_NUM_STATE) {
4838     if (0x0030 <= $self->{nc} and
4839     $self->{nc} <= 0x0039) { # 0..9
4840    
4841 wakaba 1.12 $self->{kwd} *= 10;
4842     $self->{kwd} += $self->{nc} - 0x0030;
4843 wakaba 1.1
4844     ## Stay in the state.
4845    
4846     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4847     $self->{line_prev} = $self->{line};
4848     $self->{column_prev} = $self->{column};
4849     $self->{column}++;
4850     $self->{nc}
4851     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4852     } else {
4853     $self->{set_nc}->($self);
4854     }
4855    
4856     redo A;
4857     } elsif ($self->{nc} == 0x003B) { # ;
4858    
4859    
4860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4861     $self->{line_prev} = $self->{line};
4862     $self->{column_prev} = $self->{column};
4863     $self->{column}++;
4864     $self->{nc}
4865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4866     } else {
4867     $self->{set_nc}->($self);
4868     }
4869    
4870     #
4871     } else {
4872    
4873     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4874     ## Reconsume.
4875     #
4876     }
4877    
4878 wakaba 1.12 my $code = $self->{kwd};
4879 wakaba 1.1 my $l = $self->{line_prev};
4880     my $c = $self->{column_prev};
4881 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4882     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4883     ($self->{is_xml} and $code == 0x0000)) {
4884 wakaba 1.1
4885     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4886     text => (sprintf 'U+%04X', $code),
4887     line => $l, column => $c);
4888     $code = $charref_map->{$code};
4889     } elsif ($code > 0x10FFFF) {
4890    
4891     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4892     text => (sprintf 'U-%08X', $code),
4893     line => $l, column => $c);
4894     $code = 0xFFFD;
4895     }
4896    
4897     if ($self->{prev_state} == DATA_STATE) {
4898    
4899     $self->{state} = $self->{prev_state};
4900 wakaba 1.5 $self->{s_kwd} = '';
4901 wakaba 1.1 ## Reconsume.
4902     return ({type => CHARACTER_TOKEN, data => chr $code,
4903 wakaba 1.7 has_reference => 1,
4904 wakaba 1.1 line => $l, column => $c,
4905     });
4906     redo A;
4907     } else {
4908    
4909     $self->{ca}->{value} .= chr $code;
4910     $self->{ca}->{has_reference} = 1;
4911     $self->{state} = $self->{prev_state};
4912 wakaba 1.5 $self->{s_kwd} = '';
4913 wakaba 1.1 ## Reconsume.
4914     redo A;
4915     }
4916     } elsif ($self->{state} == HEXREF_X_STATE) {
4917     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4918     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4919     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4920     # 0..9, A..F, a..f
4921    
4922     $self->{state} = HEXREF_HEX_STATE;
4923 wakaba 1.12 $self->{kwd} = 0;
4924 wakaba 1.1 ## Reconsume.
4925     redo A;
4926     } else {
4927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4928     line => $self->{line_prev},
4929     column => $self->{column_prev} - 2);
4930    
4931     ## NOTE: According to the spec algorithm, nothing is returned,
4932     ## and then "&#" followed by "X" or "x" is appended to the parent
4933     ## element or the attribute value in the later processing.
4934    
4935     if ($self->{prev_state} == DATA_STATE) {
4936    
4937     $self->{state} = $self->{prev_state};
4938 wakaba 1.5 $self->{s_kwd} = '';
4939 wakaba 1.1 ## Reconsume.
4940     return ({type => CHARACTER_TOKEN,
4941 wakaba 1.12 data => '&' . $self->{kwd},
4942 wakaba 1.1 line => $self->{line_prev},
4943 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4944 wakaba 1.1 });
4945     redo A;
4946     } else {
4947    
4948 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4949 wakaba 1.1 $self->{state} = $self->{prev_state};
4950 wakaba 1.5 $self->{s_kwd} = '';
4951 wakaba 1.1 ## Reconsume.
4952     redo A;
4953     }
4954     }
4955     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4956     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4957     # 0..9
4958    
4959 wakaba 1.12 $self->{kwd} *= 0x10;
4960     $self->{kwd} += $self->{nc} - 0x0030;
4961 wakaba 1.1 ## Stay in the state.
4962    
4963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964     $self->{line_prev} = $self->{line};
4965     $self->{column_prev} = $self->{column};
4966     $self->{column}++;
4967     $self->{nc}
4968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969     } else {
4970     $self->{set_nc}->($self);
4971     }
4972    
4973     redo A;
4974     } elsif (0x0061 <= $self->{nc} and
4975     $self->{nc} <= 0x0066) { # a..f
4976    
4977 wakaba 1.12 $self->{kwd} *= 0x10;
4978     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4979 wakaba 1.1 ## Stay in the state.
4980    
4981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4982     $self->{line_prev} = $self->{line};
4983     $self->{column_prev} = $self->{column};
4984     $self->{column}++;
4985     $self->{nc}
4986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4987     } else {
4988     $self->{set_nc}->($self);
4989     }
4990    
4991     redo A;
4992     } elsif (0x0041 <= $self->{nc} and
4993     $self->{nc} <= 0x0046) { # A..F
4994    
4995 wakaba 1.12 $self->{kwd} *= 0x10;
4996     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4997 wakaba 1.1 ## Stay in the state.
4998    
4999     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5000     $self->{line_prev} = $self->{line};
5001     $self->{column_prev} = $self->{column};
5002     $self->{column}++;
5003     $self->{nc}
5004     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5005     } else {
5006     $self->{set_nc}->($self);
5007     }
5008    
5009     redo A;
5010     } elsif ($self->{nc} == 0x003B) { # ;
5011    
5012    
5013     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5014     $self->{line_prev} = $self->{line};
5015     $self->{column_prev} = $self->{column};
5016     $self->{column}++;
5017     $self->{nc}
5018     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5019     } else {
5020     $self->{set_nc}->($self);
5021     }
5022    
5023     #
5024     } else {
5025    
5026     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5027     line => $self->{line},
5028     column => $self->{column});
5029     ## Reconsume.
5030     #
5031     }
5032    
5033 wakaba 1.12 my $code = $self->{kwd};
5034 wakaba 1.1 my $l = $self->{line_prev};
5035     my $c = $self->{column_prev};
5036 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5037     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5038     ($self->{is_xml} and $code == 0x0000)) {
5039 wakaba 1.1
5040     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5041     text => (sprintf 'U+%04X', $code),
5042     line => $l, column => $c);
5043     $code = $charref_map->{$code};
5044     } elsif ($code > 0x10FFFF) {
5045    
5046     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5047     text => (sprintf 'U-%08X', $code),
5048     line => $l, column => $c);
5049     $code = 0xFFFD;
5050     }
5051    
5052     if ($self->{prev_state} == DATA_STATE) {
5053    
5054     $self->{state} = $self->{prev_state};
5055 wakaba 1.5 $self->{s_kwd} = '';
5056 wakaba 1.1 ## Reconsume.
5057     return ({type => CHARACTER_TOKEN, data => chr $code,
5058 wakaba 1.7 has_reference => 1,
5059 wakaba 1.1 line => $l, column => $c,
5060     });
5061     redo A;
5062     } else {
5063    
5064     $self->{ca}->{value} .= chr $code;
5065     $self->{ca}->{has_reference} = 1;
5066     $self->{state} = $self->{prev_state};
5067 wakaba 1.5 $self->{s_kwd} = '';
5068 wakaba 1.1 ## Reconsume.
5069     redo A;
5070     }
5071     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5072 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
5073     $self->{nc} <= 0x005A) or # x
5074     (0x0061 <= $self->{nc} and # a
5075     $self->{nc} <= 0x007A) or # z
5076     (0x0030 <= $self->{nc} and # 0
5077     $self->{nc} <= 0x0039) or # 9
5078 wakaba 1.22 $self->{nc} == 0x003B or # ;
5079     ($self->{is_xml} and
5080     not ($is_space->{$self->{nc}} or
5081     {
5082     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5083     $self->{entity_add} => 1,
5084     }->{$self->{nc}}))) {
5085 wakaba 1.1 our $EntityChar;
5086 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5087 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5088     $self->{ge}->{$self->{kwd}}) {
5089 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5090 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5091     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5092    
5093     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5094     } else {
5095     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5096    
5097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5098     value => $self->{kwd});
5099     } else {
5100    
5101     }
5102     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5103     }
5104     } else {
5105     if ($self->{is_xml}) {
5106    
5107     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5108     value => $self->{kwd},
5109     level => {
5110     'amp;' => $self->{level}->{warn},
5111     'quot;' => $self->{level}->{warn},
5112     'lt;' => $self->{level}->{warn},
5113     'gt;' => $self->{level}->{warn},
5114     'apos;' => $self->{level}->{warn},
5115     }->{$self->{kwd}} ||
5116     $self->{level}->{must});
5117     } else {
5118    
5119     }
5120     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5121     }
5122 wakaba 1.1 $self->{entity__match} = 1;
5123    
5124     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5125     $self->{line_prev} = $self->{line};
5126     $self->{column_prev} = $self->{column};
5127     $self->{column}++;
5128     $self->{nc}
5129     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5130     } else {
5131     $self->{set_nc}->($self);
5132     }
5133    
5134     #
5135     } else {
5136    
5137 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5138 wakaba 1.1 $self->{entity__match} = -1;
5139     ## Stay in the state.
5140    
5141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5142     $self->{line_prev} = $self->{line};
5143     $self->{column_prev} = $self->{column};
5144     $self->{column}++;
5145     $self->{nc}
5146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5147     } else {
5148     $self->{set_nc}->($self);
5149     }
5150    
5151     redo A;
5152     }
5153     } else {
5154    
5155     $self->{entity__value} .= chr $self->{nc};
5156     $self->{entity__match} *= 2;
5157     ## Stay in the state.
5158    
5159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5160     $self->{line_prev} = $self->{line};
5161     $self->{column_prev} = $self->{column};
5162     $self->{column}++;
5163     $self->{nc}
5164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5165     } else {
5166     $self->{set_nc}->($self);
5167     }
5168    
5169     redo A;
5170     }
5171     }
5172    
5173     my $data;
5174     my $has_ref;
5175     if ($self->{entity__match} > 0) {
5176    
5177     $data = $self->{entity__value};
5178     $has_ref = 1;
5179     #
5180     } elsif ($self->{entity__match} < 0) {
5181     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5182     if ($self->{prev_state} != DATA_STATE and # in attribute
5183     $self->{entity__match} < -1) {
5184    
5185 wakaba 1.12 $data = '&' . $self->{kwd};
5186 wakaba 1.1 #
5187     } else {
5188    
5189     $data = $self->{entity__value};
5190     $has_ref = 1;
5191     #
5192     }
5193     } else {
5194    
5195     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5196     line => $self->{line_prev},
5197 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5198     $data = '&' . $self->{kwd};
5199 wakaba 1.1 #
5200     }
5201    
5202     ## NOTE: In these cases, when a character reference is found,
5203     ## it is consumed and a character token is returned, or, otherwise,
5204     ## nothing is consumed and returned, according to the spec algorithm.
5205     ## In this implementation, anything that has been examined by the
5206     ## tokenizer is appended to the parent element or the attribute value
5207     ## as string, either literal string when no character reference or
5208     ## entity-replaced string otherwise, in this stage, since any characters
5209     ## that would not be consumed are appended in the data state or in an
5210     ## appropriate attribute value state anyway.
5211    
5212     if ($self->{prev_state} == DATA_STATE) {
5213    
5214     $self->{state} = $self->{prev_state};
5215 wakaba 1.5 $self->{s_kwd} = '';
5216 wakaba 1.1 ## Reconsume.
5217     return ({type => CHARACTER_TOKEN,
5218     data => $data,
5219 wakaba 1.7 has_reference => $has_ref,
5220 wakaba 1.1 line => $self->{line_prev},
5221 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5222 wakaba 1.1 });
5223     redo A;
5224     } else {
5225    
5226     $self->{ca}->{value} .= $data;
5227     $self->{ca}->{has_reference} = 1 if $has_ref;
5228     $self->{state} = $self->{prev_state};
5229 wakaba 1.5 $self->{s_kwd} = '';
5230 wakaba 1.1 ## Reconsume.
5231     redo A;
5232     }
5233 wakaba 1.8
5234     ## XML-only states
5235    
5236     } elsif ($self->{state} == PI_STATE) {
5237 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5238    
5239 wakaba 1.8 if ($is_space->{$self->{nc}} or
5240 wakaba 1.14 $self->{nc} == 0x003F or # ?
5241 wakaba 1.8 $self->{nc} == -1) {
5242 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5243     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5244     ## "DOCTYPE pi state": Parse error, switch to the "data
5245     ## state".
5246 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5247     line => $self->{line_prev},
5248     column => $self->{column_prev}
5249     - 1 * ($self->{nc} != -1));
5250     $self->{state} = BOGUS_COMMENT_STATE;
5251     ## Reconsume.
5252     $self->{ct} = {type => COMMENT_TOKEN,
5253     data => '?',
5254     line => $self->{line_prev},
5255     column => $self->{column_prev}
5256     - 1 * ($self->{nc} != -1),
5257     };
5258     redo A;
5259     } else {
5260 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5261 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5262     target => chr $self->{nc},
5263     data => '',
5264     line => $self->{line_prev},
5265     column => $self->{column_prev} - 1,
5266     };
5267     $self->{state} = PI_TARGET_STATE;
5268    
5269     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5270     $self->{line_prev} = $self->{line};
5271     $self->{column_prev} = $self->{column};
5272     $self->{column}++;
5273     $self->{nc}
5274     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5275     } else {
5276     $self->{set_nc}->($self);
5277     }
5278    
5279     redo A;
5280     }
5281     } elsif ($self->{state} == PI_TARGET_STATE) {
5282     if ($is_space->{$self->{nc}}) {
5283     $self->{state} = PI_TARGET_AFTER_STATE;
5284    
5285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5286     $self->{line_prev} = $self->{line};
5287     $self->{column_prev} = $self->{column};
5288     $self->{column}++;
5289     $self->{nc}
5290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5291     } else {
5292     $self->{set_nc}->($self);
5293     }
5294    
5295     redo A;
5296     } elsif ($self->{nc} == -1) {
5297     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5298 wakaba 1.13 if ($self->{in_subset}) {
5299     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5300     } else {
5301     $self->{state} = DATA_STATE;
5302     $self->{s_kwd} = '';
5303     }
5304 wakaba 1.8 ## Reconsume.
5305     return ($self->{ct}); # pi
5306     redo A;
5307     } elsif ($self->{nc} == 0x003F) { # ?
5308     $self->{state} = PI_AFTER_STATE;
5309    
5310     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5311     $self->{line_prev} = $self->{line};
5312     $self->{column_prev} = $self->{column};
5313     $self->{column}++;
5314     $self->{nc}
5315     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5316     } else {
5317     $self->{set_nc}->($self);
5318     }
5319    
5320     redo A;
5321     } else {
5322     ## XML5: typo ("tag name" -> "target")
5323     $self->{ct}->{target} .= chr $self->{nc}; # pi
5324    
5325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326     $self->{line_prev} = $self->{line};
5327     $self->{column_prev} = $self->{column};
5328     $self->{column}++;
5329     $self->{nc}
5330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331     } else {
5332     $self->{set_nc}->($self);
5333     }
5334    
5335     redo A;
5336     }
5337     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5338     if ($is_space->{$self->{nc}}) {
5339     ## Stay in the state.
5340    
5341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5342     $self->{line_prev} = $self->{line};
5343     $self->{column_prev} = $self->{column};
5344     $self->{column}++;
5345     $self->{nc}
5346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5347     } else {
5348     $self->{set_nc}->($self);
5349     }
5350    
5351     redo A;
5352     } else {
5353     $self->{state} = PI_DATA_STATE;
5354     ## Reprocess.
5355     redo A;
5356     }
5357     } elsif ($self->{state} == PI_DATA_STATE) {
5358     if ($self->{nc} == 0x003F) { # ?
5359     $self->{state} = PI_DATA_AFTER_STATE;
5360    
5361     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5362     $self->{line_prev} = $self->{line};
5363     $self->{column_prev} = $self->{column};
5364     $self->{column}++;
5365     $self->{nc}
5366     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5367     } else {
5368     $self->{set_nc}->($self);
5369     }
5370    
5371     redo A;
5372     } elsif ($self->{nc} == -1) {
5373     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5374 wakaba 1.13 if ($self->{in_subset}) {
5375 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5376 wakaba 1.13 } else {
5377     $self->{state} = DATA_STATE;
5378     $self->{s_kwd} = '';
5379     }
5380 wakaba 1.8 ## Reprocess.
5381     return ($self->{ct}); # pi
5382     redo A;
5383     } else {
5384     $self->{ct}->{data} .= chr $self->{nc}; # pi
5385     $self->{read_until}->($self->{ct}->{data}, q[?],
5386     length $self->{ct}->{data});
5387     ## Stay in the state.
5388    
5389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5390     $self->{line_prev} = $self->{line};
5391     $self->{column_prev} = $self->{column};
5392     $self->{column}++;
5393     $self->{nc}
5394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5395     } else {
5396     $self->{set_nc}->($self);
5397     }
5398    
5399     ## Reprocess.
5400     redo A;
5401     }
5402     } elsif ($self->{state} == PI_AFTER_STATE) {
5403 wakaba 1.14 ## XML5: Part of "Pi after state".
5404    
5405 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5406 wakaba 1.13 if ($self->{in_subset}) {
5407     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5408     } else {
5409     $self->{state} = DATA_STATE;
5410     $self->{s_kwd} = '';
5411     }
5412 wakaba 1.8
5413     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5414     $self->{line_prev} = $self->{line};
5415     $self->{column_prev} = $self->{column};
5416     $self->{column}++;
5417     $self->{nc}
5418     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5419     } else {
5420     $self->{set_nc}->($self);
5421     }
5422    
5423     return ($self->{ct}); # pi
5424     redo A;
5425     } elsif ($self->{nc} == 0x003F) { # ?
5426     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5427     line => $self->{line_prev},
5428     column => $self->{column_prev}); ## XML5: no error
5429     $self->{ct}->{data} .= '?';
5430     $self->{state} = PI_DATA_AFTER_STATE;
5431    
5432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5433     $self->{line_prev} = $self->{line};
5434     $self->{column_prev} = $self->{column};
5435     $self->{column}++;
5436     $self->{nc}
5437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5438     } else {
5439     $self->{set_nc}->($self);
5440     }
5441    
5442     redo A;
5443     } else {
5444     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5445     line => $self->{line_prev},
5446     column => $self->{column_prev}
5447     + 1 * ($self->{nc} == -1)); ## XML5: no error
5448     $self->{ct}->{data} .= '?'; ## XML5: not appended
5449     $self->{state} = PI_DATA_STATE;
5450     ## Reprocess.
5451     redo A;
5452     }
5453     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5454 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5455    
5456 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5457 wakaba 1.13 if ($self->{in_subset}) {
5458     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5459     } else {
5460     $self->{state} = DATA_STATE;
5461     $self->{s_kwd} = '';
5462     }
5463 wakaba 1.8
5464     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5465     $self->{line_prev} = $self->{line};
5466     $self->{column_prev} = $self->{column};
5467     $self->{column}++;
5468     $self->{nc}
5469     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5470     } else {
5471     $self->{set_nc}->($self);
5472     }
5473    
5474     return ($self->{ct}); # pi
5475     redo A;
5476     } elsif ($self->{nc} == 0x003F) { # ?
5477     $self->{ct}->{data} .= '?';
5478     ## Stay in the state.
5479    
5480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5481     $self->{line_prev} = $self->{line};
5482     $self->{column_prev} = $self->{column};
5483     $self->{column}++;
5484     $self->{nc}
5485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5486     } else {
5487     $self->{set_nc}->($self);
5488     }
5489    
5490     redo A;
5491     } else {
5492     $self->{ct}->{data} .= '?'; ## XML5: not appended
5493     $self->{state} = PI_DATA_STATE;
5494     ## Reprocess.
5495     redo A;
5496     }
5497 wakaba 1.12
5498     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5499     if ($self->{nc} == 0x003C) { # <
5500 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5501 wakaba 1.12
5502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503     $self->{line_prev} = $self->{line};
5504     $self->{column_prev} = $self->{column};
5505     $self->{column}++;
5506     $self->{nc}
5507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508     } else {
5509     $self->{set_nc}->($self);
5510     }
5511    
5512     redo A;
5513     } elsif ($self->{nc} == 0x0025) { # %
5514     ## XML5: Not defined yet.
5515    
5516     ## TODO:
5517 wakaba 1.24
5518     if (not $self->{stop_processing} and
5519     not $self->{document}->xml_standalone) {
5520     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5521     level => $self->{level}->{info});
5522     $self->{stop_processing} = 1;
5523     }
5524    
5525 wakaba 1.12
5526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5527     $self->{line_prev} = $self->{line};
5528     $self->{column_prev} = $self->{column};
5529     $self->{column}++;
5530     $self->{nc}
5531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5532     } else {
5533     $self->{set_nc}->($self);
5534     }
5535    
5536     redo A;
5537     } elsif ($self->{nc} == 0x005D) { # ]
5538 wakaba 1.13 delete $self->{in_subset};
5539 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5540    
5541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5542     $self->{line_prev} = $self->{line};
5543     $self->{column_prev} = $self->{column};
5544     $self->{column}++;
5545     $self->{nc}
5546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5547     } else {
5548     $self->{set_nc}->($self);
5549     }
5550    
5551     redo A;
5552     } elsif ($is_space->{$self->{nc}}) {
5553     ## Stay in the state.
5554    
5555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5556     $self->{line_prev} = $self->{line};
5557     $self->{column_prev} = $self->{column};
5558     $self->{column}++;
5559     $self->{nc}
5560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5561     } else {
5562     $self->{set_nc}->($self);
5563     }
5564    
5565     redo A;
5566     } elsif ($self->{nc} == -1) {
5567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5568 wakaba 1.13 delete $self->{in_subset};
5569 wakaba 1.12 $self->{state} = DATA_STATE;
5570     $self->{s_kwd} = '';
5571     ## Reconsume.
5572 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5573 wakaba 1.12 redo A;
5574     } else {
5575     unless ($self->{internal_subset_tainted}) {
5576     ## XML5: No parse error.
5577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5578     $self->{internal_subset_tainted} = 1;
5579     }
5580     ## Stay in the state.
5581    
5582     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5583     $self->{line_prev} = $self->{line};
5584     $self->{column_prev} = $self->{column};
5585     $self->{column}++;
5586     $self->{nc}
5587     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5588     } else {
5589     $self->{set_nc}->($self);
5590     }
5591    
5592     redo A;
5593     }
5594     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5595     if ($self->{nc} == 0x003E) { # >
5596     $self->{state} = DATA_STATE;
5597     $self->{s_kwd} = '';
5598    
5599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5600     $self->{line_prev} = $self->{line};
5601     $self->{column_prev} = $self->{column};
5602     $self->{column}++;
5603     $self->{nc}
5604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5605     } else {
5606     $self->{set_nc}->($self);
5607     }
5608    
5609 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5610 wakaba 1.12 redo A;
5611     } elsif ($self->{nc} == -1) {
5612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5613     $self->{state} = DATA_STATE;
5614     $self->{s_kwd} = '';
5615     ## Reconsume.
5616 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5617 wakaba 1.12 redo A;
5618     } else {
5619     ## XML5: No parse error and stay in the state.
5620     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5621    
5622 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5623    
5624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625     $self->{line_prev} = $self->{line};
5626     $self->{column_prev} = $self->{column};
5627     $self->{column}++;
5628     $self->{nc}
5629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630     } else {
5631     $self->{set_nc}->($self);
5632     }
5633    
5634     redo A;
5635     }
5636     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5637     if ($self->{nc} == 0x003E) { # >
5638     $self->{state} = DATA_STATE;
5639     $self->{s_kwd} = '';
5640    
5641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5642     $self->{line_prev} = $self->{line};
5643     $self->{column_prev} = $self->{column};
5644     $self->{column}++;
5645     $self->{nc}
5646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5647     } else {
5648     $self->{set_nc}->($self);
5649     }
5650    
5651     return ({type => END_OF_DOCTYPE_TOKEN});
5652     redo A;
5653     } elsif ($self->{nc} == -1) {
5654     $self->{state} = DATA_STATE;
5655     $self->{s_kwd} = '';
5656     ## Reconsume.
5657     return ({type => END_OF_DOCTYPE_TOKEN});
5658     redo A;
5659     } else {
5660     ## Stay in the state.
5661    
5662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5663     $self->{line_prev} = $self->{line};
5664     $self->{column_prev} = $self->{column};
5665     $self->{column}++;
5666     $self->{nc}
5667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5668     } else {
5669     $self->{set_nc}->($self);
5670     }
5671    
5672     redo A;
5673     }
5674     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5675     if ($self->{nc} == 0x0021) { # !
5676 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5677 wakaba 1.13
5678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5679     $self->{line_prev} = $self->{line};
5680     $self->{column_prev} = $self->{column};
5681     $self->{column}++;
5682     $self->{nc}
5683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5684     } else {
5685     $self->{set_nc}->($self);
5686     }
5687    
5688     redo A;
5689     } elsif ($self->{nc} == 0x003F) { # ?
5690     $self->{state} = PI_STATE;
5691    
5692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5693     $self->{line_prev} = $self->{line};
5694     $self->{column_prev} = $self->{column};
5695     $self->{column}++;
5696     $self->{nc}
5697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5698     } else {
5699     $self->{set_nc}->($self);
5700     }
5701    
5702     redo A;
5703     } elsif ($self->{nc} == -1) {
5704     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5705     $self->{state} = DATA_STATE;
5706     $self->{s_kwd} = '';
5707     ## Reconsume.
5708     redo A;
5709     } else {
5710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5711     line => $self->{line_prev},
5712     column => $self->{column_prev});
5713     $self->{state} = BOGUS_COMMENT_STATE;
5714     $self->{ct} = {type => COMMENT_TOKEN,
5715     data => '',
5716     }; ## NOTE: Will be discarded.
5717 wakaba 1.12
5718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5719     $self->{line_prev} = $self->{line};
5720     $self->{column_prev} = $self->{column};
5721     $self->{column}++;
5722     $self->{nc}
5723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5724     } else {
5725     $self->{set_nc}->($self);
5726     }
5727    
5728     redo A;
5729     }
5730 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5731     ## XML5: "DOCTYPE markup declaration state".
5732    
5733     if ($self->{nc} == 0x002D) { # -
5734     $self->{state} = MD_HYPHEN_STATE;
5735    
5736     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5737     $self->{line_prev} = $self->{line};
5738     $self->{column_prev} = $self->{column};
5739     $self->{column}++;
5740     $self->{nc}
5741     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5742     } else {
5743     $self->{set_nc}->($self);
5744     }
5745    
5746     redo A;
5747 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5748     $self->{nc} == 0x0065) { # e
5749 wakaba 1.14 $self->{state} = MD_E_STATE;
5750     $self->{kwd} = chr $self->{nc};
5751    
5752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5753     $self->{line_prev} = $self->{line};
5754     $self->{column_prev} = $self->{column};
5755     $self->{column}++;
5756     $self->{nc}
5757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5758     } else {
5759     $self->{set_nc}->($self);
5760     }
5761    
5762     redo A;
5763 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5764     $self->{nc} == 0x0061) { # a
5765 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5766     $self->{kwd} = chr $self->{nc};
5767    
5768     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769     $self->{line_prev} = $self->{line};
5770     $self->{column_prev} = $self->{column};
5771     $self->{column}++;
5772     $self->{nc}
5773     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774     } else {
5775     $self->{set_nc}->($self);
5776     }
5777    
5778     redo A;
5779 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5780     $self->{nc} == 0x006E) { # n
5781 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5782     $self->{kwd} = chr $self->{nc};
5783    
5784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5785     $self->{line_prev} = $self->{line};
5786     $self->{column_prev} = $self->{column};
5787     $self->{column}++;
5788     $self->{nc}
5789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5790     } else {
5791     $self->{set_nc}->($self);
5792     }
5793    
5794     redo A;
5795     } else {
5796     #
5797     }
5798    
5799     ## XML5: No parse error.
5800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5801     line => $self->{line_prev},
5802     column => $self->{column_prev} - 1);
5803     ## Reconsume.
5804     $self->{state} = BOGUS_COMMENT_STATE;
5805     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5806     redo A;
5807     } elsif ($self->{state} == MD_E_STATE) {
5808 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5809     $self->{nc} == 0x006E) { # n
5810 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5811     $self->{kwd} .= chr $self->{nc};
5812    
5813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5814     $self->{line_prev} = $self->{line};
5815     $self->{column_prev} = $self->{column};
5816     $self->{column}++;
5817     $self->{nc}
5818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5819     } else {
5820     $self->{set_nc}->($self);
5821     }
5822    
5823     redo A;
5824 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5825     $self->{nc} == 0x006C) { # l
5826 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5827     $self->{state} = MD_ELEMENT_STATE;
5828     $self->{kwd} .= chr $self->{nc};
5829    
5830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5831     $self->{line_prev} = $self->{line};
5832     $self->{column_prev} = $self->{column};
5833     $self->{column}++;
5834     $self->{nc}
5835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5836     } else {
5837     $self->{set_nc}->($self);
5838     }
5839    
5840     redo A;
5841     } else {
5842     ## XML5: No parse error.
5843     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5844     line => $self->{line_prev},
5845     column => $self->{column_prev} - 2
5846     + 1 * ($self->{nc} == -1));
5847     ## Reconsume.
5848     $self->{state} = BOGUS_COMMENT_STATE;
5849     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5850     redo A;
5851     }
5852     } elsif ($self->{state} == MD_ENTITY_STATE) {
5853 wakaba 1.17 if ($self->{nc} == [
5854     undef,
5855     undef,
5856     0x0054, # T
5857     0x0049, # I
5858     0x0054, # T
5859     ]->[length $self->{kwd}] or
5860     $self->{nc} == [
5861     undef,
5862     undef,
5863     0x0074, # t
5864     0x0069, # i
5865     0x0074, # t
5866     ]->[length $self->{kwd}]) {
5867 wakaba 1.14 ## Stay in the state.
5868     $self->{kwd} .= chr $self->{nc};
5869    
5870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5871     $self->{line_prev} = $self->{line};
5872     $self->{column_prev} = $self->{column};
5873     $self->{column}++;
5874     $self->{nc}
5875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5876     } else {
5877     $self->{set_nc}->($self);
5878     }
5879    
5880     redo A;
5881 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5882     ($self->{nc} == 0x0059 or # Y
5883     $self->{nc} == 0x0079)) { # y
5884     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5885     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5886     text => 'ENTITY',
5887     line => $self->{line_prev},
5888     column => $self->{column_prev} - 4);
5889     }
5890     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5891 wakaba 1.14 line => $self->{line_prev},
5892     column => $self->{column_prev} - 6};
5893     $self->{state} = DOCTYPE_MD_STATE;
5894    
5895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5896     $self->{line_prev} = $self->{line};
5897     $self->{column_prev} = $self->{column};
5898     $self->{column}++;
5899     $self->{nc}
5900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5901     } else {
5902     $self->{set_nc}->($self);
5903     }
5904    
5905     redo A;
5906     } else {
5907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5908     line => $self->{line_prev},
5909     column => $self->{column_prev} - 1
5910     - (length $self->{kwd})
5911     + 1 * ($self->{nc} == -1));
5912     $self->{state} = BOGUS_COMMENT_STATE;
5913     ## Reconsume.
5914     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5915     redo A;
5916     }
5917     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5918 wakaba 1.17 if ($self->{nc} == [
5919     undef,
5920     undef,
5921     0x0045, # E
5922     0x004D, # M
5923     0x0045, # E
5924     0x004E, # N
5925     ]->[length $self->{kwd}] or
5926     $self->{nc} == [
5927     undef,
5928     undef,
5929     0x0065, # e
5930     0x006D, # m
5931     0x0065, # e
5932     0x006E, # n
5933     ]->[length $self->{kwd}]) {
5934 wakaba 1.14 ## Stay in the state.
5935     $self->{kwd} .= chr $self->{nc};
5936    
5937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938     $self->{line_prev} = $self->{line};
5939     $self->{column_prev} = $self->{column};
5940     $self->{column}++;
5941     $self->{nc}
5942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943     } else {
5944     $self->{set_nc}->($self);
5945     }
5946    
5947     redo A;
5948 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5949     ($self->{nc} == 0x0054 or # T
5950     $self->{nc} == 0x0074)) { # t
5951     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5953     text => 'ELEMENT',
5954     line => $self->{line_prev},
5955     column => $self->{column_prev} - 5);
5956     }
5957 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5958     line => $self->{line_prev},
5959 wakaba 1.23 column => $self->{column_prev} - 7};
5960 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5961    
5962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5963     $self->{line_prev} = $self->{line};
5964     $self->{column_prev} = $self->{column};
5965     $self->{column}++;
5966     $self->{nc}
5967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5968     } else {
5969     $self->{set_nc}->($self);
5970     }
5971    
5972     redo A;
5973     } else {
5974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5975     line => $self->{line_prev},
5976     column => $self->{column_prev} - 1
5977     - (length $self->{kwd})
5978     + 1 * ($self->{nc} == -1));
5979     $self->{state} = BOGUS_COMMENT_STATE;
5980     ## Reconsume.
5981     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5982     redo A;
5983     }
5984     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5985 wakaba 1.17 if ($self->{nc} == [
5986     undef,
5987     0x0054, # T
5988     0x0054, # T
5989     0x004C, # L
5990     0x0049, # I
5991     0x0053, # S
5992     ]->[length $self->{kwd}] or
5993     $self->{nc} == [
5994     undef,
5995     0x0074, # t
5996     0x0074, # t
5997     0x006C, # l
5998     0x0069, # i
5999     0x0073, # s
6000     ]->[length $self->{kwd}]) {
6001 wakaba 1.14 ## Stay in the state.
6002     $self->{kwd} .= chr $self->{nc};
6003    
6004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6005     $self->{line_prev} = $self->{line};
6006     $self->{column_prev} = $self->{column};
6007     $self->{column}++;
6008     $self->{nc}
6009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6010     } else {
6011     $self->{set_nc}->($self);
6012     }
6013    
6014     redo A;
6015 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6016     ($self->{nc} == 0x0054 or # T
6017     $self->{nc} == 0x0074)) { # t
6018     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6019     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6020     text => 'ATTLIST',
6021     line => $self->{line_prev},
6022     column => $self->{column_prev} - 5);
6023     }
6024 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6025 wakaba 1.15 attrdefs => [],
6026 wakaba 1.14 line => $self->{line_prev},
6027 wakaba 1.23 column => $self->{column_prev} - 7};
6028 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6029    
6030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6031     $self->{line_prev} = $self->{line};
6032     $self->{column_prev} = $self->{column};
6033     $self->{column}++;
6034     $self->{nc}
6035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6036     } else {
6037     $self->{set_nc}->($self);
6038     }
6039    
6040     redo A;
6041     } else {
6042     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6043     line => $self->{line_prev},
6044     column => $self->{column_prev} - 1
6045     - (length $self->{kwd})
6046     + 1 * ($self->{nc} == -1));
6047     $self->{state} = BOGUS_COMMENT_STATE;
6048     ## Reconsume.
6049     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6050     redo A;
6051     }
6052     } elsif ($self->{state} == MD_NOTATION_STATE) {
6053 wakaba 1.17 if ($self->{nc} == [
6054     undef,
6055     0x004F, # O
6056     0x0054, # T
6057     0x0041, # A
6058     0x0054, # T
6059     0x0049, # I
6060     0x004F, # O
6061     ]->[length $self->{kwd}] or
6062     $self->{nc} == [
6063     undef,
6064     0x006F, # o
6065     0x0074, # t
6066     0x0061, # a
6067     0x0074, # t
6068     0x0069, # i
6069     0x006F, # o
6070     ]->[length $self->{kwd}]) {
6071 wakaba 1.14 ## Stay in the state.
6072     $self->{kwd} .= chr $self->{nc};
6073    
6074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6075     $self->{line_prev} = $self->{line};
6076     $self->{column_prev} = $self->{column};
6077     $self->{column}++;
6078     $self->{nc}
6079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6080     } else {
6081     $self->{set_nc}->($self);
6082     }
6083    
6084     redo A;
6085 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6086     ($self->{nc} == 0x004E or # N
6087     $self->{nc} == 0x006E)) { # n
6088     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6089     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6090     text => 'NOTATION',
6091     line => $self->{line_prev},
6092     column => $self->{column_prev} - 6);
6093     }
6094 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6095     line => $self->{line_prev},
6096 wakaba 1.23 column => $self->{column_prev} - 8};
6097 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6098    
6099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6100     $self->{line_prev} = $self->{line};
6101     $self->{column_prev} = $self->{column};
6102     $self->{column}++;
6103     $self->{nc}
6104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6105     } else {
6106     $self->{set_nc}->($self);
6107     }
6108    
6109     redo A;
6110     } else {
6111     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6112     line => $self->{line_prev},
6113     column => $self->{column_prev} - 1
6114     - (length $self->{kwd})
6115     + 1 * ($self->{nc} == -1));
6116     $self->{state} = BOGUS_COMMENT_STATE;
6117     ## Reconsume.
6118     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6119     redo A;
6120     }
6121     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6122     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6123     ## "DOCTYPE NOTATION state".
6124    
6125     if ($is_space->{$self->{nc}}) {
6126     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6127     $self->{state} = BEFORE_MD_NAME_STATE;
6128    
6129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6130     $self->{line_prev} = $self->{line};
6131     $self->{column_prev} = $self->{column};
6132     $self->{column}++;
6133     $self->{nc}
6134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6135     } else {
6136     $self->{set_nc}->($self);
6137     }
6138    
6139     redo A;
6140     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6141     $self->{nc} == 0x0025) { # %
6142     ## XML5: Switch to the "DOCTYPE bogus comment state".
6143     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6144     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6145    
6146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6147     $self->{line_prev} = $self->{line};
6148     $self->{column_prev} = $self->{column};
6149     $self->{column}++;
6150     $self->{nc}
6151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6152     } else {
6153     $self->{set_nc}->($self);
6154     }
6155    
6156     redo A;
6157     } elsif ($self->{nc} == -1) {
6158     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6159     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6160     ## Reconsume.
6161     redo A;
6162     } elsif ($self->{nc} == 0x003E) { # >
6163     ## XML5: Switch to the "DOCTYPE bogus comment state".
6164     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6165     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6166    
6167     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6168     $self->{line_prev} = $self->{line};
6169     $self->{column_prev} = $self->{column};
6170     $self->{column}++;
6171     $self->{nc}
6172     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6173     } else {
6174     $self->{set_nc}->($self);
6175     }
6176    
6177     redo A;
6178     } else {
6179     ## XML5: Switch to the "DOCTYPE bogus comment state".
6180     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6181     $self->{state} = BEFORE_MD_NAME_STATE;
6182     redo A;
6183     }
6184     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6185     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6186     ## before state", "DOCTYPE ATTLIST name before state".
6187    
6188     if ($is_space->{$self->{nc}}) {
6189     ## Stay in the state.
6190    
6191     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6192     $self->{line_prev} = $self->{line};
6193     $self->{column_prev} = $self->{column};
6194     $self->{column}++;
6195     $self->{nc}
6196     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6197     } else {
6198     $self->{set_nc}->($self);
6199     }
6200    
6201     redo A;
6202     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6203     $self->{nc} == 0x0025) { # %
6204     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6205    
6206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6207     $self->{line_prev} = $self->{line};
6208     $self->{column_prev} = $self->{column};
6209     $self->{column}++;
6210     $self->{nc}
6211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6212     } else {
6213     $self->{set_nc}->($self);
6214     }
6215    
6216     redo A;
6217     } elsif ($self->{nc} == 0x003E) { # >
6218     ## XML5: Same as "Anything else".
6219     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6220     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6221    
6222     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6223     $self->{line_prev} = $self->{line};
6224     $self->{column_prev} = $self->{column};
6225     $self->{column}++;
6226     $self->{nc}
6227     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6228     } else {
6229     $self->{set_nc}->($self);
6230     }
6231    
6232     redo A;
6233     } elsif ($self->{nc} == -1) {
6234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6236     ## Reconsume.
6237     redo A;
6238     } else {
6239     ## XML5: [ATTLIST] Not defined yet.
6240     $self->{ct}->{name} .= chr $self->{nc};
6241     $self->{state} = MD_NAME_STATE;
6242    
6243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6244     $self->{line_prev} = $self->{line};
6245     $self->{column_prev} = $self->{column};
6246     $self->{column}++;
6247     $self->{nc}
6248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6249     } else {
6250     $self->{set_nc}->($self);
6251     }
6252    
6253     redo A;
6254     }
6255     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6256     if ($is_space->{$self->{nc}}) {
6257     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6258     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6259     $self->{state} = BEFORE_MD_NAME_STATE;
6260 wakaba 1.8
6261 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6262     $self->{line_prev} = $self->{line};
6263     $self->{column_prev} = $self->{column};
6264     $self->{column}++;
6265     $self->{nc}
6266     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6267     } else {
6268     $self->{set_nc}->($self);
6269     }
6270    
6271     redo A;
6272     } elsif ($self->{nc} == 0x003E) { # >
6273     ## XML5: Same as "Anything else".
6274     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6275     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6276    
6277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6278     $self->{line_prev} = $self->{line};
6279     $self->{column_prev} = $self->{column};
6280     $self->{column}++;
6281     $self->{nc}
6282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6283     } else {
6284     $self->{set_nc}->($self);
6285     }
6286    
6287     redo A;
6288     } elsif ($self->{nc} == -1) {
6289     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6290     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6291     ## Reconsume.
6292     redo A;
6293     } else {
6294     ## XML5: No parse error.
6295     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6296     $self->{state} = BOGUS_COMMENT_STATE;
6297     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6298     ## Reconsume.
6299     redo A;
6300     }
6301     } elsif ($self->{state} == MD_NAME_STATE) {
6302     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6303    
6304     if ($is_space->{$self->{nc}}) {
6305 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6306     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6307     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6308 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6309 wakaba 1.16 } else { # ENTITY/NOTATION
6310     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6311     }
6312 wakaba 1.14
6313     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6314     $self->{line_prev} = $self->{line};
6315     $self->{column_prev} = $self->{column};
6316     $self->{column}++;
6317     $self->{nc}
6318     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6319     } else {
6320     $self->{set_nc}->($self);
6321     }
6322    
6323     redo A;
6324     } elsif ($self->{nc} == 0x003E) { # >
6325     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6326     #
6327     } else {
6328 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6329 wakaba 1.14 }
6330     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6331    
6332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6333     $self->{line_prev} = $self->{line};
6334     $self->{column_prev} = $self->{column};
6335     $self->{column}++;
6336     $self->{nc}
6337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6338     } else {
6339     $self->{set_nc}->($self);
6340     }
6341    
6342     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6343     redo A;
6344     } elsif ($self->{nc} == -1) {
6345     ## XML5: [ATTLIST] No parse error.
6346     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6348     ## Reconsume.
6349     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6350     redo A;
6351     } else {
6352     ## XML5: [ATTLIST] Not defined yet.
6353     $self->{ct}->{name} .= chr $self->{nc};
6354     ## Stay in the state.
6355    
6356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6357     $self->{line_prev} = $self->{line};
6358     $self->{column_prev} = $self->{column};
6359     $self->{column}++;
6360     $self->{nc}
6361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6362     } else {
6363     $self->{set_nc}->($self);
6364     }
6365    
6366     redo A;
6367     }
6368     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6369     if ($is_space->{$self->{nc}}) {
6370     ## Stay in the state.
6371    
6372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6373     $self->{line_prev} = $self->{line};
6374     $self->{column_prev} = $self->{column};
6375     $self->{column}++;
6376     $self->{nc}
6377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6378     } else {
6379     $self->{set_nc}->($self);
6380     }
6381    
6382     redo A;
6383     } elsif ($self->{nc} == 0x003E) { # >
6384     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6385    
6386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6387     $self->{line_prev} = $self->{line};
6388     $self->{column_prev} = $self->{column};
6389     $self->{column}++;
6390     $self->{nc}
6391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6392     } else {
6393     $self->{set_nc}->($self);
6394     }
6395    
6396     return ($self->{ct}); # ATTLIST
6397     redo A;
6398     } elsif ($self->{nc} == -1) {
6399     ## XML5: No parse error.
6400     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6401     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6402 wakaba 1.15 return ($self->{ct});
6403 wakaba 1.14 redo A;
6404     } else {
6405     ## XML5: Not defined yet.
6406 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6407     tokens => [],
6408     line => $self->{line}, column => $self->{column}};
6409     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6410    
6411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6412     $self->{line_prev} = $self->{line};
6413     $self->{column_prev} = $self->{column};
6414     $self->{column}++;
6415     $self->{nc}
6416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6417     } else {
6418     $self->{set_nc}->($self);
6419     }
6420    
6421     redo A;
6422     }
6423     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6424     if ($is_space->{$self->{nc}}) {
6425     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6426    
6427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6428     $self->{line_prev} = $self->{line};
6429     $self->{column_prev} = $self->{column};
6430     $self->{column}++;
6431     $self->{nc}
6432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6433     } else {
6434     $self->{set_nc}->($self);
6435     }
6436    
6437     redo A;
6438     } elsif ($self->{nc} == 0x003E) { # >
6439     ## XML5: Same as "anything else".
6440     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6441     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6442    
6443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6444     $self->{line_prev} = $self->{line};
6445     $self->{column_prev} = $self->{column};
6446     $self->{column}++;
6447     $self->{nc}
6448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6449     } else {
6450     $self->{set_nc}->($self);
6451     }
6452    
6453     return ($self->{ct}); # ATTLIST
6454     redo A;
6455     } elsif ($self->{nc} == 0x0028) { # (
6456     ## XML5: Same as "anything else".
6457     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6458     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6459    
6460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6461     $self->{line_prev} = $self->{line};
6462     $self->{column_prev} = $self->{column};
6463     $self->{column}++;
6464     $self->{nc}
6465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6466     } else {
6467     $self->{set_nc}->($self);
6468     }
6469    
6470     redo A;
6471     } elsif ($self->{nc} == -1) {
6472     ## XML5: No parse error.
6473     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6474     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6475    
6476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6477     $self->{line_prev} = $self->{line};
6478     $self->{column_prev} = $self->{column};
6479     $self->{column}++;
6480     $self->{nc}
6481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6482     } else {
6483     $self->{set_nc}->($self);
6484     }
6485    
6486     return ($self->{ct}); # ATTLIST
6487     redo A;
6488     } else {
6489     ## XML5: Not defined yet.
6490     $self->{ca}->{name} .= chr $self->{nc};
6491     ## Stay in the state.
6492    
6493     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6494     $self->{line_prev} = $self->{line};
6495     $self->{column_prev} = $self->{column};
6496     $self->{column}++;
6497     $self->{nc}
6498     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6499     } else {
6500     $self->{set_nc}->($self);
6501     }
6502    
6503 wakaba 1.14 redo A;
6504     }
6505 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6506     if ($is_space->{$self->{nc}}) {
6507     ## Stay in the state.
6508    
6509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6510     $self->{line_prev} = $self->{line};
6511     $self->{column_prev} = $self->{column};
6512     $self->{column}++;
6513     $self->{nc}
6514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6515     } else {
6516     $self->{set_nc}->($self);
6517     }
6518    
6519     redo A;
6520     } elsif ($self->{nc} == 0x003E) { # >
6521     ## XML5: Same as "anything else".
6522     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6523     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6524    
6525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6526     $self->{line_prev} = $self->{line};
6527     $self->{column_prev} = $self->{column};
6528     $self->{column}++;
6529     $self->{nc}
6530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6531     } else {
6532     $self->{set_nc}->($self);
6533     }
6534    
6535     return ($self->{ct}); # ATTLIST
6536     redo A;
6537     } elsif ($self->{nc} == 0x0028) { # (
6538     ## XML5: Same as "anything else".
6539     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6540    
6541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6542     $self->{line_prev} = $self->{line};
6543     $self->{column_prev} = $self->{column};
6544     $self->{column}++;
6545     $self->{nc}
6546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6547     } else {
6548     $self->{set_nc}->($self);
6549     }
6550    
6551     redo A;
6552     } elsif ($self->{nc} == -1) {
6553     ## XML5: No parse error.
6554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6555     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6556    
6557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6558     $self->{line_prev} = $self->{line};
6559     $self->{column_prev} = $self->{column};
6560     $self->{column}++;
6561     $self->{nc}
6562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6563     } else {
6564     $self->{set_nc}->($self);
6565     }
6566    
6567     return ($self->{ct});
6568     redo A;
6569     } else {
6570     ## XML5: Not defined yet.
6571     $self->{ca}->{type} = chr $self->{nc};
6572     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6573    
6574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6575     $self->{line_prev} = $self->{line};
6576     $self->{column_prev} = $self->{column};
6577     $self->{column}++;
6578     $self->{nc}
6579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6580     } else {
6581     $self->{set_nc}->($self);
6582     }
6583    
6584     redo A;
6585     }
6586     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6587     if ($is_space->{$self->{nc}}) {
6588     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6589    
6590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6591     $self->{line_prev} = $self->{line};
6592     $self->{column_prev} = $self->{column};
6593     $self->{column}++;
6594     $self->{nc}
6595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6596     } else {
6597     $self->{set_nc}->($self);
6598     }
6599    
6600     redo A;
6601     } elsif ($self->{nc} == 0x0023) { # #
6602     ## XML5: Same as "anything else".
6603     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6604     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6605    
6606     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6607     $self->{line_prev} = $self->{line};
6608     $self->{column_prev} = $self->{column};
6609     $self->{column}++;
6610     $self->{nc}
6611     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6612     } else {
6613     $self->{set_nc}->($self);
6614     }
6615    
6616     redo A;
6617     } elsif ($self->{nc} == 0x0022) { # "
6618     ## XML5: Same as "anything else".
6619     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6620     $self->{ca}->{value} = '';
6621     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6622    
6623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6624     $self->{line_prev} = $self->{line};
6625     $self->{column_prev} = $self->{column};
6626     $self->{column}++;
6627     $self->{nc}
6628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6629     } else {
6630     $self->{set_nc}->($self);
6631     }
6632    
6633     redo A;
6634     } elsif ($self->{nc} == 0x0027) { # '
6635     ## XML5: Same as "anything else".
6636     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6637     $self->{ca}->{value} = '';
6638     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6639    
6640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6641     $self->{line_prev} = $self->{line};
6642     $self->{column_prev} = $self->{column};
6643     $self->{column}++;
6644     $self->{nc}
6645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6646     } else {
6647     $self->{set_nc}->($self);
6648     }
6649    
6650     redo A;
6651     } elsif ($self->{nc} == 0x003E) { # >
6652     ## XML5: Same as "anything else".
6653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6654     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6655    
6656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6657     $self->{line_prev} = $self->{line};
6658     $self->{column_prev} = $self->{column};
6659     $self->{column}++;
6660     $self->{nc}
6661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6662     } else {
6663     $self->{set_nc}->($self);
6664     }
6665    
6666     return ($self->{ct}); # ATTLIST
6667     redo A;
6668     } elsif ($self->{nc} == 0x0028) { # (
6669     ## XML5: Same as "anything else".
6670     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6671     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6672    
6673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6674     $self->{line_prev} = $self->{line};
6675     $self->{column_prev} = $self->{column};
6676     $self->{column}++;
6677     $self->{nc}
6678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6679     } else {
6680     $self->{set_nc}->($self);
6681     }
6682    
6683     redo A;
6684     } elsif ($self->{nc} == -1) {
6685     ## XML5: No parse error.
6686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6687     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6688    
6689     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6690     $self->{line_prev} = $self->{line};
6691     $self->{column_prev} = $self->{column};
6692     $self->{column}++;
6693     $self->{nc}
6694     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6695     } else {
6696     $self->{set_nc}->($self);
6697     }
6698    
6699     return ($self->{ct});
6700     redo A;
6701     } else {
6702     ## XML5: Not defined yet.
6703     $self->{ca}->{type} .= chr $self->{nc};
6704     ## Stay in the state.
6705    
6706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6707     $self->{line_prev} = $self->{line};
6708     $self->{column_prev} = $self->{column};
6709     $self->{column}++;
6710     $self->{nc}
6711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6712     } else {
6713     $self->{set_nc}->($self);
6714     }
6715    
6716     redo A;
6717     }
6718     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6719     if ($is_space->{$self->{nc}}) {
6720     ## Stay in the state.
6721    
6722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6723     $self->{line_prev} = $self->{line};
6724     $self->{column_prev} = $self->{column};
6725     $self->{column}++;
6726     $self->{nc}
6727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6728     } else {
6729     $self->{set_nc}->($self);
6730     }
6731    
6732     redo A;
6733     } elsif ($self->{nc} == 0x0028) { # (
6734     ## XML5: Same as "anything else".
6735     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6736    
6737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6738     $self->{line_prev} = $self->{line};
6739     $self->{column_prev} = $self->{column};
6740     $self->{column}++;
6741     $self->{nc}
6742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6743     } else {
6744     $self->{set_nc}->($self);
6745     }
6746    
6747     redo A;
6748     } elsif ($self->{nc} == 0x0023) { # #
6749     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6750    
6751     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6752     $self->{line_prev} = $self->{line};
6753     $self->{column_prev} = $self->{column};
6754     $self->{column}++;
6755     $self->{nc}
6756     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6757     } else {
6758     $self->{set_nc}->($self);
6759     }
6760    
6761     redo A;
6762     } elsif ($self->{nc} == 0x0022) { # "
6763     ## XML5: Same as "anything else".
6764     $self->{ca}->{value} = '';
6765     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6766    
6767     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6768     $self->{line_prev} = $self->{line};
6769     $self->{column_prev} = $self->{column};
6770     $self->{column}++;
6771     $self->{nc}
6772     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6773     } else {
6774     $self->{set_nc}->($self);
6775     }
6776    
6777     redo A;
6778     } elsif ($self->{nc} == 0x0027) { # '
6779     ## XML5: Same as "anything else".
6780     $self->{ca}->{value} = '';
6781     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6782    
6783     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6784     $self->{line_prev} = $self->{line};
6785     $self->{column_prev} = $self->{column};
6786     $self->{column}++;
6787     $self->{nc}
6788     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6789     } else {
6790     $self->{set_nc}->($self);
6791     }
6792    
6793     redo A;
6794     } elsif ($self->{nc} == 0x003E) { # >
6795     ## XML5: Same as "anything else".
6796     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6797     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6798    
6799     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6800     $self->{line_prev} = $self->{line};
6801     $self->{column_prev} = $self->{column};
6802     $self->{column}++;
6803     $self->{nc}
6804     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6805     } else {
6806     $self->{set_nc}->($self);
6807     }
6808    
6809     return ($self->{ct}); # ATTLIST
6810     redo A;
6811     } elsif ($self->{nc} == -1) {
6812     ## XML5: No parse error.
6813     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6814     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6815    
6816     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6817     $self->{line_prev} = $self->{line};
6818     $self->{column_prev} = $self->{column};
6819     $self->{column}++;
6820     $self->{nc}
6821     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6822     } else {
6823     $self->{set_nc}->($self);
6824     }
6825    
6826     return ($self->{ct});
6827     redo A;
6828     } else {
6829     ## XML5: Switch to the "DOCTYPE bogus comment state".
6830     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6831     $self->{ca}->{value} = '';
6832     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6833     ## Reconsume.
6834     redo A;
6835     }
6836     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6837     if ($is_space->{$self->{nc}}) {
6838     ## Stay in the state.
6839    
6840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6841     $self->{line_prev} = $self->{line};
6842     $self->{column_prev} = $self->{column};
6843     $self->{column}++;
6844     $self->{nc}
6845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6846     } else {
6847     $self->{set_nc}->($self);
6848     }
6849    
6850     redo A;
6851     } elsif ($self->{nc} == 0x007C) { # |
6852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6853     ## Stay in the state.
6854    
6855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6856     $self->{line_prev} = $self->{line};
6857     $self->{column_prev} = $self->{column};
6858     $self->{column}++;
6859     $self->{nc}
6860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6861     } else {
6862     $self->{set_nc}->($self);
6863     }
6864    
6865     redo A;
6866     } elsif ($self->{nc} == 0x0029) { # )
6867     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6868     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6869    
6870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6871     $self->{line_prev} = $self->{line};
6872     $self->{column_prev} = $self->{column};
6873     $self->{column}++;
6874     $self->{nc}
6875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6876     } else {
6877     $self->{set_nc}->($self);
6878     }
6879    
6880     redo A;
6881     } elsif ($self->{nc} == 0x003E) { # >
6882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6884    
6885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6886     $self->{line_prev} = $self->{line};
6887     $self->{column_prev} = $self->{column};
6888     $self->{column}++;
6889     $self->{nc}
6890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6891     } else {
6892     $self->{set_nc}->($self);
6893     }
6894    
6895     return ($self->{ct}); # ATTLIST
6896     redo A;
6897     } elsif ($self->{nc} == -1) {
6898     ## XML5: No parse error.
6899     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6900     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6901    
6902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6903     $self->{line_prev} = $self->{line};
6904     $self->{column_prev} = $self->{column};
6905     $self->{column}++;
6906     $self->{nc}
6907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6908     } else {
6909     $self->{set_nc}->($self);
6910     }
6911    
6912     return ($self->{ct});
6913     redo A;
6914     } else {
6915     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6916     $self->{state} = ALLOWED_TOKEN_STATE;
6917    
6918     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6919     $self->{line_prev} = $self->{line};
6920     $self->{column_prev} = $self->{column};
6921     $self->{column}++;
6922     $self->{nc}
6923     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6924     } else {
6925     $self->{set_nc}->($self);
6926     }
6927    
6928     redo A;
6929     }
6930     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6931     if ($is_space->{$self->{nc}}) {
6932     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6933    
6934     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6935     $self->{line_prev} = $self->{line};
6936     $self->{column_prev} = $self->{column};
6937     $self->{column}++;
6938     $self->{nc}
6939     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6940     } else {
6941     $self->{set_nc}->($self);
6942     }
6943    
6944     redo A;
6945     } elsif ($self->{nc} == 0x007C) { # |
6946     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6947    
6948     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6949     $self->{line_prev} = $self->{line};
6950     $self->{column_prev} = $self->{column};
6951     $self->{column}++;
6952     $self->{nc}
6953     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6954     } else {
6955     $self->{set_nc}->($self);
6956     }
6957    
6958     redo A;
6959     } elsif ($self->{nc} == 0x0029) { # )
6960     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6961    
6962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6963     $self->{line_prev} = $self->{line};
6964     $self->{column_prev} = $self->{column};
6965     $self->{column}++;
6966     $self->{nc}
6967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6968     } else {
6969     $self->{set_nc}->($self);
6970     }
6971    
6972     redo A;
6973     } elsif ($self->{nc} == 0x003E) { # >
6974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6975     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6976    
6977     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6978     $self->{line_prev} = $self->{line};
6979     $self->{column_prev} = $self->{column};
6980     $self->{column}++;
6981     $self->{nc}
6982     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6983     } else {
6984     $self->{set_nc}->($self);
6985     }
6986    
6987     return ($self->{ct}); # ATTLIST
6988     redo A;
6989     } elsif ($self->{nc} == -1) {
6990     ## XML5: No parse error.
6991     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6992     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6993    
6994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6995     $self->{line_prev} = $self->{line};
6996     $self->{column_prev} = $self->{column};
6997     $self->{column}++;
6998     $self->{nc}
6999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7000     } else {
7001     $self->{set_nc}->($self);
7002     }
7003    
7004     return ($self->{ct});
7005     redo A;
7006     } else {
7007     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7008     ## Stay in the state.
7009    
7010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7011     $self->{line_prev} = $self->{line};
7012     $self->{column_prev} = $self->{column};
7013     $self->{column}++;
7014     $self->{nc}
7015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7016     } else {
7017     $self->{set_nc}->($self);
7018     }
7019    
7020     redo A;
7021     }
7022     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7023     if ($is_space->{$self->{nc}}) {
7024     ## Stay in the state.
7025    
7026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7027     $self->{line_prev} = $self->{line};
7028     $self->{column_prev} = $self->{column};
7029     $self->{column}++;
7030     $self->{nc}
7031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7032     } else {
7033     $self->{set_nc}->($self);
7034     }
7035    
7036     redo A;
7037     } elsif ($self->{nc} == 0x007C) { # |
7038     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7039    
7040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7041     $self->{line_prev} = $self->{line};
7042     $self->{column_prev} = $self->{column};
7043     $self->{column}++;
7044     $self->{nc}
7045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7046     } else {
7047     $self->{set_nc}->($self);
7048     }
7049    
7050     redo A;
7051     } elsif ($self->{nc} == 0x0029) { # )
7052     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7053    
7054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7055     $self->{line_prev} = $self->{line};
7056     $self->{column_prev} = $self->{column};
7057     $self->{column}++;
7058     $self->{nc}
7059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7060     } else {
7061     $self->{set_nc}->($self);
7062     }
7063    
7064     redo A;
7065     } elsif ($self->{nc} == 0x003E) { # >
7066     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7067     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7068    
7069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7070     $self->{line_prev} = $self->{line};
7071     $self->{column_prev} = $self->{column};
7072     $self->{column}++;
7073     $self->{nc}
7074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7075     } else {
7076     $self->{set_nc}->($self);
7077     }
7078    
7079     return ($self->{ct}); # ATTLIST
7080     redo A;
7081     } elsif ($self->{nc} == -1) {
7082     ## XML5: No parse error.
7083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7085    
7086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7087     $self->{line_prev} = $self->{line};
7088     $self->{column_prev} = $self->{column};
7089     $self->{column}++;
7090     $self->{nc}
7091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7092     } else {
7093     $self->{set_nc}->($self);
7094     }
7095    
7096     return ($self->{ct});
7097     redo A;
7098     } else {
7099     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7100     line => $self->{line_prev},
7101     column => $self->{column_prev});
7102     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7103     $self->{state} = ALLOWED_TOKEN_STATE;
7104    
7105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7106     $self->{line_prev} = $self->{line};
7107     $self->{column_prev} = $self->{column};
7108     $self->{column}++;
7109     $self->{nc}
7110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7111     } else {
7112     $self->{set_nc}->($self);
7113     }
7114    
7115     redo A;
7116     }
7117     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7118     if ($is_space->{$self->{nc}}) {
7119     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7120    
7121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122     $self->{line_prev} = $self->{line};
7123     $self->{column_prev} = $self->{column};
7124     $self->{column}++;
7125     $self->{nc}
7126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127     } else {
7128     $self->{set_nc}->($self);
7129     }
7130    
7131     redo A;
7132     } elsif ($self->{nc} == 0x0023) { # #
7133     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7134     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7135    
7136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7137     $self->{line_prev} = $self->{line};
7138     $self->{column_prev} = $self->{column};
7139     $self->{column}++;
7140     $self->{nc}
7141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7142     } else {
7143     $self->{set_nc}->($self);
7144     }
7145    
7146     redo A;
7147     } elsif ($self->{nc} == 0x0022) { # "
7148     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7149     $self->{ca}->{value} = '';
7150     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7151    
7152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7153     $self->{line_prev} = $self->{line};
7154     $self->{column_prev} = $self->{column};
7155     $self->{column}++;
7156     $self->{nc}
7157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7158     } else {
7159     $self->{set_nc}->($self);
7160     }
7161    
7162     redo A;
7163     } elsif ($self->{nc} == 0x0027) { # '
7164     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7165     $self->{ca}->{value} = '';
7166     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7167    
7168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7169     $self->{line_prev} = $self->{line};
7170     $self->{column_prev} = $self->{column};
7171     $self->{column}++;
7172     $self->{nc}
7173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7174     } else {
7175     $self->{set_nc}->($self);
7176     }
7177    
7178     redo A;
7179     } elsif ($self->{nc} == 0x003E) { # >
7180     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7181     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7182    
7183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7184     $self->{line_prev} = $self->{line};
7185     $self->{column_prev} = $self->{column};
7186     $self->{column}++;
7187     $self->{nc}
7188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7189     } else {
7190     $self->{set_nc}->($self);
7191     }
7192    
7193     return ($self->{ct}); # ATTLIST
7194     redo A;
7195     } elsif ($self->{nc} == -1) {
7196     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7197     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7198    
7199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7200     $self->{line_prev} = $self->{line};
7201     $self->{column_prev} = $self->{column};
7202     $self->{column}++;
7203     $self->{nc}
7204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7205     } else {
7206     $self->{set_nc}->($self);
7207     }
7208    
7209     return ($self->{ct});
7210     redo A;
7211     } else {
7212     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7213     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7214     ## Reconsume.
7215     redo A;
7216     }
7217     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7218     if ($is_space->{$self->{nc}}) {
7219     ## Stay in the state.
7220    
7221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7222     $self->{line_prev} = $self->{line};
7223     $self->{column_prev} = $self->{column};
7224     $self->{column}++;
7225     $self->{nc}
7226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7227     } else {
7228     $self->{set_nc}->($self);
7229     }
7230    
7231     redo A;
7232     } elsif ($self->{nc} == 0x0023) { # #
7233     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7234    
7235     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7236     $self->{line_prev} = $self->{line};
7237     $self->{column_prev} = $self->{column};
7238     $self->{column}++;
7239     $self->{nc}
7240     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7241     } else {
7242     $self->{set_nc}->($self);
7243     }
7244    
7245     redo A;
7246     } elsif ($self->{nc} == 0x0022) { # "
7247     $self->{ca}->{value} = '';
7248     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7249    
7250     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7251     $self->{line_prev} = $self->{line};
7252     $self->{column_prev} = $self->{column};
7253     $self->{column}++;
7254     $self->{nc}
7255     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7256     } else {
7257     $self->{set_nc}->($self);
7258     }
7259    
7260     redo A;
7261     } elsif ($self->{nc} == 0x0027) { # '
7262     $self->{ca}->{value} = '';
7263     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7264    
7265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7266     $self->{line_prev} = $self->{line};
7267     $self->{column_prev} = $self->{column};
7268     $self->{column}++;
7269     $self->{nc}
7270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7271     } else {
7272     $self->{set_nc}->($self);
7273     }
7274    
7275     redo A;
7276     } elsif ($self->{nc} == 0x003E) { # >
7277     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7278     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7279    
7280     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7281     $self->{line_prev} = $self->{line};
7282     $self->{column_prev} = $self->{column};
7283     $self->{column}++;
7284     $self->{nc}
7285     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7286     } else {
7287     $self->{set_nc}->($self);
7288     }
7289    
7290     return ($self->{ct}); # ATTLIST
7291     redo A;
7292     } elsif ($self->{nc} == -1) {
7293     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7294     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7295    
7296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7297     $self->{line_prev} = $self->{line};
7298     $self->{column_prev} = $self->{column};
7299     $self->{column}++;
7300     $self->{nc}
7301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7302     } else {
7303     $self->{set_nc}->($self);
7304     }
7305    
7306     return ($self->{ct});
7307     redo A;
7308     } else {
7309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7310     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7311     ## Reconsume.
7312     redo A;
7313     }
7314     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7315     if ($is_space->{$self->{nc}}) {
7316     ## XML5: No parse error.
7317     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7318 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7319 wakaba 1.15 ## Reconsume.
7320     redo A;
7321     } elsif ($self->{nc} == 0x0022) { # "
7322     ## XML5: Same as "anything else".
7323     $self->{ca}->{value} = '';
7324     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7325    
7326     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7327     $self->{line_prev} = $self->{line};
7328     $self->{column_prev} = $self->{column};
7329     $self->{column}++;
7330     $self->{nc}
7331     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7332     } else {
7333     $self->{set_nc}->($self);
7334     }
7335    
7336     redo A;
7337     } elsif ($self->{nc} == 0x0027) { # '
7338     ## XML5: Same as "anything else".
7339     $self->{ca}->{value} = '';
7340     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7341    
7342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7343     $self->{line_prev} = $self->{line};
7344     $self->{column_prev} = $self->{column};
7345     $self->{column}++;
7346     $self->{nc}
7347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7348     } else {
7349     $self->{set_nc}->($self);
7350     }
7351    
7352     redo A;
7353     } elsif ($self->{nc} == 0x003E) { # >
7354     ## XML5: Same as "anything else".
7355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7356     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7357    
7358     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7359     $self->{line_prev} = $self->{line};
7360     $self->{column_prev} = $self->{column};
7361     $self->{column}++;
7362     $self->{nc}
7363     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7364     } else {
7365     $self->{set_nc}->($self);
7366     }
7367    
7368     return ($self->{ct}); # ATTLIST
7369     redo A;
7370     } elsif ($self->{nc} == -1) {
7371     ## XML5: No parse error.
7372     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7373     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7374    
7375     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7376     $self->{line_prev} = $self->{line};
7377     $self->{column_prev} = $self->{column};
7378     $self->{column}++;
7379     $self->{nc}
7380     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7381     } else {
7382     $self->{set_nc}->($self);
7383     }
7384    
7385     return ($self->{ct});
7386     redo A;
7387     } else {
7388     $self->{ca}->{default} = chr $self->{nc};
7389     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7390    
7391     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7392     $self->{line_prev} = $self->{line};
7393     $self->{column_prev} = $self->{column};
7394     $self->{column}++;
7395     $self->{nc}
7396     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7397     } else {
7398     $self->{set_nc}->($self);
7399     }
7400    
7401     redo A;
7402     }
7403     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7404     if ($is_space->{$self->{nc}}) {
7405     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7406    
7407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7408     $self->{line_prev} = $self->{line};
7409     $self->{column_prev} = $self->{column};
7410     $self->{column}++;
7411     $self->{nc}
7412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7413     } else {
7414     $self->{set_nc}->($self);
7415     }
7416    
7417     redo A;
7418     } elsif ($self->{nc} == 0x0022) { # "
7419     ## XML5: Same as "anything else".
7420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7421     $self->{ca}->{value} = '';
7422     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7423    
7424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7425     $self->{line_prev} = $self->{line};
7426     $self->{column_prev} = $self->{column};
7427     $self->{column}++;
7428     $self->{nc}
7429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7430     } else {
7431     $self->{set_nc}->($self);
7432     }
7433    
7434     redo A;
7435     } elsif ($self->{nc} == 0x0027) { # '
7436     ## XML5: Same as "anything else".
7437     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7438     $self->{ca}->{value} = '';
7439     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7440    
7441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7442     $self->{line_prev} = $self->{line};
7443     $self->{column_prev} = $self->{column};
7444     $self->{column}++;
7445     $self->{nc}
7446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7447     } else {
7448     $self->{set_nc}->($self);
7449     }
7450    
7451     redo A;
7452     } elsif ($self->{nc} == 0x003E) { # >
7453     ## XML5: Same as "anything else".
7454     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7455     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7456    
7457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7458     $self->{line_prev} = $self->{line};
7459     $self->{column_prev} = $self->{column};
7460     $self->{column}++;
7461     $self->{nc}
7462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7463     } else {
7464     $self->{set_nc}->($self);
7465     }
7466    
7467     return ($self->{ct}); # ATTLIST
7468     redo A;
7469     } elsif ($self->{nc} == -1) {
7470     ## XML5: No parse error.
7471     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7472     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7473     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7474    
7475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7476     $self->{line_prev} = $self->{line};
7477     $self->{column_prev} = $self->{column};
7478     $self->{column}++;
7479     $self->{nc}
7480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7481     } else {
7482     $self->{set_nc}->($self);
7483     }
7484    
7485     return ($self->{ct});
7486     redo A;
7487     } else {
7488     $self->{ca}->{default} .= chr $self->{nc};
7489     ## Stay in the state.
7490    
7491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7492     $self->{line_prev} = $self->{line};
7493     $self->{column_prev} = $self->{column};
7494     $self->{column}++;
7495     $self->{nc}
7496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7497     } else {
7498     $self->{set_nc}->($self);
7499     }
7500    
7501     redo A;
7502     }
7503     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7504     if ($is_space->{$self->{nc}}) {
7505     ## Stay in the state.
7506    
7507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7508     $self->{line_prev} = $self->{line};
7509     $self->{column_prev} = $self->{column};
7510     $self->{column}++;
7511     $self->{nc}
7512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7513     } else {
7514     $self->{set_nc}->($self);
7515     }
7516    
7517     redo A;
7518     } elsif ($self->{nc} == 0x0022) { # "
7519     $self->{ca}->{value} = '';
7520     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7521    
7522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7523     $self->{line_prev} = $self->{line};
7524     $self->{column_prev} = $self->{column};
7525     $self->{column}++;
7526     $self->{nc}
7527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7528     } else {
7529     $self->{set_nc}->($self);
7530     }
7531    
7532     redo A;
7533     } elsif ($self->{nc} == 0x0027) { # '
7534     $self->{ca}->{value} = '';
7535     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7536    
7537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7538     $self->{line_prev} = $self->{line};
7539     $self->{column_prev} = $self->{column};
7540     $self->{column}++;
7541     $self->{nc}
7542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7543     } else {
7544     $self->{set_nc}->($self);
7545     }
7546    
7547     redo A;
7548     } elsif ($self->{nc} == 0x003E) { # >
7549     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7550     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7551    
7552     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7553     $self->{line_prev} = $self->{line};
7554     $self->{column_prev} = $self->{column};
7555     $self->{column}++;
7556     $self->{nc}
7557     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7558     } else {
7559     $self->{set_nc}->($self);
7560     }
7561    
7562     return ($self->{ct}); # ATTLIST
7563     redo A;
7564     } elsif ($self->{nc} == -1) {
7565     ## XML5: No parse error.
7566     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7567     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7568     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7569    
7570     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7571     $self->{line_prev} = $self->{line};
7572     $self->{column_prev} = $self->{column};
7573     $self->{column}++;
7574     $self->{nc}
7575     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7576     } else {
7577     $self->{set_nc}->($self);
7578     }
7579    
7580     return ($self->{ct});
7581     redo A;
7582     } else {
7583     ## XML5: Not defined yet.
7584     if ($self->{ca}->{default} eq 'FIXED') {
7585     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7586     } else {
7587     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7588     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7589     }
7590     ## Reconsume.
7591     redo A;
7592     }
7593     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7594     if ($is_space->{$self->{nc}} or
7595     $self->{nc} == -1 or
7596     $self->{nc} == 0x003E) { # >
7597     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7598     ## Reconsume.
7599     redo A;
7600     } else {
7601     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7602     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7603     ## Reconsume.
7604     redo A;
7605 wakaba 1.16 }
7606 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7607     ## ASCII case-insensitive
7608     if ($self->{nc} == [
7609     undef,
7610     0x0044, # D
7611     0x0041, # A
7612     0x0054, # T
7613     ]->[length $self->{kwd}] or
7614     $self->{nc} == [
7615     undef,
7616     0x0064, # d
7617     0x0061, # a
7618     0x0074, # t
7619     ]->[length $self->{kwd}]) {
7620    
7621     ## Stay in the state.
7622     $self->{kwd} .= chr $self->{nc};
7623    
7624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7625     $self->{line_prev} = $self->{line};
7626     $self->{column_prev} = $self->{column};
7627     $self->{column}++;
7628     $self->{nc}
7629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7630     } else {
7631     $self->{set_nc}->($self);
7632     }
7633    
7634     redo A;
7635     } elsif ((length $self->{kwd}) == 4 and
7636     ($self->{nc} == 0x0041 or # A
7637     $self->{nc} == 0x0061)) { # a
7638     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7639    
7640     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7641     text => 'NDATA',
7642     line => $self->{line_prev},
7643     column => $self->{column_prev} - 4);
7644     } else {
7645    
7646     }
7647     $self->{state} = AFTER_NDATA_STATE;
7648    
7649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7650     $self->{line_prev} = $self->{line};
7651     $self->{column_prev} = $self->{column};
7652     $self->{column}++;
7653     $self->{nc}
7654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7655     } else {
7656     $self->{set_nc}->($self);
7657     }
7658    
7659     redo A;
7660     } else {
7661     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7662     line => $self->{line_prev},
7663     column => $self->{column_prev} + 1
7664     - length $self->{kwd});
7665    
7666     $self->{state} = BOGUS_MD_STATE;
7667     ## Reconsume.
7668     redo A;
7669     }
7670     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7671     if ($is_space->{$self->{nc}}) {
7672     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7673    
7674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7675     $self->{line_prev} = $self->{line};
7676     $self->{column_prev} = $self->{column};
7677     $self->{column}++;
7678     $self->{nc}
7679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7680     } else {
7681     $self->{set_nc}->($self);
7682     }
7683    
7684     redo A;
7685     } elsif ($self->{nc} == 0x003E) { # >
7686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7687     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7688    
7689     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7690     $self->{line_prev} = $self->{line};
7691     $self->{column_prev} = $self->{column};
7692     $self->{column}++;
7693     $self->{nc}
7694     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7695     } else {
7696     $self->{set_nc}->($self);
7697     }
7698    
7699     return ($self->{ct}); # ENTITY
7700     redo A;
7701     } elsif ($self->{nc} == -1) {
7702     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7703     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7704    
7705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7706     $self->{line_prev} = $self->{line};
7707     $self->{column_prev} = $self->{column};
7708     $self->{column}++;
7709     $self->{nc}
7710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7711     } else {
7712     $self->{set_nc}->($self);
7713     }
7714    
7715     return ($self->{ct}); # ENTITY
7716     redo A;
7717     } else {
7718     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7719     line => $self->{line_prev},
7720     column => $self->{column_prev} + 1
7721     - length $self->{kwd});
7722     $self->{state} = BOGUS_MD_STATE;
7723     ## Reconsume.
7724     redo A;
7725     }
7726     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7727     if ($is_space->{$self->{nc}}) {
7728     ## Stay in the state.
7729    
7730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7731     $self->{line_prev} = $self->{line};
7732     $self->{column_prev} = $self->{column};
7733     $self->{column}++;
7734     $self->{nc}
7735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7736     } else {
7737     $self->{set_nc}->($self);
7738     }
7739    
7740     redo A;
7741     } elsif ($self->{nc} == 0x003E) { # >
7742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7743     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7744    
7745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7746     $self->{line_prev} = $self->{line};
7747     $self->{column_prev} = $self->{column};
7748     $self->{column}++;
7749     $self->{nc}
7750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7751     } else {
7752     $self->{set_nc}->($self);
7753     }
7754    
7755     return ($self->{ct}); # ENTITY
7756     redo A;
7757     } elsif ($self->{nc} == -1) {
7758     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7759     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7760    
7761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7762     $self->{line_prev} = $self->{line};
7763     $self->{column_prev} = $self->{column};
7764     $self->{column}++;
7765     $self->{nc}
7766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7767     } else {
7768     $self->{set_nc}->($self);
7769     }
7770    
7771     return ($self->{ct}); # ENTITY
7772     redo A;
7773     } else {
7774     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7775     $self->{state} = NOTATION_NAME_STATE;
7776    
7777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7778     $self->{line_prev} = $self->{line};
7779     $self->{column_prev} = $self->{column};
7780     $self->{column}++;
7781     $self->{nc}
7782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7783     } else {
7784     $self->{set_nc}->($self);
7785     }
7786    
7787     redo A;
7788     }
7789     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7790     if ($is_space->{$self->{nc}}) {
7791 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7792 wakaba 1.18
7793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7794     $self->{line_prev} = $self->{line};
7795     $self->{column_prev} = $self->{column};
7796     $self->{column}++;
7797     $self->{nc}
7798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7799     } else {
7800     $self->{set_nc}->($self);
7801     }
7802    
7803     redo A;
7804     } elsif ($self->{nc} == 0x003E) { # >
7805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7806    
7807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7808     $self->{line_prev} = $self->{line};
7809     $self->{column_prev} = $self->{column};
7810     $self->{column}++;
7811     $self->{nc}
7812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7813     } else {
7814     $self->{set_nc}->($self);
7815     }
7816    
7817     return ($self->{ct}); # ENTITY
7818     redo A;
7819     } elsif ($self->{nc} == -1) {
7820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7822    
7823     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7824     $self->{line_prev} = $self->{line};
7825     $self->{column_prev} = $self->{column};
7826     $self->{column}++;
7827     $self->{nc}
7828     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7829     } else {
7830     $self->{set_nc}->($self);
7831     }
7832    
7833     return ($self->{ct}); # ENTITY
7834     redo A;
7835     } else {
7836     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7837     ## Stay in the state.
7838    
7839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7840     $self->{line_prev} = $self->{line};
7841     $self->{column_prev} = $self->{column};
7842     $self->{column}++;
7843     $self->{nc}
7844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7845     } else {
7846     $self->{set_nc}->($self);
7847     }
7848    
7849     redo A;
7850     }
7851 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7852     if ($self->{nc} == 0x0022) { # "
7853 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7854 wakaba 1.19
7855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7856     $self->{line_prev} = $self->{line};
7857     $self->{column_prev} = $self->{column};
7858     $self->{column}++;
7859     $self->{nc}
7860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7861     } else {
7862     $self->{set_nc}->($self);
7863     }
7864    
7865     redo A;
7866     } elsif ($self->{nc} == 0x0026) { # &
7867     $self->{prev_state} = $self->{state};
7868     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7869     $self->{entity_add} = 0x0022; # "
7870    
7871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7872     $self->{line_prev} = $self->{line};
7873     $self->{column_prev} = $self->{column};
7874     $self->{column}++;
7875     $self->{nc}
7876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7877     } else {
7878     $self->{set_nc}->($self);
7879     }
7880    
7881     redo A;
7882     ## TODO: %
7883     } elsif ($self->{nc} == -1) {
7884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7885     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7886     ## Reconsume.
7887     return ($self->{ct}); # ENTITY
7888     redo A;
7889     } else {
7890     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7891    
7892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7893     $self->{line_prev} = $self->{line};
7894     $self->{column_prev} = $self->{column};
7895     $self->{column}++;
7896     $self->{nc}
7897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7898     } else {
7899     $self->{set_nc}->($self);
7900     }
7901    
7902     redo A;
7903     }
7904     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7905     if ($self->{nc} == 0x0027) { # '
7906 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7907 wakaba 1.19
7908     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7909     $self->{line_prev} = $self->{line};
7910     $self->{column_prev} = $self->{column};
7911     $self->{column}++;
7912     $self->{nc}
7913     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7914     } else {
7915     $self->{set_nc}->($self);
7916     }
7917    
7918     redo A;
7919     } elsif ($self->{nc} == 0x0026) { # &
7920     $self->{prev_state} = $self->{state};
7921     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7922     $self->{entity_add} = 0x0027; # '
7923    
7924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7925     $self->{line_prev} = $self->{line};
7926     $self->{column_prev} = $self->{column};
7927     $self->{column}++;
7928     $self->{nc}
7929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7930     } else {
7931     $self->{set_nc}->($self);
7932     }
7933    
7934     redo A;
7935     ## TODO: %
7936     } elsif ($self->{nc} == -1) {
7937     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7938     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7939     ## Reconsume.
7940     return ($self->{ct}); # ENTITY
7941     redo A;
7942     } else {
7943     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7944    
7945     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7946     $self->{line_prev} = $self->{line};
7947     $self->{column_prev} = $self->{column};
7948     $self->{column}++;
7949     $self->{nc}
7950     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7951     } else {
7952     $self->{set_nc}->($self);
7953     }
7954    
7955     redo A;
7956     }
7957     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7958     if ($is_space->{$self->{nc}} or
7959     {
7960     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7961     $self->{entity_add} => 1,
7962     }->{$self->{nc}}) {
7963 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7964     line => $self->{line_prev},
7965     column => $self->{column_prev}
7966     + ($self->{nc} == -1 ? 1 : 0));
7967 wakaba 1.19 ## Don't consume
7968     ## Return nothing.
7969     #
7970     } elsif ($self->{nc} == 0x0023) { # #
7971     $self->{ca} = $self->{ct};
7972     $self->{state} = ENTITY_HASH_STATE;
7973     $self->{kwd} = '#';
7974    
7975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7976     $self->{line_prev} = $self->{line};
7977     $self->{column_prev} = $self->{column};
7978     $self->{column}++;
7979     $self->{nc}
7980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7981     } else {
7982     $self->{set_nc}->($self);
7983     }
7984    
7985     redo A;
7986     } else {
7987     #
7988     }
7989    
7990     $self->{ct}->{value} .= '&';
7991     $self->{state} = $self->{prev_state};
7992     ## Reconsume.
7993     redo A;
7994 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7995     if ($is_space->{$self->{nc}}) {
7996     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7997    
7998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7999     $self->{line_prev} = $self->{line};
8000     $self->{column_prev} = $self->{column};
8001     $self->{column}++;
8002     $self->{nc}
8003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8004     } else {
8005     $self->{set_nc}->($self);
8006     }
8007    
8008     redo A;
8009     } elsif ($self->{nc} == 0x0028) { # (
8010     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8011     $self->{ct}->{content} = ['('];
8012     $self->{group_depth} = 1;
8013    
8014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8015     $self->{line_prev} = $self->{line};
8016     $self->{column_prev} = $self->{column};
8017     $self->{column}++;
8018     $self->{nc}
8019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8020     } else {
8021     $self->{set_nc}->($self);
8022     }
8023    
8024     redo A;
8025     } elsif ($self->{nc} == 0x003E) { # >
8026     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8027     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8028    
8029     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8030     $self->{line_prev} = $self->{line};
8031     $self->{column_prev} = $self->{column};
8032     $self->{column}++;
8033     $self->{nc}
8034     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8035     } else {
8036     $self->{set_nc}->($self);
8037     }
8038    
8039     return ($self->{ct}); # ELEMENT
8040     redo A;
8041     } elsif ($self->{nc} == -1) {
8042     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8043     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8044    
8045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8046     $self->{line_prev} = $self->{line};
8047     $self->{column_prev} = $self->{column};
8048     $self->{column}++;
8049     $self->{nc}
8050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8051     } else {
8052     $self->{set_nc}->($self);
8053     }
8054    
8055     return ($self->{ct}); # ELEMENT
8056     redo A;
8057     } else {
8058     $self->{ct}->{content} = [chr $self->{nc}];
8059     $self->{state} = CONTENT_KEYWORD_STATE;
8060    
8061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8062     $self->{line_prev} = $self->{line};
8063     $self->{column_prev} = $self->{column};
8064     $self->{column}++;
8065     $self->{nc}
8066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8067     } else {
8068     $self->{set_nc}->($self);
8069     }
8070    
8071     redo A;
8072     }
8073     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8074     if ($is_space->{$self->{nc}}) {
8075     $self->{state} = AFTER_MD_DEF_STATE;
8076    
8077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8078     $self->{line_prev} = $self->{line};
8079     $self->{column_prev} = $self->{column};
8080     $self->{column}++;
8081     $self->{nc}
8082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8083     } else {
8084     $self->{set_nc}->($self);
8085     }
8086    
8087     redo A;
8088     } elsif ($self->{nc} == 0x003E) { # >
8089     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8090    
8091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8092     $self->{line_prev} = $self->{line};
8093     $self->{column_prev} = $self->{column};
8094     $self->{column}++;
8095     $self->{nc}
8096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8097     } else {
8098     $self->{set_nc}->($self);
8099     }
8100    
8101     return ($self->{ct}); # ELEMENT
8102     redo A;
8103     } elsif ($self->{nc} == -1) {
8104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8106    
8107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8108     $self->{line_prev} = $self->{line};
8109     $self->{column_prev} = $self->{column};
8110     $self->{column}++;
8111     $self->{nc}
8112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8113     } else {
8114     $self->{set_nc}->($self);
8115     }
8116    
8117     return ($self->{ct}); # ELEMENT
8118     redo A;
8119     } else {
8120     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8121     ## Stay in the state.
8122    
8123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8124     $self->{line_prev} = $self->{line};
8125     $self->{column_prev} = $self->{column};
8126     $self->{column}++;
8127     $self->{nc}
8128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8129     } else {
8130     $self->{set_nc}->($self);
8131     }
8132    
8133     redo A;
8134     }
8135     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8136     if ($is_space->{$self->{nc}}) {
8137     ## Stay in the state.
8138    
8139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8140     $self->{line_prev} = $self->{line};
8141     $self->{column_prev} = $self->{column};
8142     $self->{column}++;
8143     $self->{nc}
8144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8145     } else {
8146     $self->{set_nc}->($self);
8147     }
8148    
8149     redo A;
8150     } elsif ($self->{nc} == 0x0028) { # (
8151     $self->{group_depth}++;
8152     push @{$self->{ct}->{content}}, chr $self->{nc};
8153     ## Stay in the state.
8154    
8155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8156     $self->{line_prev} = $self->{line};
8157     $self->{column_prev} = $self->{column};
8158     $self->{column}++;
8159     $self->{nc}
8160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8161     } else {
8162     $self->{set_nc}->($self);
8163     }
8164    
8165     redo A;
8166     } elsif ($self->{nc} == 0x007C or # |
8167     $self->{nc} == 0x002C) { # ,
8168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8169     ## Stay in the state.
8170    
8171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8172     $self->{line_prev} = $self->{line};
8173     $self->{column_prev} = $self->{column};
8174     $self->{column}++;
8175     $self->{nc}
8176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8177     } else {
8178     $self->{set_nc}->($self);
8179     }
8180    
8181     redo A;
8182     } elsif ($self->{nc} == 0x0029) { # )
8183     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8184     push @{$self->{ct}->{content}}, chr $self->{nc};
8185     $self->{group_depth}--;
8186     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8187    
8188     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8189     $self->{line_prev} = $self->{line};
8190     $self->{column_prev} = $self->{column};
8191     $self->{column}++;
8192     $self->{nc}
8193     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8194     } else {
8195     $self->{set_nc}->($self);
8196     }
8197    
8198     redo A;
8199     } elsif ($self->{nc} == 0x003E) { # >
8200     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8201     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8202     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8203    
8204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8205     $self->{line_prev} = $self->{line};
8206     $self->{column_prev} = $self->{column};
8207     $self->{column}++;
8208     $self->{nc}
8209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8210     } else {
8211     $self->{set_nc}->($self);
8212     }
8213    
8214     return ($self->{ct}); # ELEMENT
8215     redo A;
8216     } elsif ($self->{nc} == -1) {
8217     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8218     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8219     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8220    
8221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8222     $self->{line_prev} = $self->{line};
8223     $self->{column_prev} = $self->{column};
8224     $self->{column}++;
8225     $self->{nc}
8226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8227     } else {
8228     $self->{set_nc}->($self);
8229     }
8230    
8231     return ($self->{ct}); # ELEMENT
8232     redo A;
8233     } else {
8234     push @{$self->{ct}->{content}}, chr $self->{nc};
8235     $self->{state} = CM_ELEMENT_NAME_STATE;
8236    
8237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8238     $self->{line_prev} = $self->{line};
8239     $self->{column_prev} = $self->{column};
8240     $self->{column}++;
8241     $self->{nc}
8242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8243     } else {
8244     $self->{set_nc}->($self);
8245     }
8246    
8247     redo A;
8248     }
8249     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8250     if ($is_space->{$self->{nc}}) {
8251     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8252    
8253     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8254     $self->{line_prev} = $self->{line};
8255     $self->{column_prev} = $self->{column};
8256     $self->{column}++;
8257     $self->{nc}
8258     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8259     } else {
8260     $self->{set_nc}->($self);
8261     }
8262    
8263     redo A;
8264     } elsif ($self->{nc} == 0x002A or # *
8265     $self->{nc} == 0x002B or # +
8266     $self->{nc} == 0x003F) { # ?
8267     push @{$self->{ct}->{content}}, chr $self->{nc};
8268     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8269    
8270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8271     $self->{line_prev} = $self->{line};
8272     $self->{column_prev} = $self->{column};
8273     $self->{column}++;
8274     $self->{nc}
8275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8276     } else {
8277     $self->{set_nc}->($self);
8278     }
8279    
8280     redo A;
8281     } elsif ($self->{nc} == 0x007C or # |
8282     $self->{nc} == 0x002C) { # ,
8283     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8284     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8285    
8286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8287     $self->{line_prev} = $self->{line};
8288     $self->{column_prev} = $self->{column};
8289     $self->{column}++;
8290     $self->{nc}
8291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8292     } else {
8293     $self->{set_nc}->($self);
8294     }
8295    
8296     redo A;
8297     } elsif ($self->{nc} == 0x0029) { # )
8298     $self->{group_depth}--;
8299     push @{$self->{ct}->{content}}, chr $self->{nc};
8300     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8301    
8302     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8303     $self->{line_prev} = $self->{line};
8304     $self->{column_prev} = $self->{column};
8305     $self->{column}++;
8306     $self->{nc}
8307     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8308     } else {
8309     $self->{set_nc}->($self);
8310     }
8311    
8312     redo A;
8313     } elsif ($self->{nc} == 0x003E) { # >
8314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8315     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8316     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8317    
8318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8319     $self->{line_prev} = $self->{line};
8320     $self->{column_prev} = $self->{column};
8321     $self->{column}++;
8322     $self->{nc}
8323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8324     } else {
8325     $self->{set_nc}->($self);
8326     }
8327    
8328     return ($self->{ct}); # ELEMENT
8329     redo A;
8330     } elsif ($self->{nc} == -1) {
8331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8332     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8333     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8334    
8335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8336     $self->{line_prev} = $self->{line};
8337     $self->{column_prev} = $self->{column};
8338     $self->{column}++;
8339     $self->{nc}
8340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8341     } else {
8342     $self->{set_nc}->($self);
8343     }
8344    
8345     return ($self->{ct}); # ELEMENT
8346     redo A;
8347     } else {
8348     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8349     ## Stay in the state.
8350    
8351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8352     $self->{line_prev} = $self->{line};
8353     $self->{column_prev} = $self->{column};
8354     $self->{column}++;
8355     $self->{nc}
8356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8357     } else {
8358     $self->{set_nc}->($self);
8359     }
8360    
8361     redo A;
8362     }
8363     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8364     if ($is_space->{$self->{nc}}) {
8365     ## Stay in the state.
8366    
8367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8368     $self->{line_prev} = $self->{line};
8369     $self->{column_prev} = $self->{column};
8370     $self->{column}++;
8371     $self->{nc}
8372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8373     } else {
8374     $self->{set_nc}->($self);
8375     }
8376    
8377     redo A;
8378     } elsif ($self->{nc} == 0x007C or # |
8379     $self->{nc} == 0x002C) { # ,
8380     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8381     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8382    
8383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8384     $self->{line_prev} = $self->{line};
8385     $self->{column_prev} = $self->{column};
8386     $self->{column}++;
8387     $self->{nc}
8388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8389     } else {
8390     $self->{set_nc}->($self);
8391     }
8392    
8393     redo A;
8394     } elsif ($self->{nc} == 0x0029) { # )
8395     $self->{group_depth}--;
8396     push @{$self->{ct}->{content}}, chr $self->{nc};
8397     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8398    
8399     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8400     $self->{line_prev} = $self->{line};
8401     $self->{column_prev} = $self->{column};
8402     $self->{column}++;
8403     $self->{nc}
8404     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8405     } else {
8406     $self->{set_nc}->($self);
8407     }
8408    
8409     redo A;
8410     } elsif ($self->{nc} == 0x003E) { # >
8411     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8412     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8413     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8414    
8415     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8416     $self->{line_prev} = $self->{line};
8417     $self->{column_prev} = $self->{column};
8418     $self->{column}++;
8419     $self->{nc}
8420     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8421     } else {
8422     $self->{set_nc}->($self);
8423     }
8424    
8425     return ($self->{ct}); # ELEMENT
8426     redo A;
8427     } elsif ($self->{nc} == -1) {
8428     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8429     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8430     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8431    
8432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8433     $self->{line_prev} = $self->{line};
8434     $self->{column_prev} = $self->{column};
8435     $self->{column}++;
8436     $self->{nc}
8437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8438     } else {
8439     $self->{set_nc}->($self);
8440     }
8441    
8442     return ($self->{ct}); # ELEMENT
8443     redo A;
8444     } else {
8445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8446     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8447     $self->{state} = BOGUS_MD_STATE;
8448    
8449     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8450     $self->{line_prev} = $self->{line};
8451     $self->{column_prev} = $self->{column};
8452     $self->{column}++;
8453     $self->{nc}
8454     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8455     } else {
8456     $self->{set_nc}->($self);
8457     }
8458    
8459     redo A;
8460     }
8461     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8462     if ($is_space->{$self->{nc}}) {
8463     if ($self->{group_depth}) {
8464     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8465     } else {
8466     $self->{state} = AFTER_MD_DEF_STATE;
8467     }
8468    
8469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8470     $self->{line_prev} = $self->{line};
8471     $self->{column_prev} = $self->{column};
8472     $self->{column}++;
8473     $self->{nc}
8474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8475     } else {
8476     $self->{set_nc}->($self);
8477     }
8478    
8479     redo A;
8480     } elsif ($self->{nc} == 0x002A or # *
8481     $self->{nc} == 0x002B or # +
8482     $self->{nc} == 0x003F) { # ?
8483     push @{$self->{ct}->{content}}, chr $self->{nc};
8484     if ($self->{group_depth}) {
8485     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8486     } else {
8487     $self->{state} = AFTER_MD_DEF_STATE;
8488     }
8489    
8490     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8491     $self->{line_prev} = $self->{line};
8492     $self->{column_prev} = $self->{column};
8493     $self->{column}++;
8494     $self->{nc}
8495     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8496     } else {
8497     $self->{set_nc}->($self);
8498     }
8499    
8500     redo A;
8501     } elsif ($self->{nc} == 0x0029) { # )
8502     if ($self->{group_depth}) {
8503     $self->{group_depth}--;
8504     push @{$self->{ct}->{content}}, chr $self->{nc};
8505     ## Stay in the state.
8506    
8507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8508     $self->{line_prev} = $self->{line};
8509     $self->{column_prev} = $self->{column};
8510     $self->{column}++;
8511     $self->{nc}
8512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8513     } else {
8514     $self->{set_nc}->($self);
8515     }
8516    
8517     redo A;
8518     } else {
8519     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8520     $self->{state} = BOGUS_MD_STATE;
8521     ## Reconsume.
8522     redo A;
8523     }
8524     } elsif ($self->{nc} == 0x003E) { # >
8525     if ($self->{group_depth}) {
8526     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8527     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8528     }
8529     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8530    
8531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8532     $self->{line_prev} = $self->{line};
8533     $self->{column_prev} = $self->{column};
8534     $self->{column}++;
8535     $self->{nc}
8536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8537     } else {
8538     $self->{set_nc}->($self);
8539     }
8540    
8541     return ($self->{ct}); # ELEMENT
8542     redo A;
8543     } elsif ($self->{nc} == -1) {
8544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8545     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8546     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8547    
8548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8549     $self->{line_prev} = $self->{line};
8550     $self->{column_prev} = $self->{column};
8551     $self->{column}++;
8552     $self->{nc}
8553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8554     } else {
8555     $self->{set_nc}->($self);
8556     }
8557    
8558     return ($self->{ct}); # ELEMENT
8559     redo A;
8560     } else {
8561     if ($self->{group_depth}) {
8562     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8563     } else {
8564     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8565     $self->{state} = BOGUS_MD_STATE;
8566     }
8567     ## Reconsume.
8568     redo A;
8569     }
8570     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8571 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8572     ## Stay in the state.
8573    
8574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8575     $self->{line_prev} = $self->{line};
8576     $self->{column_prev} = $self->{column};
8577     $self->{column}++;
8578     $self->{nc}
8579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8580     } else {
8581     $self->{set_nc}->($self);
8582     }
8583    
8584     redo A;
8585     } elsif ($self->{nc} == 0x003E) { # >
8586     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8587    
8588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8589     $self->{line_prev} = $self->{line};
8590     $self->{column_prev} = $self->{column};
8591     $self->{column}++;
8592     $self->{nc}
8593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8594     } else {
8595     $self->{set_nc}->($self);
8596     }
8597    
8598 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8599 wakaba 1.18 redo A;
8600     } elsif ($self->{nc} == -1) {
8601     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8602     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8603    
8604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8605     $self->{line_prev} = $self->{line};
8606     $self->{column_prev} = $self->{column};
8607     $self->{column}++;
8608     $self->{nc}
8609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8610     } else {
8611     $self->{set_nc}->($self);
8612     }
8613    
8614 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8615 wakaba 1.18 redo A;
8616     } else {
8617 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8618 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8619     ## Reconsume.
8620     redo A;
8621     }
8622 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8623     if ($self->{nc} == 0x003E) { # >
8624     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8625    
8626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8627     $self->{line_prev} = $self->{line};
8628     $self->{column_prev} = $self->{column};
8629     $self->{column}++;
8630     $self->{nc}
8631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8632     } else {
8633     $self->{set_nc}->($self);
8634     }
8635    
8636     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8637     redo A;
8638     } elsif ($self->{nc} == -1) {
8639     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8640     ## Reconsume.
8641     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8642     redo A;
8643     } else {
8644     ## Stay in the state.
8645    
8646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8647     $self->{line_prev} = $self->{line};
8648     $self->{column_prev} = $self->{column};
8649     $self->{column}++;
8650     $self->{nc}
8651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8652     } else {
8653     $self->{set_nc}->($self);
8654     }
8655    
8656     redo A;
8657     }
8658 wakaba 1.1 } else {
8659     die "$0: $self->{state}: Unknown state";
8660     }
8661     } # A
8662    
8663     die "$0: _get_next_token: unexpected case";
8664     } # _get_next_token
8665    
8666     1;
8667 wakaba 1.28 ## $Date: 2009/07/02 22:24:28 $
8668 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24