/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.25 - (hide annotations) (download)
Sun Oct 19 15:17:01 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.24: +45 -7 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 15:16:55 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat, attrs-1.dat: Normalization tests added.  Test
	results updated.

	* charrefs-1.dat: Character reference parse error/mapping tests
	added.

	* attlists-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 15:13:57 -0000
	* Tokenizer.pm.src: Normalize white space characters in attribute
	value literals in XML documents.  Don't apply character reference
	mapping table for non-NULL non-surrogate code points.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.24 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export interface.  Nothing is exported by default; each token type
## constant can be requested by name, and the |:token| tag requests
## all of them at once.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The token type constants, written once and shared by both the
  ## |@EXPORT_OK| list and the |:token| tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;

  our %EXPORT_TAGS = (
    token => [@token_type_names],
  );
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric identifiers stored in the |type| field of a token.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## Individual capability bits of a content model.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## The content models, each expressed as a combination of the bits
## above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP (),
  RCDATA_CONTENT_MODEL    => CM_ENTITY () | CM_LIMITED_MARKUP (),
  PCDATA_CONTENT_MODEL    => CM_ENTITY () | CM_FULL_MARKUP (),
};
87    
88     ## Tokenizer states
89    
## Numeric identifiers for the values of |$self->{state}|.  Numbers 1
## and 12 are left unassigned; the states that used to own them are
## kept below as comments.
use constant {
  DATA_STATE                                    => 0,
  #ENTITY_DATA_STATE                            => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE              => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                 => 44,
  ENTITY_HASH_STATE            => 45,
  NCR_NUM_STATE                => 46,
  HEXREF_X_STATE               => 47,
  HEXREF_HEX_STATE             => 48,
  ENTITY_NAME_STATE            => 49,
  PCDATA_STATE                 => 50, # "data state" in the spec

  ## XML-only states
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## Both constants are the single bit 2**11 (2048).
sub IN_FOREIGN_CONTENT_IM () { 0b1000_0000_0000 }
sub FOREIGN_EL            () { 0b1000_0000_0000 }
203    
204     ## Character reference mappings
205    
## Maps a code point given by a numeric character reference to the
## code point that replaces it.  Code points without an entry have no
## replacement recorded here.
my $charref_map = do {
  ## Replacements for 0x80..0x9F, listed in code point order.
  my @c1_replacements = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );

  ## The last two code points (0x..FFFE and 0x..FFFF) of each of the
  ## seventeen planes 0x00..0x10.
  my @plane_tail = map {
    my $plane_base = $_ << 16;
    ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
  } 0x00 .. 0x10;

  ## Every code point that is replaced by U+FFFD.
  my @to_replacement_char = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF, 0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
    @plane_tail,
  );

  my %map = (0x0D => 0x000A);
  @map{0x80 .. 0x9F} = @c1_replacements;
  @map{@to_replacement_char} = (0xFFFD) x @to_replacement_char;

  \%map;
}; # $charref_map
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Resets the tokenizer-related fields of |$self| to their start-of-input
## values and then invokes the |set_nc| callback once.
##
## Argument: |$self|, the hash-based parser object.  Its |set_nc| field
## must already hold a code reference; it is called as
## |$self->{set_nc}->($self)| (presumably to load the first input
## character into |$self->{nc}| - confirm against the constructor).
##
## Returns the value of the final assignment, i.e. the new empty array
## reference stored in |$self->{token}|.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there can be no
  ## buffered character to take: |char_buffer_pos| (0) is never less
  ## than the buffer length (0).  Go straight to the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
312     ## while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## Lookup table of the code points treated as space characters; each
## listed code point maps to 1.  LINE TABULATION (VT, 0x000B) and
## CARRIAGE RETURN (CR, 0x000D) are deliberately not listed.
my $is_space = {
  map { ($_ => 1) }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
};
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not explicitly
926     ## state that the next input character is to be reconsumed,
927     ## it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
## $before_leave: commit the attribute currently being built
## ($self->{ca}) to the current tag token ($self->{ct}).  The branches
## of this state that leave it (white space, "=", ">", "/", EOF) call
## this closure before switching state or returning the token.
##
## - If the token already has an attribute with the same name, a
##   'duplicate attribute' parse error is reported at the line/column
##   recorded in $self->{ca}, and the attribute is not stored.
## - Otherwise the attribute is stored under its name and receives the
##   token's next |last_index| value as its |index|.
##
## NOTE: The |exists| test dereferences $self->{ct}->{attributes}, so
## Perl autovivifies that hash when it is not there yet; after a call
## $self->{ct}->{attributes} is therefore always a true value.
1281 wakaba 1.1 my $before_leave = sub {
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
1287     } else {
1288    
## First occurrence of this name: keep the attribute on the token.
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
## Pre-increment, so each stored attribute gets a new, larger number.
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743     if ($self->{nc} == 0x003D) { # =
1744    
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819 wakaba 1.25 } elsif ($self->{is_xml} and
1820     $is_space->{$self->{nc}}) {
1821    
1822     $self->{ca}->{value} .= ' ';
1823     ## Stay in the state.
1824    
1825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826     $self->{line_prev} = $self->{line};
1827     $self->{column_prev} = $self->{column};
1828     $self->{column}++;
1829     $self->{nc}
1830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831     } else {
1832     $self->{set_nc}->($self);
1833     }
1834    
1835     redo A;
1836 wakaba 1.1 } elsif ($self->{nc} == -1) {
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839    
1840     $self->{last_stag_name} = $self->{ct}->{tag_name};
1841 wakaba 1.15
1842     $self->{state} = DATA_STATE;
1843     $self->{s_kwd} = '';
1844     ## reconsume
1845     return ($self->{ct}); # start tag
1846     redo A;
1847 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849     if ($self->{ct}->{attributes}) {
1850    
1851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1852     } else {
1853     ## NOTE: This state should never be reached.
1854    
1855     }
1856 wakaba 1.15
1857     $self->{state} = DATA_STATE;
1858     $self->{s_kwd} = '';
1859     ## reconsume
1860     return ($self->{ct}); # end tag
1861     redo A;
1862     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863     ## XML5: No parse error above; not defined yet.
1864     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866     ## Reconsume.
1867     return ($self->{ct}); # ATTLIST
1868     redo A;
1869 wakaba 1.1 } else {
1870     die "$0: $self->{ct}->{type}: Unknown token type";
1871     }
1872     } else {
1873 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1874 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875    
1876     ## XML5: Not a parse error.
1877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878     } else {
1879    
1880     }
1881 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1882     $self->{read_until}->($self->{ca}->{value},
1883 wakaba 1.25 qq["&<\x09\x0C\x20],
1884 wakaba 1.1 length $self->{ca}->{value});
1885    
1886     ## Stay in the state
1887    
1888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1889     $self->{line_prev} = $self->{line};
1890     $self->{column_prev} = $self->{column};
1891     $self->{column}++;
1892     $self->{nc}
1893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1894     } else {
1895     $self->{set_nc}->($self);
1896     }
1897    
1898     redo A;
1899     }
1900     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902     ## ATTLIST attribute value single quoted state".
1903 wakaba 1.11
1904 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1905 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906    
1907     ## XML5: "DOCTYPE ATTLIST name after state".
1908     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910     } else {
1911    
1912     ## XML5: "Before attribute name state" (sic).
1913     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914     }
1915 wakaba 1.1
1916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917     $self->{line_prev} = $self->{line};
1918     $self->{column_prev} = $self->{column};
1919     $self->{column}++;
1920     $self->{nc}
1921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1922     } else {
1923     $self->{set_nc}->($self);
1924     }
1925    
1926     redo A;
1927     } elsif ($self->{nc} == 0x0026) { # &
1928    
1929 wakaba 1.11 ## XML5: Not defined yet.
1930    
1931 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1932     ## "entity in attribute value state". In this implementation, the
1933     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1934     ## implementation of the "consume a character reference" algorithm.
1935     $self->{entity_add} = 0x0027; # '
1936     $self->{prev_state} = $self->{state};
1937     $self->{state} = ENTITY_STATE;
1938    
1939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940     $self->{line_prev} = $self->{line};
1941     $self->{column_prev} = $self->{column};
1942     $self->{column}++;
1943     $self->{nc}
1944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945     } else {
1946     $self->{set_nc}->($self);
1947     }
1948    
1949     redo A;
1950 wakaba 1.25 } elsif ($self->{is_xml} and
1951     $is_space->{$self->{nc}}) {
1952    
1953     $self->{ca}->{value} .= ' ';
1954     ## Stay in the state.
1955    
1956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957     $self->{line_prev} = $self->{line};
1958     $self->{column_prev} = $self->{column};
1959     $self->{column}++;
1960     $self->{nc}
1961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962     } else {
1963     $self->{set_nc}->($self);
1964     }
1965    
1966     redo A;
1967 wakaba 1.1 } elsif ($self->{nc} == -1) {
1968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970    
1971     $self->{last_stag_name} = $self->{ct}->{tag_name};
1972 wakaba 1.15
1973     $self->{state} = DATA_STATE;
1974     $self->{s_kwd} = '';
1975     ## reconsume
1976     return ($self->{ct}); # start tag
1977     redo A;
1978 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980     if ($self->{ct}->{attributes}) {
1981    
1982     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1983     } else {
1984     ## NOTE: This state should never be reached.
1985    
1986     }
1987 wakaba 1.15
1988     $self->{state} = DATA_STATE;
1989     $self->{s_kwd} = '';
1990     ## reconsume
1991     return ($self->{ct}); # end tag
1992     redo A;
1993     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994     ## XML5: No parse error above; not defined yet.
1995     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997     ## Reconsume.
1998     return ($self->{ct}); # ATTLIST
1999     redo A;
2000 wakaba 1.1 } else {
2001     die "$0: $self->{ct}->{type}: Unknown token type";
2002     }
2003     } else {
2004 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2005 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006    
2007     ## XML5: Not a parse error.
2008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009     } else {
2010    
2011     }
2012 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2013     $self->{read_until}->($self->{ca}->{value},
2014 wakaba 1.25 qq['&<\x09\x0C\x20],
2015 wakaba 1.1 length $self->{ca}->{value});
2016    
2017     ## Stay in the state
2018    
2019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020     $self->{line_prev} = $self->{line};
2021     $self->{column_prev} = $self->{column};
2022     $self->{column}++;
2023     $self->{nc}
2024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025     } else {
2026     $self->{set_nc}->($self);
2027     }
2028    
2029     redo A;
2030     }
2031     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2033    
2034 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2035 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036    
2037     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039     } else {
2040    
2041     ## XML5: "Tag attribute name before state".
2042     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043     }
2044 wakaba 1.1
2045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046     $self->{line_prev} = $self->{line};
2047     $self->{column_prev} = $self->{column};
2048     $self->{column}++;
2049     $self->{nc}
2050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051     } else {
2052     $self->{set_nc}->($self);
2053     }
2054    
2055     redo A;
2056     } elsif ($self->{nc} == 0x0026) { # &
2057    
2058 wakaba 1.11
2059     ## XML5: Not defined yet.
2060    
2061 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2062     ## "entity in attribute value state". In this implementation, the
2063     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2064     ## implementation of the "consume a character reference" algorithm.
2065     $self->{entity_add} = -1;
2066     $self->{prev_state} = $self->{state};
2067     $self->{state} = ENTITY_STATE;
2068    
2069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070     $self->{line_prev} = $self->{line};
2071     $self->{column_prev} = $self->{column};
2072     $self->{column}++;
2073     $self->{nc}
2074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075     } else {
2076     $self->{set_nc}->($self);
2077     }
2078    
2079     redo A;
2080     } elsif ($self->{nc} == 0x003E) { # >
2081     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082    
2083     $self->{last_stag_name} = $self->{ct}->{tag_name};
2084 wakaba 1.15
2085     $self->{state} = DATA_STATE;
2086     $self->{s_kwd} = '';
2087    
2088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089     $self->{line_prev} = $self->{line};
2090     $self->{column_prev} = $self->{column};
2091     $self->{column}++;
2092     $self->{nc}
2093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094     } else {
2095     $self->{set_nc}->($self);
2096     }
2097    
2098     return ($self->{ct}); # start tag
2099     redo A;
2100 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102     if ($self->{ct}->{attributes}) {
2103    
2104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2105     } else {
2106     ## NOTE: This state should never be reached.
2107    
2108     }
2109 wakaba 1.15
2110     $self->{state} = DATA_STATE;
2111     $self->{s_kwd} = '';
2112    
2113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114     $self->{line_prev} = $self->{line};
2115     $self->{column_prev} = $self->{column};
2116     $self->{column}++;
2117     $self->{nc}
2118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2119     } else {
2120     $self->{set_nc}->($self);
2121     }
2122    
2123     return ($self->{ct}); # end tag
2124     redo A;
2125     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128    
2129 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130     $self->{line_prev} = $self->{line};
2131     $self->{column_prev} = $self->{column};
2132     $self->{column}++;
2133     $self->{nc}
2134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135     } else {
2136     $self->{set_nc}->($self);
2137     }
2138    
2139 wakaba 1.15 return ($self->{ct}); # ATTLIST
2140     redo A;
2141     } else {
2142     die "$0: $self->{ct}->{type}: Unknown token type";
2143     }
2144 wakaba 1.1 } elsif ($self->{nc} == -1) {
2145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146    
2147 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2149 wakaba 1.15
2150     $self->{state} = DATA_STATE;
2151     $self->{s_kwd} = '';
2152     ## reconsume
2153     return ($self->{ct}); # start tag
2154     redo A;
2155 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158     if ($self->{ct}->{attributes}) {
2159    
2160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2161     } else {
2162     ## NOTE: This state should never be reached.
2163    
2164     }
2165 wakaba 1.15
2166     $self->{state} = DATA_STATE;
2167     $self->{s_kwd} = '';
2168     ## reconsume
2169     return ($self->{ct}); # end tag
2170     redo A;
2171     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175     ## Reconsume.
2176     return ($self->{ct}); # ATTLIST
2177     redo A;
2178 wakaba 1.1 } else {
2179     die "$0: $self->{ct}->{type}: Unknown token type";
2180     }
2181     } else {
2182     if ({
2183     0x0022 => 1, # "
2184     0x0027 => 1, # '
2185     0x003D => 1, # =
2186     }->{$self->{nc}}) {
2187    
2188 wakaba 1.11 ## XML5: Not a parse error.
2189 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2190     } else {
2191    
2192     }
2193     $self->{ca}->{value} .= chr ($self->{nc});
2194     $self->{read_until}->($self->{ca}->{value},
2195 wakaba 1.25 qq["'=& \x09\x0C>],
2196 wakaba 1.1 length $self->{ca}->{value});
2197    
2198     ## Stay in the state
2199    
2200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2201     $self->{line_prev} = $self->{line};
2202     $self->{column_prev} = $self->{column};
2203     $self->{column}++;
2204     $self->{nc}
2205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2206     } else {
2207     $self->{set_nc}->($self);
2208     }
2209    
2210     redo A;
2211     }
2212     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2213     if ($is_space->{$self->{nc}}) {
2214    
2215     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2216    
2217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218     $self->{line_prev} = $self->{line};
2219     $self->{column_prev} = $self->{column};
2220     $self->{column}++;
2221     $self->{nc}
2222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223     } else {
2224     $self->{set_nc}->($self);
2225     }
2226    
2227     redo A;
2228     } elsif ($self->{nc} == 0x003E) { # >
2229     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2230    
2231     $self->{last_stag_name} = $self->{ct}->{tag_name};
2232     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2233     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2234     if ($self->{ct}->{attributes}) {
2235    
2236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2237     } else {
2238     ## NOTE: This state should never be reached.
2239    
2240     }
2241     } else {
2242     die "$0: $self->{ct}->{type}: Unknown token type";
2243     }
2244     $self->{state} = DATA_STATE;
2245 wakaba 1.5 $self->{s_kwd} = '';
2246 wakaba 1.1
2247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2248     $self->{line_prev} = $self->{line};
2249     $self->{column_prev} = $self->{column};
2250     $self->{column}++;
2251     $self->{nc}
2252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2253     } else {
2254     $self->{set_nc}->($self);
2255     }
2256    
2257    
2258     return ($self->{ct}); # start tag or end tag
2259    
2260     redo A;
2261     } elsif ($self->{nc} == 0x002F) { # /
2262    
2263     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2264    
2265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2266     $self->{line_prev} = $self->{line};
2267     $self->{column_prev} = $self->{column};
2268     $self->{column}++;
2269     $self->{nc}
2270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2271     } else {
2272     $self->{set_nc}->($self);
2273     }
2274    
2275     redo A;
2276     } elsif ($self->{nc} == -1) {
2277     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2278     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2279    
2280     $self->{last_stag_name} = $self->{ct}->{tag_name};
2281     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2282     if ($self->{ct}->{attributes}) {
2283    
2284     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2285     } else {
2286     ## NOTE: This state should never be reached.
2287    
2288     }
2289     } else {
2290     die "$0: $self->{ct}->{type}: Unknown token type";
2291     }
2292     $self->{state} = DATA_STATE;
2293 wakaba 1.5 $self->{s_kwd} = '';
2294 wakaba 1.1 ## Reconsume.
2295     return ($self->{ct}); # start tag or end tag
2296     redo A;
2297     } else {
2298    
2299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2300     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2301     ## reconsume
2302     redo A;
2303     }
2304     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2305 wakaba 1.11 ## XML5: "Empty tag state".
2306    
2307 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2308     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2309    
2310     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2311     ## TODO: Different type than slash in start tag
2312     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2313     if ($self->{ct}->{attributes}) {
2314    
2315     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2316     } else {
2317    
2318     }
2319     ## TODO: Test |<title></title/>|
2320     } else {
2321    
2322     $self->{self_closing} = 1;
2323     }
2324    
2325     $self->{state} = DATA_STATE;
2326 wakaba 1.5 $self->{s_kwd} = '';
2327 wakaba 1.1
2328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2329     $self->{line_prev} = $self->{line};
2330     $self->{column_prev} = $self->{column};
2331     $self->{column}++;
2332     $self->{nc}
2333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2334     } else {
2335     $self->{set_nc}->($self);
2336     }
2337    
2338    
2339     return ($self->{ct}); # start tag or end tag
2340    
2341     redo A;
2342     } elsif ($self->{nc} == -1) {
2343     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2344     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2345    
2346     $self->{last_stag_name} = $self->{ct}->{tag_name};
2347     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2348     if ($self->{ct}->{attributes}) {
2349    
2350     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2351     } else {
2352     ## NOTE: This state should never be reached.
2353    
2354     }
2355     } else {
2356     die "$0: $self->{ct}->{type}: Unknown token type";
2357     }
2358 wakaba 1.11 ## XML5: "Tag attribute name before state".
2359 wakaba 1.1 $self->{state} = DATA_STATE;
2360 wakaba 1.5 $self->{s_kwd} = '';
2361 wakaba 1.1 ## Reconsume.
2362     return ($self->{ct}); # start tag or end tag
2363     redo A;
2364     } else {
2365    
2366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2367     ## TODO: This error type is wrong.
2368     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2369     ## Reconsume.
2370     redo A;
2371     }
2372     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2373 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2374    
2375 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2376     ## consumes characters on a one-by-one basis.
2377    
2378     if ($self->{nc} == 0x003E) { # >
2379 wakaba 1.13 if ($self->{in_subset}) {
2380    
2381     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2382     } else {
2383    
2384     $self->{state} = DATA_STATE;
2385     $self->{s_kwd} = '';
2386     }
2387 wakaba 1.1
2388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2389     $self->{line_prev} = $self->{line};
2390     $self->{column_prev} = $self->{column};
2391     $self->{column}++;
2392     $self->{nc}
2393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2394     } else {
2395     $self->{set_nc}->($self);
2396     }
2397    
2398    
2399     return ($self->{ct}); # comment
2400     redo A;
2401     } elsif ($self->{nc} == -1) {
2402 wakaba 1.13 if ($self->{in_subset}) {
2403    
2404     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2405     } else {
2406    
2407     $self->{state} = DATA_STATE;
2408     $self->{s_kwd} = '';
2409     }
2410 wakaba 1.1 ## reconsume
2411    
2412     return ($self->{ct}); # comment
2413     redo A;
2414     } else {
2415    
2416     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2417     $self->{read_until}->($self->{ct}->{data},
2418     q[>],
2419     length $self->{ct}->{data});
2420    
2421     ## Stay in the state.
2422    
2423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2424     $self->{line_prev} = $self->{line};
2425     $self->{column_prev} = $self->{column};
2426     $self->{column}++;
2427     $self->{nc}
2428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2429     } else {
2430     $self->{set_nc}->($self);
2431     }
2432    
2433     redo A;
2434     }
2435     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2436 wakaba 1.14 ## XML5: "Markup declaration state".
2437 wakaba 1.1
2438     if ($self->{nc} == 0x002D) { # -
2439    
2440     $self->{state} = MD_HYPHEN_STATE;
2441    
2442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2443     $self->{line_prev} = $self->{line};
2444     $self->{column_prev} = $self->{column};
2445     $self->{column}++;
2446     $self->{nc}
2447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2448     } else {
2449     $self->{set_nc}->($self);
2450     }
2451    
2452     redo A;
2453     } elsif ($self->{nc} == 0x0044 or # D
2454     $self->{nc} == 0x0064) { # d
2455     ## ASCII case-insensitive.
2456    
2457     $self->{state} = MD_DOCTYPE_STATE;
2458 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2459 wakaba 1.1
2460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2461     $self->{line_prev} = $self->{line};
2462     $self->{column_prev} = $self->{column};
2463     $self->{column}++;
2464     $self->{nc}
2465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2466     } else {
2467     $self->{set_nc}->($self);
2468     }
2469    
2470     redo A;
2471 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2472     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2473     $self->{is_xml}) and
2474 wakaba 1.1 $self->{nc} == 0x005B) { # [
2475    
2476     $self->{state} = MD_CDATA_STATE;
2477 wakaba 1.12 $self->{kwd} = '[';
2478 wakaba 1.1
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     } else {
2491    
2492     }
2493    
2494     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2495     line => $self->{line_prev},
2496     column => $self->{column_prev} - 1);
2497     ## Reconsume.
2498     $self->{state} = BOGUS_COMMENT_STATE;
2499     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2500     line => $self->{line_prev},
2501     column => $self->{column_prev} - 1,
2502     };
2503     redo A;
2504     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2505     if ($self->{nc} == 0x002D) { # -
2506    
2507     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2508     line => $self->{line_prev},
2509     column => $self->{column_prev} - 2,
2510     };
2511 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2512 wakaba 1.1
2513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2514     $self->{line_prev} = $self->{line};
2515     $self->{column_prev} = $self->{column};
2516     $self->{column}++;
2517     $self->{nc}
2518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2519     } else {
2520     $self->{set_nc}->($self);
2521     }
2522    
2523     redo A;
2524     } else {
2525    
2526     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2527     line => $self->{line_prev},
2528     column => $self->{column_prev} - 2);
2529     $self->{state} = BOGUS_COMMENT_STATE;
2530     ## Reconsume.
2531     $self->{ct} = {type => COMMENT_TOKEN,
2532     data => '-',
2533     line => $self->{line_prev},
2534     column => $self->{column_prev} - 2,
2535     };
2536     redo A;
2537     }
2538     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2539     ## ASCII case-insensitive.
2540     if ($self->{nc} == [
2541     undef,
2542     0x004F, # O
2543     0x0043, # C
2544     0x0054, # T
2545     0x0059, # Y
2546     0x0050, # P
2547 wakaba 1.12 ]->[length $self->{kwd}] or
2548 wakaba 1.1 $self->{nc} == [
2549     undef,
2550     0x006F, # o
2551     0x0063, # c
2552     0x0074, # t
2553     0x0079, # y
2554     0x0070, # p
2555 wakaba 1.12 ]->[length $self->{kwd}]) {
2556 wakaba 1.1
2557     ## Stay in the state.
2558 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2559 wakaba 1.1
2560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2561     $self->{line_prev} = $self->{line};
2562     $self->{column_prev} = $self->{column};
2563     $self->{column}++;
2564     $self->{nc}
2565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2566     } else {
2567     $self->{set_nc}->($self);
2568     }
2569    
2570     redo A;
2571 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2572 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2573     $self->{nc} == 0x0065)) { # e
2574 wakaba 1.12 if ($self->{is_xml} and
2575     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2576 wakaba 1.10
2577     ## XML5: case-sensitive.
2578     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2579     text => 'DOCTYPE',
2580     line => $self->{line_prev},
2581     column => $self->{column_prev} - 5);
2582     } else {
2583    
2584     }
2585 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2586     $self->{ct} = {type => DOCTYPE_TOKEN,
2587     quirks => 1,
2588     line => $self->{line_prev},
2589     column => $self->{column_prev} - 7,
2590     };
2591    
2592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2593     $self->{line_prev} = $self->{line};
2594     $self->{column_prev} = $self->{column};
2595     $self->{column}++;
2596     $self->{nc}
2597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2598     } else {
2599     $self->{set_nc}->($self);
2600     }
2601    
2602     redo A;
2603     } else {
2604    
2605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2606     line => $self->{line_prev},
2607 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2608 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2609     ## Reconsume.
2610     $self->{ct} = {type => COMMENT_TOKEN,
2611 wakaba 1.12 data => $self->{kwd},
2612 wakaba 1.1 line => $self->{line_prev},
2613 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2614 wakaba 1.1 };
2615     redo A;
2616     }
2617     } elsif ($self->{state} == MD_CDATA_STATE) {
2618     if ($self->{nc} == {
2619     '[' => 0x0043, # C
2620     '[C' => 0x0044, # D
2621     '[CD' => 0x0041, # A
2622     '[CDA' => 0x0054, # T
2623     '[CDAT' => 0x0041, # A
2624 wakaba 1.12 }->{$self->{kwd}}) {
2625 wakaba 1.1
2626     ## Stay in the state.
2627 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2628 wakaba 1.1
2629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2630     $self->{line_prev} = $self->{line};
2631     $self->{column_prev} = $self->{column};
2632     $self->{column}++;
2633     $self->{nc}
2634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2635     } else {
2636     $self->{set_nc}->($self);
2637     }
2638    
2639     redo A;
2640 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2641 wakaba 1.1 $self->{nc} == 0x005B) { # [
2642 wakaba 1.6 if ($self->{is_xml} and
2643     not $self->{tainted} and
2644     @{$self->{open_elements} or []} == 0) {
2645 wakaba 1.8
2646 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2647     line => $self->{line_prev},
2648     column => $self->{column_prev} - 7);
2649     $self->{tainted} = 1;
2650 wakaba 1.8 } else {
2651    
2652 wakaba 1.6 }
2653    
2654 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2655     data => '',
2656     line => $self->{line_prev},
2657     column => $self->{column_prev} - 7};
2658     $self->{state} = CDATA_SECTION_STATE;
2659    
2660     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2661     $self->{line_prev} = $self->{line};
2662     $self->{column_prev} = $self->{column};
2663     $self->{column}++;
2664     $self->{nc}
2665     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2666     } else {
2667     $self->{set_nc}->($self);
2668     }
2669    
2670     redo A;
2671     } else {
2672    
2673     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2674     line => $self->{line_prev},
2675 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2676 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2677     ## Reconsume.
2678     $self->{ct} = {type => COMMENT_TOKEN,
2679 wakaba 1.12 data => $self->{kwd},
2680 wakaba 1.1 line => $self->{line_prev},
2681 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2682 wakaba 1.1 };
2683     redo A;
2684     }
2685     } elsif ($self->{state} == COMMENT_START_STATE) {
2686     if ($self->{nc} == 0x002D) { # -
2687    
2688     $self->{state} = COMMENT_START_DASH_STATE;
2689    
2690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2691     $self->{line_prev} = $self->{line};
2692     $self->{column_prev} = $self->{column};
2693     $self->{column}++;
2694     $self->{nc}
2695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2696     } else {
2697     $self->{set_nc}->($self);
2698     }
2699    
2700     redo A;
2701     } elsif ($self->{nc} == 0x003E) { # >
2702     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2703 wakaba 1.13 if ($self->{in_subset}) {
2704    
2705     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2706     } else {
2707    
2708     $self->{state} = DATA_STATE;
2709     $self->{s_kwd} = '';
2710     }
2711 wakaba 1.1
2712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2713     $self->{line_prev} = $self->{line};
2714     $self->{column_prev} = $self->{column};
2715     $self->{column}++;
2716     $self->{nc}
2717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2718     } else {
2719     $self->{set_nc}->($self);
2720     }
2721    
2722    
2723     return ($self->{ct}); # comment
2724    
2725     redo A;
2726     } elsif ($self->{nc} == -1) {
2727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2728 wakaba 1.13 if ($self->{in_subset}) {
2729    
2730     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2731     } else {
2732    
2733     $self->{state} = DATA_STATE;
2734     $self->{s_kwd} = '';
2735     }
2736 wakaba 1.1 ## reconsume
2737    
2738     return ($self->{ct}); # comment
2739    
2740     redo A;
2741     } else {
2742    
2743     $self->{ct}->{data} # comment
2744     .= chr ($self->{nc});
2745     $self->{state} = COMMENT_STATE;
2746    
2747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2748     $self->{line_prev} = $self->{line};
2749     $self->{column_prev} = $self->{column};
2750     $self->{column}++;
2751     $self->{nc}
2752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2753     } else {
2754     $self->{set_nc}->($self);
2755     }
2756    
2757     redo A;
2758     }
2759     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2760     if ($self->{nc} == 0x002D) { # -
2761    
2762     $self->{state} = COMMENT_END_STATE;
2763    
2764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2765     $self->{line_prev} = $self->{line};
2766     $self->{column_prev} = $self->{column};
2767     $self->{column}++;
2768     $self->{nc}
2769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2770     } else {
2771     $self->{set_nc}->($self);
2772     }
2773    
2774     redo A;
2775     } elsif ($self->{nc} == 0x003E) { # >
2776     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2777 wakaba 1.13 if ($self->{in_subset}) {
2778    
2779     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2780     } else {
2781    
2782     $self->{state} = DATA_STATE;
2783     $self->{s_kwd} = '';
2784     }
2785 wakaba 1.1
2786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2787     $self->{line_prev} = $self->{line};
2788     $self->{column_prev} = $self->{column};
2789     $self->{column}++;
2790     $self->{nc}
2791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2792     } else {
2793     $self->{set_nc}->($self);
2794     }
2795    
2796    
2797     return ($self->{ct}); # comment
2798    
2799     redo A;
2800     } elsif ($self->{nc} == -1) {
2801     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2802 wakaba 1.13 if ($self->{in_subset}) {
2803    
2804     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2805     } else {
2806    
2807     $self->{state} = DATA_STATE;
2808     $self->{s_kwd} = '';
2809     }
2810 wakaba 1.1 ## reconsume
2811    
2812     return ($self->{ct}); # comment
2813    
2814     redo A;
2815     } else {
2816    
2817     $self->{ct}->{data} # comment
2818     .= '-' . chr ($self->{nc});
2819     $self->{state} = COMMENT_STATE;
2820    
2821     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2822     $self->{line_prev} = $self->{line};
2823     $self->{column_prev} = $self->{column};
2824     $self->{column}++;
2825     $self->{nc}
2826     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2827     } else {
2828     $self->{set_nc}->($self);
2829     }
2830    
2831     redo A;
2832     }
2833     } elsif ($self->{state} == COMMENT_STATE) {
2834 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2835    
2836 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2837    
2838     $self->{state} = COMMENT_END_DASH_STATE;
2839    
2840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2841     $self->{line_prev} = $self->{line};
2842     $self->{column_prev} = $self->{column};
2843     $self->{column}++;
2844     $self->{nc}
2845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2846     } else {
2847     $self->{set_nc}->($self);
2848     }
2849    
2850     redo A;
2851     } elsif ($self->{nc} == -1) {
2852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2853 wakaba 1.13 if ($self->{in_subset}) {
2854    
2855     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856     } else {
2857    
2858     $self->{state} = DATA_STATE;
2859     $self->{s_kwd} = '';
2860     }
2861 wakaba 1.1 ## reconsume
2862    
2863     return ($self->{ct}); # comment
2864    
2865     redo A;
2866     } else {
2867    
2868     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2869     $self->{read_until}->($self->{ct}->{data},
2870     q[-],
2871     length $self->{ct}->{data});
2872    
2873     ## Stay in the state
2874    
2875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2876     $self->{line_prev} = $self->{line};
2877     $self->{column_prev} = $self->{column};
2878     $self->{column}++;
2879     $self->{nc}
2880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2881     } else {
2882     $self->{set_nc}->($self);
2883     }
2884    
2885     redo A;
2886     }
2887     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2888 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2889 wakaba 1.10
2890 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2891    
2892     $self->{state} = COMMENT_END_STATE;
2893    
2894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2895     $self->{line_prev} = $self->{line};
2896     $self->{column_prev} = $self->{column};
2897     $self->{column}++;
2898     $self->{nc}
2899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2900     } else {
2901     $self->{set_nc}->($self);
2902     }
2903    
2904     redo A;
2905     } elsif ($self->{nc} == -1) {
2906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2907 wakaba 1.13 if ($self->{in_subset}) {
2908    
2909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910     } else {
2911    
2912     $self->{state} = DATA_STATE;
2913     $self->{s_kwd} = '';
2914     }
2915 wakaba 1.1 ## reconsume
2916    
2917     return ($self->{ct}); # comment
2918    
2919     redo A;
2920     } else {
2921    
2922     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2923     $self->{state} = COMMENT_STATE;
2924    
2925     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2926     $self->{line_prev} = $self->{line};
2927     $self->{column_prev} = $self->{column};
2928     $self->{column}++;
2929     $self->{nc}
2930     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2931     } else {
2932     $self->{set_nc}->($self);
2933     }
2934    
2935     redo A;
2936     }
2937     } elsif ($self->{state} == COMMENT_END_STATE) {
2938 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2939    
2940 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2941 wakaba 1.13 if ($self->{in_subset}) {
2942    
2943     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2944     } else {
2945    
2946     $self->{state} = DATA_STATE;
2947     $self->{s_kwd} = '';
2948     }
2949 wakaba 1.1
2950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2951     $self->{line_prev} = $self->{line};
2952     $self->{column_prev} = $self->{column};
2953     $self->{column}++;
2954     $self->{nc}
2955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2956     } else {
2957     $self->{set_nc}->($self);
2958     }
2959    
2960    
2961     return ($self->{ct}); # comment
2962    
2963     redo A;
2964     } elsif ($self->{nc} == 0x002D) { # -
2965    
2966 wakaba 1.10 ## XML5: Not a parse error.
2967 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2968     line => $self->{line_prev},
2969     column => $self->{column_prev});
2970     $self->{ct}->{data} .= '-'; # comment
2971     ## Stay in the state
2972    
2973     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2974     $self->{line_prev} = $self->{line};
2975     $self->{column_prev} = $self->{column};
2976     $self->{column}++;
2977     $self->{nc}
2978     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2979     } else {
2980     $self->{set_nc}->($self);
2981     }
2982    
2983     redo A;
2984     } elsif ($self->{nc} == -1) {
2985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2986 wakaba 1.13 if ($self->{in_subset}) {
2987    
2988     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2989     } else {
2990    
2991     $self->{state} = DATA_STATE;
2992     $self->{s_kwd} = '';
2993     }
2994 wakaba 1.1 ## reconsume
2995    
2996     return ($self->{ct}); # comment
2997    
2998     redo A;
2999     } else {
3000    
3001 wakaba 1.10 ## XML5: Not a parse error.
3002 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3003     line => $self->{line_prev},
3004     column => $self->{column_prev});
3005     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3006     $self->{state} = COMMENT_STATE;
3007    
3008     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3009     $self->{line_prev} = $self->{line};
3010     $self->{column_prev} = $self->{column};
3011     $self->{column}++;
3012     $self->{nc}
3013     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3014     } else {
3015     $self->{set_nc}->($self);
3016     }
3017    
3018     redo A;
3019     }
3020     } elsif ($self->{state} == DOCTYPE_STATE) {
3021     if ($is_space->{$self->{nc}}) {
3022    
3023     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3024    
3025     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3026     $self->{line_prev} = $self->{line};
3027     $self->{column_prev} = $self->{column};
3028     $self->{column}++;
3029     $self->{nc}
3030     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3031     } else {
3032     $self->{set_nc}->($self);
3033     }
3034    
3035     redo A;
3036     } else {
3037    
3038 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3039 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3040     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3041     ## reconsume
3042     redo A;
3043     }
3044     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3045 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3046    
3047 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3048    
3049     ## Stay in the state
3050    
3051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3052     $self->{line_prev} = $self->{line};
3053     $self->{column_prev} = $self->{column};
3054     $self->{column}++;
3055     $self->{nc}
3056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3057     } else {
3058     $self->{set_nc}->($self);
3059     }
3060    
3061     redo A;
3062     } elsif ($self->{nc} == 0x003E) { # >
3063    
3064 wakaba 1.12 ## XML5: No parse error.
3065 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3066     $self->{state} = DATA_STATE;
3067 wakaba 1.5 $self->{s_kwd} = '';
3068 wakaba 1.1
3069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3070     $self->{line_prev} = $self->{line};
3071     $self->{column_prev} = $self->{column};
3072     $self->{column}++;
3073     $self->{nc}
3074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3075     } else {
3076     $self->{set_nc}->($self);
3077     }
3078    
3079    
3080     return ($self->{ct}); # DOCTYPE (quirks)
3081    
3082     redo A;
3083     } elsif ($self->{nc} == -1) {
3084    
3085     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3086     $self->{state} = DATA_STATE;
3087 wakaba 1.5 $self->{s_kwd} = '';
3088 wakaba 1.1 ## reconsume
3089    
3090     return ($self->{ct}); # DOCTYPE (quirks)
3091    
3092     redo A;
3093 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3094    
3095     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3096     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3097 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3098     $self->{in_subset} = 1;
3099 wakaba 1.12
3100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3101     $self->{line_prev} = $self->{line};
3102     $self->{column_prev} = $self->{column};
3103     $self->{column}++;
3104     $self->{nc}
3105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3106     } else {
3107     $self->{set_nc}->($self);
3108     }
3109    
3110 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3111 wakaba 1.12 redo A;
3112 wakaba 1.1 } else {
3113    
3114     $self->{ct}->{name} = chr $self->{nc};
3115     delete $self->{ct}->{quirks};
3116     $self->{state} = DOCTYPE_NAME_STATE;
3117    
3118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119     $self->{line_prev} = $self->{line};
3120     $self->{column_prev} = $self->{column};
3121     $self->{column}++;
3122     $self->{nc}
3123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3124     } else {
3125     $self->{set_nc}->($self);
3126     }
3127    
3128     redo A;
3129     }
3130     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3131 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3132    
3133     ## ISSUE: Redundant "First," in the spec.
3134    
3135 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3136    
3137     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3138    
3139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3140     $self->{line_prev} = $self->{line};
3141     $self->{column_prev} = $self->{column};
3142     $self->{column}++;
3143     $self->{nc}
3144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3145     } else {
3146     $self->{set_nc}->($self);
3147     }
3148    
3149     redo A;
3150     } elsif ($self->{nc} == 0x003E) { # >
3151    
3152     $self->{state} = DATA_STATE;
3153 wakaba 1.5 $self->{s_kwd} = '';
3154 wakaba 1.1
3155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3156     $self->{line_prev} = $self->{line};
3157     $self->{column_prev} = $self->{column};
3158     $self->{column}++;
3159     $self->{nc}
3160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3161     } else {
3162     $self->{set_nc}->($self);
3163     }
3164    
3165    
3166     return ($self->{ct}); # DOCTYPE
3167    
3168     redo A;
3169     } elsif ($self->{nc} == -1) {
3170    
3171     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3172     $self->{state} = DATA_STATE;
3173 wakaba 1.5 $self->{s_kwd} = '';
3174 wakaba 1.1 ## reconsume
3175    
3176     $self->{ct}->{quirks} = 1;
3177     return ($self->{ct}); # DOCTYPE
3178    
3179     redo A;
3180 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3181    
3182     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3183 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3184     $self->{in_subset} = 1;
3185 wakaba 1.12
3186     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3187     $self->{line_prev} = $self->{line};
3188     $self->{column_prev} = $self->{column};
3189     $self->{column}++;
3190     $self->{nc}
3191     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3192     } else {
3193     $self->{set_nc}->($self);
3194     }
3195    
3196 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3197 wakaba 1.12 redo A;
3198 wakaba 1.1 } else {
3199    
3200     $self->{ct}->{name}
3201     .= chr ($self->{nc}); # DOCTYPE
3202     ## Stay in the state
3203    
3204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3205     $self->{line_prev} = $self->{line};
3206     $self->{column_prev} = $self->{column};
3207     $self->{column}++;
3208     $self->{nc}
3209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3210     } else {
3211     $self->{set_nc}->($self);
3212     }
3213    
3214     redo A;
3215     }
3216     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3217 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3218     ## state", but implemented differently.
3219    
3220 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3221    
3222     ## Stay in the state
3223    
3224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3225     $self->{line_prev} = $self->{line};
3226     $self->{column_prev} = $self->{column};
3227     $self->{column}++;
3228     $self->{nc}
3229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3230     } else {
3231     $self->{set_nc}->($self);
3232     }
3233    
3234     redo A;
3235     } elsif ($self->{nc} == 0x003E) { # >
3236 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3237    
3238     $self->{state} = DATA_STATE;
3239     $self->{s_kwd} = '';
3240     } else {
3241    
3242     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3243     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3244     }
3245 wakaba 1.1
3246    
3247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248     $self->{line_prev} = $self->{line};
3249     $self->{column_prev} = $self->{column};
3250     $self->{column}++;
3251     $self->{nc}
3252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253     } else {
3254     $self->{set_nc}->($self);
3255     }
3256    
3257 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3258 wakaba 1.1 redo A;
3259     } elsif ($self->{nc} == -1) {
3260 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3261    
3262     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3263     $self->{state} = DATA_STATE;
3264     $self->{s_kwd} = '';
3265     $self->{ct}->{quirks} = 1;
3266     } else {
3267    
3268     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3269     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3270     }
3271 wakaba 1.1
3272 wakaba 1.16 ## Reconsume.
3273     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3274 wakaba 1.1 redo A;
3275     } elsif ($self->{nc} == 0x0050 or # P
3276     $self->{nc} == 0x0070) { # p
3277 wakaba 1.12
3278 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3279 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3280 wakaba 1.1
3281     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3282     $self->{line_prev} = $self->{line};
3283     $self->{column_prev} = $self->{column};
3284     $self->{column}++;
3285     $self->{nc}
3286     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3287     } else {
3288     $self->{set_nc}->($self);
3289     }
3290    
3291     redo A;
3292     } elsif ($self->{nc} == 0x0053 or # S
3293     $self->{nc} == 0x0073) { # s
3294 wakaba 1.12
3295 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3296 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3297    
3298     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3299     $self->{line_prev} = $self->{line};
3300     $self->{column_prev} = $self->{column};
3301     $self->{column}++;
3302     $self->{nc}
3303     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3304     } else {
3305     $self->{set_nc}->($self);
3306     }
3307    
3308     redo A;
3309 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3310     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3311     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3312    
3313     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3314     $self->{ct}->{value} = ''; # ENTITY
3315    
3316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3317     $self->{line_prev} = $self->{line};
3318     $self->{column_prev} = $self->{column};
3319     $self->{column}++;
3320     $self->{nc}
3321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3322     } else {
3323     $self->{set_nc}->($self);
3324     }
3325    
3326     redo A;
3327     } elsif ($self->{nc} == 0x0027 and # '
3328     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3329     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3330    
3331     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3332     $self->{ct}->{value} = ''; # ENTITY
3333    
3334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3335     $self->{line_prev} = $self->{line};
3336     $self->{column_prev} = $self->{column};
3337     $self->{column}++;
3338     $self->{nc}
3339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3340     } else {
3341     $self->{set_nc}->($self);
3342     }
3343    
3344     redo A;
3345 wakaba 1.16 } elsif ($self->{is_xml} and
3346     $self->{ct}->{type} == DOCTYPE_TOKEN and
3347     $self->{nc} == 0x005B) { # [
3348 wakaba 1.12
3349     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3350     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3351 wakaba 1.13 $self->{in_subset} = 1;
3352 wakaba 1.1
3353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3354     $self->{line_prev} = $self->{line};
3355     $self->{column_prev} = $self->{column};
3356     $self->{column}++;
3357     $self->{nc}
3358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3359     } else {
3360     $self->{set_nc}->($self);
3361     }
3362    
3363 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3364 wakaba 1.1 redo A;
3365     } else {
3366 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3367    
3368     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3369    
3370     $self->{ct}->{quirks} = 1;
3371     $self->{state} = BOGUS_DOCTYPE_STATE;
3372     } else {
3373    
3374     $self->{state} = BOGUS_MD_STATE;
3375     }
3376 wakaba 1.1
3377    
3378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3379     $self->{line_prev} = $self->{line};
3380     $self->{column_prev} = $self->{column};
3381     $self->{column}++;
3382     $self->{nc}
3383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3384     } else {
3385     $self->{set_nc}->($self);
3386     }
3387    
3388     redo A;
3389     }
3390     } elsif ($self->{state} == PUBLIC_STATE) {
3391     ## ASCII case-insensitive
3392     if ($self->{nc} == [
3393     undef,
3394     0x0055, # U
3395     0x0042, # B
3396     0x004C, # L
3397     0x0049, # I
3398 wakaba 1.12 ]->[length $self->{kwd}] or
3399 wakaba 1.1 $self->{nc} == [
3400     undef,
3401     0x0075, # u
3402     0x0062, # b
3403     0x006C, # l
3404     0x0069, # i
3405 wakaba 1.12 ]->[length $self->{kwd}]) {
3406 wakaba 1.1
3407     ## Stay in the state.
3408 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3409 wakaba 1.1
3410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3411     $self->{line_prev} = $self->{line};
3412     $self->{column_prev} = $self->{column};
3413     $self->{column}++;
3414     $self->{nc}
3415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3416     } else {
3417     $self->{set_nc}->($self);
3418     }
3419    
3420     redo A;
3421 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3422 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3423     $self->{nc} == 0x0063)) { # c
3424 wakaba 1.12 if ($self->{is_xml} and
3425     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3426    
3427     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3428     text => 'PUBLIC',
3429     line => $self->{line_prev},
3430     column => $self->{column_prev} - 4);
3431     } else {
3432    
3433     }
3434 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3435    
3436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3437     $self->{line_prev} = $self->{line};
3438     $self->{column_prev} = $self->{column};
3439     $self->{column}++;
3440     $self->{nc}
3441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3442     } else {
3443     $self->{set_nc}->($self);
3444     }
3445    
3446     redo A;
3447     } else {
3448 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3449 wakaba 1.1 line => $self->{line_prev},
3450 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3451 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3452    
3453     $self->{ct}->{quirks} = 1;
3454     $self->{state} = BOGUS_DOCTYPE_STATE;
3455     } else {
3456    
3457     $self->{state} = BOGUS_MD_STATE;
3458     }
3459 wakaba 1.1 ## Reconsume.
3460     redo A;
3461     }
3462     } elsif ($self->{state} == SYSTEM_STATE) {
3463     ## ASCII case-insensitive
3464     if ($self->{nc} == [
3465     undef,
3466     0x0059, # Y
3467     0x0053, # S
3468     0x0054, # T
3469     0x0045, # E
3470 wakaba 1.12 ]->[length $self->{kwd}] or
3471 wakaba 1.1 $self->{nc} == [
3472     undef,
3473     0x0079, # y
3474     0x0073, # s
3475     0x0074, # t
3476     0x0065, # e
3477 wakaba 1.12 ]->[length $self->{kwd}]) {
3478 wakaba 1.1
3479     ## Stay in the state.
3480 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3481 wakaba 1.1
3482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3483     $self->{line_prev} = $self->{line};
3484     $self->{column_prev} = $self->{column};
3485     $self->{column}++;
3486     $self->{nc}
3487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3488     } else {
3489     $self->{set_nc}->($self);
3490     }
3491    
3492     redo A;
3493 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3494 wakaba 1.1 ($self->{nc} == 0x004D or # M
3495     $self->{nc} == 0x006D)) { # m
3496 wakaba 1.12 if ($self->{is_xml} and
3497     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3498    
3499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3500     text => 'SYSTEM',
3501     line => $self->{line_prev},
3502     column => $self->{column_prev} - 4);
3503     } else {
3504    
3505     }
3506 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3507    
3508     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3509     $self->{line_prev} = $self->{line};
3510     $self->{column_prev} = $self->{column};
3511     $self->{column}++;
3512     $self->{nc}
3513     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3514     } else {
3515     $self->{set_nc}->($self);
3516     }
3517    
3518     redo A;
3519     } else {
3520 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3521 wakaba 1.1 line => $self->{line_prev},
3522 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3523 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3524    
3525     $self->{ct}->{quirks} = 1;
3526     $self->{state} = BOGUS_DOCTYPE_STATE;
3527     } else {
3528    
3529     $self->{state} = BOGUS_MD_STATE;
3530     }
3531 wakaba 1.1 ## Reconsume.
3532     redo A;
3533     }
3534     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3535     if ($is_space->{$self->{nc}}) {
3536    
3537     ## Stay in the state
3538    
3539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3540     $self->{line_prev} = $self->{line};
3541     $self->{column_prev} = $self->{column};
3542     $self->{column}++;
3543     $self->{nc}
3544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3545     } else {
3546     $self->{set_nc}->($self);
3547     }
3548    
3549     redo A;
3550     } elsif ($self->{nc} eq 0x0022) { # "
3551    
3552     $self->{ct}->{pubid} = ''; # DOCTYPE
3553     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3554    
3555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3556     $self->{line_prev} = $self->{line};
3557     $self->{column_prev} = $self->{column};
3558     $self->{column}++;
3559     $self->{nc}
3560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3561     } else {
3562     $self->{set_nc}->($self);
3563     }
3564    
3565     redo A;
3566     } elsif ($self->{nc} eq 0x0027) { # '
3567    
3568     $self->{ct}->{pubid} = ''; # DOCTYPE
3569     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3570    
3571     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3572     $self->{line_prev} = $self->{line};
3573     $self->{column_prev} = $self->{column};
3574     $self->{column}++;
3575     $self->{nc}
3576     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3577     } else {
3578     $self->{set_nc}->($self);
3579     }
3580    
3581     redo A;
3582     } elsif ($self->{nc} eq 0x003E) { # >
3583 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3584    
3585     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3586    
3587     $self->{state} = DATA_STATE;
3588     $self->{s_kwd} = '';
3589     $self->{ct}->{quirks} = 1;
3590     } else {
3591    
3592     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3593     }
3594 wakaba 1.1
3595    
3596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3597     $self->{line_prev} = $self->{line};
3598     $self->{column_prev} = $self->{column};
3599     $self->{column}++;
3600     $self->{nc}
3601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3602     } else {
3603     $self->{set_nc}->($self);
3604     }
3605    
3606 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3607 wakaba 1.1 redo A;
3608     } elsif ($self->{nc} == -1) {
3609 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3610    
3611     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3612     $self->{state} = DATA_STATE;
3613     $self->{s_kwd} = '';
3614     $self->{ct}->{quirks} = 1;
3615     } else {
3616    
3617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3618     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3619     }
3620 wakaba 1.1
3621     ## reconsume
3622     return ($self->{ct}); # DOCTYPE
3623     redo A;
3624 wakaba 1.16 } elsif ($self->{is_xml} and
3625     $self->{ct}->{type} == DOCTYPE_TOKEN and
3626     $self->{nc} == 0x005B) { # [
3627 wakaba 1.12
3628     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3629     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3630     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3631 wakaba 1.13 $self->{in_subset} = 1;
3632 wakaba 1.12
3633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3634     $self->{line_prev} = $self->{line};
3635     $self->{column_prev} = $self->{column};
3636     $self->{column}++;
3637     $self->{nc}
3638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3639     } else {
3640     $self->{set_nc}->($self);
3641     }
3642    
3643 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3644 wakaba 1.12 redo A;
3645 wakaba 1.1 } else {
3646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3647    
3648 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3649    
3650     $self->{ct}->{quirks} = 1;
3651     $self->{state} = BOGUS_DOCTYPE_STATE;
3652     } else {
3653    
3654     $self->{state} = BOGUS_MD_STATE;
3655     }
3656    
3657 wakaba 1.1
3658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3659     $self->{line_prev} = $self->{line};
3660     $self->{column_prev} = $self->{column};
3661     $self->{column}++;
3662     $self->{nc}
3663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3664     } else {
3665     $self->{set_nc}->($self);
3666     }
3667    
3668     redo A;
3669     }
} elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
  ## Inside a public identifier literal delimited by U+0022 (").
  ## The current character is classified first; every case then leaves
  ## through the shared exit sequence at the bottom of the branch.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($nc == 0x0022) { # "
    ## Closing quotation mark: the literal is complete.
    $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  } elsif ($nc == 0x003E or $nc == -1) { # > or end of input
    ## The literal was never closed; the token is emitted as it stands.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $consume = 0 if $nc == -1; # end of input is reconsumed
    $emit = 1;
  } else {
    ## Ordinary character: append it to |pubid|, then pass |pubid|, the
    ## delimiter set and the current length to the |read_until| callback
    ## (presumably it appends the following run of non-delimiter
    ## characters -- confirm against its definition).
    $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
    $self->{read_until}->($self->{ct}->{pubid}, q[">],
                          length $self->{ct}->{pubid});
    ## Stay in the state.
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
  ## Inside a public identifier literal delimited by U+0027 (').
  ## The current character is classified first; every case then leaves
  ## through the shared exit sequence at the bottom of the branch.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($nc == 0x0027) { # '
    ## Closing apostrophe: the literal is complete.
    $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  } elsif ($nc == 0x003E or $nc == -1) { # > or end of input
    ## The literal was never closed; the token is emitted as it stands.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $consume = 0 if $nc == -1; # end of input is reconsumed
    $emit = 1;
  } else {
    ## Ordinary character: append it to |pubid|, then pass |pubid|, the
    ## delimiter set and the current length to the |read_until| callback
    ## (presumably it appends the following run of non-delimiter
    ## characters -- confirm against its definition).
    $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
    $self->{read_until}->($self->{ct}->{pubid}, q['>],
                          length $self->{ct}->{pubid});
    ## Stay in the state.
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
  ## A public identifier literal has been closed; a system identifier
  ## literal, ">", or (for an XML DOCTYPE) "[" may follow.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($is_space->{$nc}) {
    ## White space is skipped.  Stay in the state.
  } elsif ($nc == 0x0022) { # "
    ## A double-quoted system identifier literal starts.
    $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  } elsif ($nc == 0x0027) { # '
    ## A single-quoted system identifier literal starts.
    $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  } elsif ($nc == 0x003E) { # >
    ## End of the declaration without a system identifier.  That is
    ## reported for an XML DOCTYPE and for every non-DOCTYPE token
    ## except NOTATION.
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      if ($self->{is_xml}) {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
      }
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
    } else {
      unless ($self->{ct}->{type} == NOTATION_TOKEN) {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
      }
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit = 1;
  } elsif ($nc == -1) {
    ## End of input inside the declaration.
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $consume = 0; # reconsume
    $emit = 1;
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $nc == 0x005B) { # [
    ## XML DOCTYPE: the internal subset opens although no system
    ## identifier literal was given.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit = 1;
  } else {
    ## Anything else makes the rest of the declaration bogus.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
  ## Waiting for the opening quote of a system identifier literal.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when nothing (more) is to be consumed below
  my $emit = 0;    # set when the current token is handed back

  if ($is_space->{$nc}) {
    ## White space is skipped.  Stay in the state.
  } elsif ($nc == 0x0022) { # "
    ## A double-quoted system identifier literal starts.
    $self->{ct}->{sysid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  } elsif ($nc == 0x0027) { # '
    ## A single-quoted system identifier literal starts.
    $self->{ct}->{sysid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  } elsif ($nc == 0x003E) { # >
    ## The declaration ends before any system identifier literal.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');

    ## In this case the ">" is consumed *before* the state is switched,
    ## so the character is fetched here rather than in the shared exit
    ## sequence below.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
    $consume = 0; # already consumed above

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit = 1;
  } elsif ($nc == -1) {
    ## End of input inside the declaration.
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $consume = 0; # reconsume
    $emit = 1;
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $nc == 0x005B) { # [
    ## XML DOCTYPE: the internal subset opens although no system
    ## identifier literal was given.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit = 1;
  } else {
    ## Anything else makes the rest of the declaration bogus.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
  ## Inside a system identifier literal delimited by U+0022 (").
  ## ">" terminates the literal early only when not parsing XML.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($nc == 0x0022) { # "
    ## Closing quotation mark: the literal is complete.
    $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  } elsif ($nc == -1 or
           (not $self->{is_xml} and $nc == 0x003E)) { # end of input or >
    ## The literal was never closed; the token is emitted as it stands.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $consume = 0 if $nc == -1; # end of input is reconsumed
    $emit = 1;
  } else {
    ## Ordinary character: append it to |sysid|, then pass |sysid|, the
    ## delimiter set and the current length to the |read_until| callback
    ## (presumably it appends the following run of non-delimiter
    ## characters -- confirm against its definition).
    $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
    $self->{read_until}->($self->{ct}->{sysid}, q[">],
                          length $self->{ct}->{sysid});
    ## Stay in the state.
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
  ## Inside a system identifier literal delimited by U+0027 (').
  ## ">" terminates the literal early only when not parsing XML.
  ##
  ## The ">" case now chooses the follow-up state from the token type,
  ## exactly as the end-of-input case of this state and the
  ## double-quoted sibling state do.  It used to switch to the data state
  ## and set |quirks| unconditionally.  For DOCTYPE tokens the outcome is
  ## unchanged.
  if ($self->{nc} == 0x0027) { # '
    ## Closing apostrophe: the literal is complete.
    $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;

    ## Consume the apostrophe.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

    redo A;
  } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
    ## The literal was never closed; the token is emitted as it stands.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }

    ## Consume the ">".
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } elsif ($self->{nc} == -1) {
    ## End of input inside the literal.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }

    ## reconsume
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } else {
    ## Ordinary character: append it to |sysid|, then pass |sysid|, the
    ## delimiter set and the current length to the |read_until| callback
    ## (presumably it appends the following run of non-delimiter
    ## characters -- confirm against its definition).
    $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
    $self->{read_until}->($self->{ct}->{sysid}, q['>],
                          length $self->{ct}->{sysid});

    ## Stay in the state

    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

    redo A;
  }
} elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
  ## A system identifier literal has been closed.  Only ">", white
  ## space, "[" (XML DOCTYPE) or the start of the NDATA keyword (general
  ## entity) are accepted here without switching to a bogus state.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($is_space->{$nc}) {
    ## For a general entity, white space leads to the state that looks
    ## for NDATA; for every other token it is simply skipped.
    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
      $self->{state} = BEFORE_NDATA_STATE;
    }
  } elsif ($nc == 0x003E) { # >
    ## Regular end of the declaration.
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit = 1;
  } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
           ($nc == 0x004E or # N
            $nc == 0x006E)) { # n
    ## The keyword starts right after the literal, with no white space
    ## in between.  |kwd| begins with this character.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
    $self->{state} = NDATA_STATE;
    $self->{kwd} = chr $self->{nc};
  } elsif ($nc == -1) {
    ## End of input inside the declaration.
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $consume = 0; # reconsume
    $emit = 1;
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $nc == 0x005B) { # [
    ## XML DOCTYPE: the internal subset opens.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit = 1;
  } else {
    ## Anything else makes the rest of the declaration bogus.  Note
    ## that the quirks flag of a DOCTYPE token is left untouched here
    ## (the assignment is disabled in the source).
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      #$self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == BEFORE_NDATA_STATE) {
  ## After the white space that follows the system identifier literal
  ## of a general entity: ">" ends the declaration, "N"/"n" starts the
  ## keyword handled by |NDATA_STATE|.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($is_space->{$nc}) {
    ## Further white space is skipped.  Stay in the state.
  } elsif ($nc == 0x003E) { # >
    ## Regular end of the declaration.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($nc == 0x004E or # N
           $nc == 0x006E) { # n
    ## First letter of the keyword; |kwd| begins with this character.
    $self->{state} = NDATA_STATE;
    $self->{kwd} = chr $nc;
  } elsif ($nc == -1) {
    ## End of input inside the declaration.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $consume = 0; # reconsume
    $emit = 1;
  } else {
    ## Anything else makes the rest of the declaration bogus.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
    $self->{state} = BOGUS_MD_STATE;
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # ENTITY
  redo A;
} elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
  ## The DOCTYPE is malformed: its remaining characters are thrown
  ## away until ">", end of input, or (in XML) "[" is met.
  my $nc = $self->{nc};
  my $consume = 1; # cleared when the current character is reconsumed
  my $emit = 0;    # set when the current token is handed back

  if ($nc == 0x003E or $nc == -1) { # > or end of input
    ## The DOCTYPE ends; tokenization goes on in the data state.
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    $consume = 0 if $nc == -1; # end of input is reconsumed
    $emit = 1;
  } elsif ($self->{is_xml} and $nc == 0x005B) { # [
    ## XML: the internal subset opens.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit = 1;
  } else {
    ## The |read_until| callback is given a scratch string that is never
    ## looked at afterwards, the delimiter set |>[| and offset 0
    ## (presumably it skips the following run of non-delimiter
    ## characters -- confirm against its definition).
    my $discarded = '';
    $self->{read_until}->($discarded, q{>[}, 0);
    ## Stay in the state.
  }

  if ($consume) {
    ## Move on to the next input character: read it from |char_buffer|
    ## while unread characters remain, else defer to the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE
  redo A;
} elsif ($self->{state} == CDATA_SECTION_STATE) {
  ## NOTE: The "CDATA section state" of the spec is implemented jointly
  ## by three states: |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|
  ## and |CDATA_SECTION_MSE2_STATE|.

  ## XML5: "CDATA state".

  my $nc = $self->{nc};

  if ($nc == -1) {
    ## End of input inside the section.  Only XML reports it.
    if ($self->{is_xml}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
    }

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## Reconsume.

    ## The character token is emitted only when it has collected data;
    ## otherwise there is no token to emit and $self->{ct} is discarded.
    return ($self->{ct}) if length $self->{ct}->{data}; # character
    redo A;
  }

  if ($nc == 0x005D) { # ]
    ## Possibly the first character of the section end marker.
    $self->{state} = CDATA_SECTION_MSE1_STATE;
  } else {
    ## Ordinary character: append it to the token data, then pass the
    ## data, the delimiter |]| and the current length to the |read_until|
    ## callback (presumably it appends the following run of
    ## non-delimiter characters -- confirm against its definition).
    $self->{ct}->{data} .= chr $nc;
    $self->{read_until}->($self->{ct}->{data},
                          q<]>,
                          length $self->{ct}->{data});
    ## Stay in the state.
  }

  ## Move on to the next input character: read it from |char_buffer|
  ## while unread characters remain, else defer to the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  redo A;

  ## ISSUE: "text tokens" in spec.
4585     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4586 wakaba 1.10 ## XML5: "CDATA bracket state".
4587    
4588 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4589    
4590     $self->{state} = CDATA_SECTION_MSE2_STATE;
4591    
4592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4593     $self->{line_prev} = $self->{line};
4594     $self->{column_prev} = $self->{column};
4595     $self->{column}++;
4596     $self->{nc}
4597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4598     } else {
4599     $self->{set_nc}->($self);
4600     }
4601    
4602     redo A;
4603     } else {
4604    
4605 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4606 wakaba 1.1 $self->{ct}->{data} .= ']';
4607 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4608 wakaba 1.1 ## Reconsume.
4609     redo A;
4610     }
4611     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4612 wakaba 1.10 ## XML5: "CDATA end state".
4613    
4614 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4615     $self->{state} = DATA_STATE;
4616 wakaba 1.5 $self->{s_kwd} = '';
4617 wakaba 1.1
4618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4619     $self->{line_prev} = $self->{line};
4620     $self->{column_prev} = $self->{column};
4621     $self->{column}++;
4622     $self->{nc}
4623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4624     } else {
4625     $self->{set_nc}->($self);
4626     }
4627    
4628     if (length $self->{ct}->{data}) { # character
4629    
4630     return ($self->{ct}); # character
4631     } else {
4632    
4633     ## No token to emit. $self->{ct} is discarded.
4634     }
4635     redo A;
4636     } elsif ($self->{nc} == 0x005D) { # ]
4637     # character
4638     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4639     ## Stay in the state.
4640    
4641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642     $self->{line_prev} = $self->{line};
4643     $self->{column_prev} = $self->{column};
4644     $self->{column}++;
4645     $self->{nc}
4646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647     } else {
4648     $self->{set_nc}->($self);
4649     }
4650    
4651     redo A;
4652     } else {
4653    
4654     $self->{ct}->{data} .= ']]'; # character
4655     $self->{state} = CDATA_SECTION_STATE;
4656 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4657 wakaba 1.1 redo A;
4658     }
4659     } elsif ($self->{state} == ENTITY_STATE) {
4660     if ($is_space->{$self->{nc}} or
4661     {
4662     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4663     $self->{entity_add} => 1,
4664     }->{$self->{nc}}) {
4665 wakaba 1.22 if ($self->{is_xml}) {
4666    
4667     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4668     line => $self->{line_prev},
4669     column => $self->{column_prev}
4670     + ($self->{nc} == -1 ? 1 : 0));
4671     } else {
4672    
4673     ## No error
4674     }
4675 wakaba 1.1 ## Don't consume
4676     ## Return nothing.
4677     #
4678     } elsif ($self->{nc} == 0x0023) { # #
4679    
4680     $self->{state} = ENTITY_HASH_STATE;
4681 wakaba 1.12 $self->{kwd} = '#';
4682 wakaba 1.1
4683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4684     $self->{line_prev} = $self->{line};
4685     $self->{column_prev} = $self->{column};
4686     $self->{column}++;
4687     $self->{nc}
4688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4689     } else {
4690     $self->{set_nc}->($self);
4691     }
4692    
4693     redo A;
4694 wakaba 1.22 } elsif ($self->{is_xml} or
4695     (0x0041 <= $self->{nc} and
4696 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4697     (0x0061 <= $self->{nc} and
4698     $self->{nc} <= 0x007A)) { # a..z
4699    
4700     require Whatpm::_NamedEntityList;
4701     $self->{state} = ENTITY_NAME_STATE;
4702 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4703     $self->{entity__value} = $self->{kwd};
4704 wakaba 1.1 $self->{entity__match} = 0;
4705    
4706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4707     $self->{line_prev} = $self->{line};
4708     $self->{column_prev} = $self->{column};
4709     $self->{column}++;
4710     $self->{nc}
4711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4712     } else {
4713     $self->{set_nc}->($self);
4714     }
4715    
4716     redo A;
4717     } else {
4718    
4719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4720     ## Return nothing.
4721     #
4722     }
4723    
4724     ## NOTE: No character is consumed by the "consume a character
4725     ## reference" algorithm. In other words, there is an "&" character
4726     ## that does not introduce a character reference, which would be
4727     ## appended to the parent element or the attribute value in later
4728     ## process of the tokenizer.
4729    
4730     if ($self->{prev_state} == DATA_STATE) {
4731    
4732     $self->{state} = $self->{prev_state};
4733 wakaba 1.5 $self->{s_kwd} = '';
4734 wakaba 1.1 ## Reconsume.
4735     return ({type => CHARACTER_TOKEN, data => '&',
4736     line => $self->{line_prev},
4737     column => $self->{column_prev},
4738     });
4739     redo A;
4740     } else {
4741    
4742     $self->{ca}->{value} .= '&';
4743     $self->{state} = $self->{prev_state};
4744 wakaba 1.5 $self->{s_kwd} = '';
4745 wakaba 1.1 ## Reconsume.
4746     redo A;
4747     }
4748     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4749 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4750 wakaba 1.1
4751     $self->{state} = HEXREF_X_STATE;
4752 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4753 wakaba 1.1
4754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4755     $self->{line_prev} = $self->{line};
4756     $self->{column_prev} = $self->{column};
4757     $self->{column}++;
4758     $self->{nc}
4759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4760     } else {
4761     $self->{set_nc}->($self);
4762     }
4763    
4764     redo A;
4765 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4766    
4767     if ($self->{is_xml}) {
4768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4769     }
4770     $self->{state} = HEXREF_X_STATE;
4771     $self->{kwd} .= chr $self->{nc};
4772    
4773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4774     $self->{line_prev} = $self->{line};
4775     $self->{column_prev} = $self->{column};
4776     $self->{column}++;
4777     $self->{nc}
4778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4779     } else {
4780     $self->{set_nc}->($self);
4781     }
4782    
4783     redo A;
4784 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4785     $self->{nc} <= 0x0039) { # 0..9
4786    
4787     $self->{state} = NCR_NUM_STATE;
4788 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4789 wakaba 1.1
4790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4791     $self->{line_prev} = $self->{line};
4792     $self->{column_prev} = $self->{column};
4793     $self->{column}++;
4794     $self->{nc}
4795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4796     } else {
4797     $self->{set_nc}->($self);
4798     }
4799    
4800     redo A;
4801     } else {
4802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4803     line => $self->{line_prev},
4804     column => $self->{column_prev} - 1);
4805    
4806     ## NOTE: According to the spec algorithm, nothing is returned,
4807     ## and then "&#" is appended to the parent element or the attribute
4808     ## value in the later processing.
4809    
4810     if ($self->{prev_state} == DATA_STATE) {
4811    
4812     $self->{state} = $self->{prev_state};
4813 wakaba 1.5 $self->{s_kwd} = '';
4814 wakaba 1.1 ## Reconsume.
4815     return ({type => CHARACTER_TOKEN,
4816     data => '&#',
4817     line => $self->{line_prev},
4818     column => $self->{column_prev} - 1,
4819     });
4820     redo A;
4821     } else {
4822    
4823     $self->{ca}->{value} .= '&#';
4824     $self->{state} = $self->{prev_state};
4825 wakaba 1.5 $self->{s_kwd} = '';
4826 wakaba 1.1 ## Reconsume.
4827     redo A;
4828     }
4829     }
4830     } elsif ($self->{state} == NCR_NUM_STATE) {
4831     if (0x0030 <= $self->{nc} and
4832     $self->{nc} <= 0x0039) { # 0..9
4833    
4834 wakaba 1.12 $self->{kwd} *= 10;
4835     $self->{kwd} += $self->{nc} - 0x0030;
4836 wakaba 1.1
4837     ## Stay in the state.
4838    
4839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4840     $self->{line_prev} = $self->{line};
4841     $self->{column_prev} = $self->{column};
4842     $self->{column}++;
4843     $self->{nc}
4844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4845     } else {
4846     $self->{set_nc}->($self);
4847     }
4848    
4849     redo A;
4850     } elsif ($self->{nc} == 0x003B) { # ;
4851    
4852    
4853     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4854     $self->{line_prev} = $self->{line};
4855     $self->{column_prev} = $self->{column};
4856     $self->{column}++;
4857     $self->{nc}
4858     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4859     } else {
4860     $self->{set_nc}->($self);
4861     }
4862    
4863     #
4864     } else {
4865    
4866     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4867     ## Reconsume.
4868     #
4869     }
4870    
4871 wakaba 1.12 my $code = $self->{kwd};
4872 wakaba 1.1 my $l = $self->{line_prev};
4873     my $c = $self->{column_prev};
4874 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4875     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4876     ($self->{is_xml} and $code == 0x0000)) {
4877 wakaba 1.1
4878     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4879     text => (sprintf 'U+%04X', $code),
4880     line => $l, column => $c);
4881     $code = $charref_map->{$code};
4882     } elsif ($code > 0x10FFFF) {
4883    
4884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4885     text => (sprintf 'U-%08X', $code),
4886     line => $l, column => $c);
4887     $code = 0xFFFD;
4888     }
4889    
4890     if ($self->{prev_state} == DATA_STATE) {
4891    
4892     $self->{state} = $self->{prev_state};
4893 wakaba 1.5 $self->{s_kwd} = '';
4894 wakaba 1.1 ## Reconsume.
4895     return ({type => CHARACTER_TOKEN, data => chr $code,
4896 wakaba 1.7 has_reference => 1,
4897 wakaba 1.1 line => $l, column => $c,
4898     });
4899     redo A;
4900     } else {
4901    
4902     $self->{ca}->{value} .= chr $code;
4903     $self->{ca}->{has_reference} = 1;
4904     $self->{state} = $self->{prev_state};
4905 wakaba 1.5 $self->{s_kwd} = '';
4906 wakaba 1.1 ## Reconsume.
4907     redo A;
4908     }
4909     } elsif ($self->{state} == HEXREF_X_STATE) {
4910     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4911     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4912     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4913     # 0..9, A..F, a..f
4914    
4915     $self->{state} = HEXREF_HEX_STATE;
4916 wakaba 1.12 $self->{kwd} = 0;
4917 wakaba 1.1 ## Reconsume.
4918     redo A;
4919     } else {
4920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4921     line => $self->{line_prev},
4922     column => $self->{column_prev} - 2);
4923    
4924     ## NOTE: According to the spec algorithm, nothing is returned,
4925     ## and then "&#" followed by "X" or "x" is appended to the parent
4926     ## element or the attribute value in the later processing.
4927    
4928     if ($self->{prev_state} == DATA_STATE) {
4929    
4930     $self->{state} = $self->{prev_state};
4931 wakaba 1.5 $self->{s_kwd} = '';
4932 wakaba 1.1 ## Reconsume.
4933     return ({type => CHARACTER_TOKEN,
4934 wakaba 1.12 data => '&' . $self->{kwd},
4935 wakaba 1.1 line => $self->{line_prev},
4936 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4937 wakaba 1.1 });
4938     redo A;
4939     } else {
4940    
4941 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4942 wakaba 1.1 $self->{state} = $self->{prev_state};
4943 wakaba 1.5 $self->{s_kwd} = '';
4944 wakaba 1.1 ## Reconsume.
4945     redo A;
4946     }
4947     }
4948     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4949     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4950     # 0..9
4951    
4952 wakaba 1.12 $self->{kwd} *= 0x10;
4953     $self->{kwd} += $self->{nc} - 0x0030;
4954 wakaba 1.1 ## Stay in the state.
4955    
4956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957     $self->{line_prev} = $self->{line};
4958     $self->{column_prev} = $self->{column};
4959     $self->{column}++;
4960     $self->{nc}
4961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962     } else {
4963     $self->{set_nc}->($self);
4964     }
4965    
4966     redo A;
4967     } elsif (0x0061 <= $self->{nc} and
4968     $self->{nc} <= 0x0066) { # a..f
4969    
4970 wakaba 1.12 $self->{kwd} *= 0x10;
4971     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4972 wakaba 1.1 ## Stay in the state.
4973    
4974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4975     $self->{line_prev} = $self->{line};
4976     $self->{column_prev} = $self->{column};
4977     $self->{column}++;
4978     $self->{nc}
4979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4980     } else {
4981     $self->{set_nc}->($self);
4982     }
4983    
4984     redo A;
4985     } elsif (0x0041 <= $self->{nc} and
4986     $self->{nc} <= 0x0046) { # A..F
4987    
4988 wakaba 1.12 $self->{kwd} *= 0x10;
4989     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4990 wakaba 1.1 ## Stay in the state.
4991    
4992     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4993     $self->{line_prev} = $self->{line};
4994     $self->{column_prev} = $self->{column};
4995     $self->{column}++;
4996     $self->{nc}
4997     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4998     } else {
4999     $self->{set_nc}->($self);
5000     }
5001    
5002     redo A;
5003     } elsif ($self->{nc} == 0x003B) { # ;
5004    
5005    
5006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5007     $self->{line_prev} = $self->{line};
5008     $self->{column_prev} = $self->{column};
5009     $self->{column}++;
5010     $self->{nc}
5011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5012     } else {
5013     $self->{set_nc}->($self);
5014     }
5015    
5016     #
5017     } else {
5018    
5019     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5020     line => $self->{line},
5021     column => $self->{column});
5022     ## Reconsume.
5023     #
5024     }
5025    
5026 wakaba 1.12 my $code = $self->{kwd};
5027 wakaba 1.1 my $l = $self->{line_prev};
5028     my $c = $self->{column_prev};
5029 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5030     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5031     ($self->{is_xml} and $code == 0x0000)) {
5032 wakaba 1.1
5033     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5034     text => (sprintf 'U+%04X', $code),
5035     line => $l, column => $c);
5036     $code = $charref_map->{$code};
5037     } elsif ($code > 0x10FFFF) {
5038    
5039     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5040     text => (sprintf 'U-%08X', $code),
5041     line => $l, column => $c);
5042     $code = 0xFFFD;
5043     }
5044    
5045     if ($self->{prev_state} == DATA_STATE) {
5046    
5047     $self->{state} = $self->{prev_state};
5048 wakaba 1.5 $self->{s_kwd} = '';
5049 wakaba 1.1 ## Reconsume.
5050     return ({type => CHARACTER_TOKEN, data => chr $code,
5051 wakaba 1.7 has_reference => 1,
5052 wakaba 1.1 line => $l, column => $c,
5053     });
5054     redo A;
5055     } else {
5056    
5057     $self->{ca}->{value} .= chr $code;
5058     $self->{ca}->{has_reference} = 1;
5059     $self->{state} = $self->{prev_state};
5060 wakaba 1.5 $self->{s_kwd} = '';
5061 wakaba 1.1 ## Reconsume.
5062     redo A;
5063     }
5064     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5065 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
5066     $self->{nc} <= 0x005A) or # x
5067     (0x0061 <= $self->{nc} and # a
5068     $self->{nc} <= 0x007A) or # z
5069     (0x0030 <= $self->{nc} and # 0
5070     $self->{nc} <= 0x0039) or # 9
5071 wakaba 1.22 $self->{nc} == 0x003B or # ;
5072     ($self->{is_xml} and
5073     not ($is_space->{$self->{nc}} or
5074     {
5075     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5076     $self->{entity_add} => 1,
5077     }->{$self->{nc}}))) {
5078 wakaba 1.1 our $EntityChar;
5079 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5080 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5081     $self->{ge}->{$self->{kwd}}) {
5082 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5083 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5084     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5085    
5086     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5087     } else {
5088     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5089    
5090     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5091     value => $self->{kwd});
5092     } else {
5093    
5094     }
5095     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5096     }
5097     } else {
5098     if ($self->{is_xml}) {
5099    
5100     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5101     value => $self->{kwd},
5102     level => {
5103     'amp;' => $self->{level}->{warn},
5104     'quot;' => $self->{level}->{warn},
5105     'lt;' => $self->{level}->{warn},
5106     'gt;' => $self->{level}->{warn},
5107     'apos;' => $self->{level}->{warn},
5108     }->{$self->{kwd}} ||
5109     $self->{level}->{must});
5110     } else {
5111    
5112     }
5113     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5114     }
5115 wakaba 1.1 $self->{entity__match} = 1;
5116    
5117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5118     $self->{line_prev} = $self->{line};
5119     $self->{column_prev} = $self->{column};
5120     $self->{column}++;
5121     $self->{nc}
5122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5123     } else {
5124     $self->{set_nc}->($self);
5125     }
5126    
5127     #
5128     } else {
5129    
5130 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5131 wakaba 1.1 $self->{entity__match} = -1;
5132     ## Stay in the state.
5133    
5134     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5135     $self->{line_prev} = $self->{line};
5136     $self->{column_prev} = $self->{column};
5137     $self->{column}++;
5138     $self->{nc}
5139     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5140     } else {
5141     $self->{set_nc}->($self);
5142     }
5143    
5144     redo A;
5145     }
5146     } else {
5147    
5148     $self->{entity__value} .= chr $self->{nc};
5149     $self->{entity__match} *= 2;
5150     ## Stay in the state.
5151    
5152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5153     $self->{line_prev} = $self->{line};
5154     $self->{column_prev} = $self->{column};
5155     $self->{column}++;
5156     $self->{nc}
5157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5158     } else {
5159     $self->{set_nc}->($self);
5160     }
5161    
5162     redo A;
5163     }
5164     }
5165    
5166     my $data;
5167     my $has_ref;
5168     if ($self->{entity__match} > 0) {
5169    
5170     $data = $self->{entity__value};
5171     $has_ref = 1;
5172     #
5173     } elsif ($self->{entity__match} < 0) {
5174     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5175     if ($self->{prev_state} != DATA_STATE and # in attribute
5176     $self->{entity__match} < -1) {
5177    
5178 wakaba 1.12 $data = '&' . $self->{kwd};
5179 wakaba 1.1 #
5180     } else {
5181    
5182     $data = $self->{entity__value};
5183     $has_ref = 1;
5184     #
5185     }
5186     } else {
5187    
5188     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5189     line => $self->{line_prev},
5190 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5191     $data = '&' . $self->{kwd};
5192 wakaba 1.1 #
5193     }
5194    
5195     ## NOTE: In these cases, when a character reference is found,
5196     ## it is consumed and a character token is returned, or, otherwise,
5197     ## nothing is consumed and returned, according to the spec algorithm.
5198     ## In this implementation, anything that has been examined by the
5199     ## tokenizer is appended to the parent element or the attribute value
5200     ## as string, either literal string when no character reference or
5201     ## entity-replaced string otherwise, in this stage, since any characters
5202     ## that would not be consumed are appended in the data state or in an
5203     ## appropriate attribute value state anyway.
5204    
5205     if ($self->{prev_state} == DATA_STATE) {
5206    
5207     $self->{state} = $self->{prev_state};
5208 wakaba 1.5 $self->{s_kwd} = '';
5209 wakaba 1.1 ## Reconsume.
5210     return ({type => CHARACTER_TOKEN,
5211     data => $data,
5212 wakaba 1.7 has_reference => $has_ref,
5213 wakaba 1.1 line => $self->{line_prev},
5214 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5215 wakaba 1.1 });
5216     redo A;
5217     } else {
5218    
5219     $self->{ca}->{value} .= $data;
5220     $self->{ca}->{has_reference} = 1 if $has_ref;
5221     $self->{state} = $self->{prev_state};
5222 wakaba 1.5 $self->{s_kwd} = '';
5223 wakaba 1.1 ## Reconsume.
5224     redo A;
5225     }
5226 wakaba 1.8
5227     ## XML-only states
5228    
5229     } elsif ($self->{state} == PI_STATE) {
5230 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5231    
5232 wakaba 1.8 if ($is_space->{$self->{nc}} or
5233 wakaba 1.14 $self->{nc} == 0x003F or # ?
5234 wakaba 1.8 $self->{nc} == -1) {
5235 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5236     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5237     ## "DOCTYPE pi state": Parse error, switch to the "data
5238     ## state".
5239 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5240     line => $self->{line_prev},
5241     column => $self->{column_prev}
5242     - 1 * ($self->{nc} != -1));
5243     $self->{state} = BOGUS_COMMENT_STATE;
5244     ## Reconsume.
5245     $self->{ct} = {type => COMMENT_TOKEN,
5246     data => '?',
5247     line => $self->{line_prev},
5248     column => $self->{column_prev}
5249     - 1 * ($self->{nc} != -1),
5250     };
5251     redo A;
5252     } else {
5253 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5254 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5255     target => chr $self->{nc},
5256     data => '',
5257     line => $self->{line_prev},
5258     column => $self->{column_prev} - 1,
5259     };
5260     $self->{state} = PI_TARGET_STATE;
5261    
5262     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5263     $self->{line_prev} = $self->{line};
5264     $self->{column_prev} = $self->{column};
5265     $self->{column}++;
5266     $self->{nc}
5267     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5268     } else {
5269     $self->{set_nc}->($self);
5270     }
5271    
5272     redo A;
5273     }
5274     } elsif ($self->{state} == PI_TARGET_STATE) {
5275     if ($is_space->{$self->{nc}}) {
5276     $self->{state} = PI_TARGET_AFTER_STATE;
5277    
5278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5279     $self->{line_prev} = $self->{line};
5280     $self->{column_prev} = $self->{column};
5281     $self->{column}++;
5282     $self->{nc}
5283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5284     } else {
5285     $self->{set_nc}->($self);
5286     }
5287    
5288     redo A;
5289     } elsif ($self->{nc} == -1) {
5290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5291 wakaba 1.13 if ($self->{in_subset}) {
5292     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5293     } else {
5294     $self->{state} = DATA_STATE;
5295     $self->{s_kwd} = '';
5296     }
5297 wakaba 1.8 ## Reconsume.
5298     return ($self->{ct}); # pi
5299     redo A;
5300     } elsif ($self->{nc} == 0x003F) { # ?
5301     $self->{state} = PI_AFTER_STATE;
5302    
5303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5304     $self->{line_prev} = $self->{line};
5305     $self->{column_prev} = $self->{column};
5306     $self->{column}++;
5307     $self->{nc}
5308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5309     } else {
5310     $self->{set_nc}->($self);
5311     }
5312    
5313     redo A;
5314     } else {
5315     ## XML5: typo ("tag name" -> "target")
5316     $self->{ct}->{target} .= chr $self->{nc}; # pi
5317    
5318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319     $self->{line_prev} = $self->{line};
5320     $self->{column_prev} = $self->{column};
5321     $self->{column}++;
5322     $self->{nc}
5323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324     } else {
5325     $self->{set_nc}->($self);
5326     }
5327    
5328     redo A;
5329     }
5330     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5331     if ($is_space->{$self->{nc}}) {
5332     ## Stay in the state.
5333    
5334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5335     $self->{line_prev} = $self->{line};
5336     $self->{column_prev} = $self->{column};
5337     $self->{column}++;
5338     $self->{nc}
5339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5340     } else {
5341     $self->{set_nc}->($self);
5342     }
5343    
5344     redo A;
5345     } else {
5346     $self->{state} = PI_DATA_STATE;
5347     ## Reprocess.
5348     redo A;
5349     }
5350     } elsif ($self->{state} == PI_DATA_STATE) {
5351     if ($self->{nc} == 0x003F) { # ?
5352     $self->{state} = PI_DATA_AFTER_STATE;
5353    
5354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5355     $self->{line_prev} = $self->{line};
5356     $self->{column_prev} = $self->{column};
5357     $self->{column}++;
5358     $self->{nc}
5359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5360     } else {
5361     $self->{set_nc}->($self);
5362     }
5363    
5364     redo A;
5365     } elsif ($self->{nc} == -1) {
5366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5367 wakaba 1.13 if ($self->{in_subset}) {
5368 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5369 wakaba 1.13 } else {
5370     $self->{state} = DATA_STATE;
5371     $self->{s_kwd} = '';
5372     }
5373 wakaba 1.8 ## Reprocess.
5374     return ($self->{ct}); # pi
5375     redo A;
5376     } else {
5377     $self->{ct}->{data} .= chr $self->{nc}; # pi
5378     $self->{read_until}->($self->{ct}->{data}, q[?],
5379     length $self->{ct}->{data});
5380     ## Stay in the state.
5381    
5382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5383     $self->{line_prev} = $self->{line};
5384     $self->{column_prev} = $self->{column};
5385     $self->{column}++;
5386     $self->{nc}
5387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5388     } else {
5389     $self->{set_nc}->($self);
5390     }
5391    
5392     ## Reprocess.
5393     redo A;
5394     }
5395     } elsif ($self->{state} == PI_AFTER_STATE) {
5396 wakaba 1.14 ## XML5: Part of "Pi after state".
5397    
5398 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5399 wakaba 1.13 if ($self->{in_subset}) {
5400     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5401     } else {
5402     $self->{state} = DATA_STATE;
5403     $self->{s_kwd} = '';
5404     }
5405 wakaba 1.8
5406     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5407     $self->{line_prev} = $self->{line};
5408     $self->{column_prev} = $self->{column};
5409     $self->{column}++;
5410     $self->{nc}
5411     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5412     } else {
5413     $self->{set_nc}->($self);
5414     }
5415    
5416     return ($self->{ct}); # pi
5417     redo A;
5418     } elsif ($self->{nc} == 0x003F) { # ?
5419     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5420     line => $self->{line_prev},
5421     column => $self->{column_prev}); ## XML5: no error
5422     $self->{ct}->{data} .= '?';
5423     $self->{state} = PI_DATA_AFTER_STATE;
5424    
5425     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5426     $self->{line_prev} = $self->{line};
5427     $self->{column_prev} = $self->{column};
5428     $self->{column}++;
5429     $self->{nc}
5430     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5431     } else {
5432     $self->{set_nc}->($self);
5433     }
5434    
5435     redo A;
5436     } else {
5437     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5438     line => $self->{line_prev},
5439     column => $self->{column_prev}
5440     + 1 * ($self->{nc} == -1)); ## XML5: no error
5441     $self->{ct}->{data} .= '?'; ## XML5: not appended
5442     $self->{state} = PI_DATA_STATE;
5443     ## Reprocess.
5444     redo A;
5445     }
5446     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5447 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5448    
5449 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5450 wakaba 1.13 if ($self->{in_subset}) {
5451     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5452     } else {
5453     $self->{state} = DATA_STATE;
5454     $self->{s_kwd} = '';
5455     }
5456 wakaba 1.8
5457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458     $self->{line_prev} = $self->{line};
5459     $self->{column_prev} = $self->{column};
5460     $self->{column}++;
5461     $self->{nc}
5462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463     } else {
5464     $self->{set_nc}->($self);
5465     }
5466    
5467     return ($self->{ct}); # pi
5468     redo A;
5469     } elsif ($self->{nc} == 0x003F) { # ?
5470     $self->{ct}->{data} .= '?';
5471     ## Stay in the state.
5472    
5473     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474     $self->{line_prev} = $self->{line};
5475     $self->{column_prev} = $self->{column};
5476     $self->{column}++;
5477     $self->{nc}
5478     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479     } else {
5480     $self->{set_nc}->($self);
5481     }
5482    
5483     redo A;
5484     } else {
5485     $self->{ct}->{data} .= '?'; ## XML5: not appended
5486     $self->{state} = PI_DATA_STATE;
5487     ## Reprocess.
5488     redo A;
5489     }
5490 wakaba 1.12
5491     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5492     if ($self->{nc} == 0x003C) { # <
5493 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5494 wakaba 1.12
5495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5496     $self->{line_prev} = $self->{line};
5497     $self->{column_prev} = $self->{column};
5498     $self->{column}++;
5499     $self->{nc}
5500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5501     } else {
5502     $self->{set_nc}->($self);
5503     }
5504    
5505     redo A;
5506     } elsif ($self->{nc} == 0x0025) { # %
5507     ## XML5: Not defined yet.
5508    
5509     ## TODO:
5510 wakaba 1.24
5511     if (not $self->{stop_processing} and
5512     not $self->{document}->xml_standalone) {
5513     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5514     level => $self->{level}->{info});
5515     $self->{stop_processing} = 1;
5516     }
5517    
5518 wakaba 1.12
5519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5520     $self->{line_prev} = $self->{line};
5521     $self->{column_prev} = $self->{column};
5522     $self->{column}++;
5523     $self->{nc}
5524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5525     } else {
5526     $self->{set_nc}->($self);
5527     }
5528    
5529     redo A;
5530     } elsif ($self->{nc} == 0x005D) { # ]
5531 wakaba 1.13 delete $self->{in_subset};
5532 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5533    
5534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5535     $self->{line_prev} = $self->{line};
5536     $self->{column_prev} = $self->{column};
5537     $self->{column}++;
5538     $self->{nc}
5539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5540     } else {
5541     $self->{set_nc}->($self);
5542     }
5543    
5544     redo A;
5545     } elsif ($is_space->{$self->{nc}}) {
5546     ## Stay in the state.
5547    
5548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5549     $self->{line_prev} = $self->{line};
5550     $self->{column_prev} = $self->{column};
5551     $self->{column}++;
5552     $self->{nc}
5553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5554     } else {
5555     $self->{set_nc}->($self);
5556     }
5557    
5558     redo A;
5559     } elsif ($self->{nc} == -1) {
5560     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5561 wakaba 1.13 delete $self->{in_subset};
5562 wakaba 1.12 $self->{state} = DATA_STATE;
5563     $self->{s_kwd} = '';
5564     ## Reconsume.
5565 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5566 wakaba 1.12 redo A;
5567     } else {
5568     unless ($self->{internal_subset_tainted}) {
5569     ## XML5: No parse error.
5570     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5571     $self->{internal_subset_tainted} = 1;
5572     }
5573     ## Stay in the state.
5574    
5575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5576     $self->{line_prev} = $self->{line};
5577     $self->{column_prev} = $self->{column};
5578     $self->{column}++;
5579     $self->{nc}
5580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5581     } else {
5582     $self->{set_nc}->($self);
5583     }
5584    
5585     redo A;
5586     }
5587     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5588     if ($self->{nc} == 0x003E) { # >
5589     $self->{state} = DATA_STATE;
5590     $self->{s_kwd} = '';
5591    
5592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5593     $self->{line_prev} = $self->{line};
5594     $self->{column_prev} = $self->{column};
5595     $self->{column}++;
5596     $self->{nc}
5597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5598     } else {
5599     $self->{set_nc}->($self);
5600     }
5601    
5602 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5603 wakaba 1.12 redo A;
5604     } elsif ($self->{nc} == -1) {
5605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5606     $self->{state} = DATA_STATE;
5607     $self->{s_kwd} = '';
5608     ## Reconsume.
5609 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5610 wakaba 1.12 redo A;
5611     } else {
5612     ## XML5: No parse error and stay in the state.
5613     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5614    
5615 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5616    
5617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5618     $self->{line_prev} = $self->{line};
5619     $self->{column_prev} = $self->{column};
5620     $self->{column}++;
5621     $self->{nc}
5622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5623     } else {
5624     $self->{set_nc}->($self);
5625     }
5626    
5627     redo A;
5628     }
5629     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5630     if ($self->{nc} == 0x003E) { # >
5631     $self->{state} = DATA_STATE;
5632     $self->{s_kwd} = '';
5633    
5634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5635     $self->{line_prev} = $self->{line};
5636     $self->{column_prev} = $self->{column};
5637     $self->{column}++;
5638     $self->{nc}
5639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5640     } else {
5641     $self->{set_nc}->($self);
5642     }
5643    
5644     return ({type => END_OF_DOCTYPE_TOKEN});
5645     redo A;
5646     } elsif ($self->{nc} == -1) {
5647     $self->{state} = DATA_STATE;
5648     $self->{s_kwd} = '';
5649     ## Reconsume.
5650     return ({type => END_OF_DOCTYPE_TOKEN});
5651     redo A;
5652     } else {
5653     ## Stay in the state.
5654    
5655     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5656     $self->{line_prev} = $self->{line};
5657     $self->{column_prev} = $self->{column};
5658     $self->{column}++;
5659     $self->{nc}
5660     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5661     } else {
5662     $self->{set_nc}->($self);
5663     }
5664    
5665     redo A;
5666     }
5667     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5668     if ($self->{nc} == 0x0021) { # !
5669 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5670 wakaba 1.13
5671     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5672     $self->{line_prev} = $self->{line};
5673     $self->{column_prev} = $self->{column};
5674     $self->{column}++;
5675     $self->{nc}
5676     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5677     } else {
5678     $self->{set_nc}->($self);
5679     }
5680    
5681     redo A;
5682     } elsif ($self->{nc} == 0x003F) { # ?
5683     $self->{state} = PI_STATE;
5684    
5685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5686     $self->{line_prev} = $self->{line};
5687     $self->{column_prev} = $self->{column};
5688     $self->{column}++;
5689     $self->{nc}
5690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5691     } else {
5692     $self->{set_nc}->($self);
5693     }
5694    
5695     redo A;
5696     } elsif ($self->{nc} == -1) {
5697     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5698     $self->{state} = DATA_STATE;
5699     $self->{s_kwd} = '';
5700     ## Reconsume.
5701     redo A;
5702     } else {
5703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5704     line => $self->{line_prev},
5705     column => $self->{column_prev});
5706     $self->{state} = BOGUS_COMMENT_STATE;
5707     $self->{ct} = {type => COMMENT_TOKEN,
5708     data => '',
5709     }; ## NOTE: Will be discarded.
5710 wakaba 1.12
5711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5712     $self->{line_prev} = $self->{line};
5713     $self->{column_prev} = $self->{column};
5714     $self->{column}++;
5715     $self->{nc}
5716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5717     } else {
5718     $self->{set_nc}->($self);
5719     }
5720    
5721     redo A;
5722     }
5723 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5724     ## XML5: "DOCTYPE markup declaration state".
5725    
5726     if ($self->{nc} == 0x002D) { # -
5727     $self->{state} = MD_HYPHEN_STATE;
5728    
5729     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5730     $self->{line_prev} = $self->{line};
5731     $self->{column_prev} = $self->{column};
5732     $self->{column}++;
5733     $self->{nc}
5734     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5735     } else {
5736     $self->{set_nc}->($self);
5737     }
5738    
5739     redo A;
5740 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5741     $self->{nc} == 0x0065) { # e
5742 wakaba 1.14 $self->{state} = MD_E_STATE;
5743     $self->{kwd} = chr $self->{nc};
5744    
5745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746     $self->{line_prev} = $self->{line};
5747     $self->{column_prev} = $self->{column};
5748     $self->{column}++;
5749     $self->{nc}
5750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751     } else {
5752     $self->{set_nc}->($self);
5753     }
5754    
5755     redo A;
5756 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5757     $self->{nc} == 0x0061) { # a
5758 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5759     $self->{kwd} = chr $self->{nc};
5760    
5761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5762     $self->{line_prev} = $self->{line};
5763     $self->{column_prev} = $self->{column};
5764     $self->{column}++;
5765     $self->{nc}
5766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5767     } else {
5768     $self->{set_nc}->($self);
5769     }
5770    
5771     redo A;
5772 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5773     $self->{nc} == 0x006E) { # n
5774 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5775     $self->{kwd} = chr $self->{nc};
5776    
5777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778     $self->{line_prev} = $self->{line};
5779     $self->{column_prev} = $self->{column};
5780     $self->{column}++;
5781     $self->{nc}
5782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783     } else {
5784     $self->{set_nc}->($self);
5785     }
5786    
5787     redo A;
5788     } else {
5789     #
5790     }
5791    
5792     ## XML5: No parse error.
5793     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5794     line => $self->{line_prev},
5795     column => $self->{column_prev} - 1);
5796     ## Reconsume.
5797     $self->{state} = BOGUS_COMMENT_STATE;
5798     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5799     redo A;
5800     } elsif ($self->{state} == MD_E_STATE) {
5801 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5802     $self->{nc} == 0x006E) { # n
5803 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5804     $self->{kwd} .= chr $self->{nc};
5805    
5806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5807     $self->{line_prev} = $self->{line};
5808     $self->{column_prev} = $self->{column};
5809     $self->{column}++;
5810     $self->{nc}
5811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5812     } else {
5813     $self->{set_nc}->($self);
5814     }
5815    
5816     redo A;
5817 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5818     $self->{nc} == 0x006C) { # l
5819 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5820     $self->{state} = MD_ELEMENT_STATE;
5821     $self->{kwd} .= chr $self->{nc};
5822    
5823     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824     $self->{line_prev} = $self->{line};
5825     $self->{column_prev} = $self->{column};
5826     $self->{column}++;
5827     $self->{nc}
5828     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829     } else {
5830     $self->{set_nc}->($self);
5831     }
5832    
5833     redo A;
5834     } else {
5835     ## XML5: No parse error.
5836     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5837     line => $self->{line_prev},
5838     column => $self->{column_prev} - 2
5839     + 1 * ($self->{nc} == -1));
5840     ## Reconsume.
5841     $self->{state} = BOGUS_COMMENT_STATE;
5842     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5843     redo A;
5844     }
5845     } elsif ($self->{state} == MD_ENTITY_STATE) {
5846 wakaba 1.17 if ($self->{nc} == [
5847     undef,
5848     undef,
5849     0x0054, # T
5850     0x0049, # I
5851     0x0054, # T
5852     ]->[length $self->{kwd}] or
5853     $self->{nc} == [
5854     undef,
5855     undef,
5856     0x0074, # t
5857     0x0069, # i
5858     0x0074, # t
5859     ]->[length $self->{kwd}]) {
5860 wakaba 1.14 ## Stay in the state.
5861     $self->{kwd} .= chr $self->{nc};
5862    
5863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5864     $self->{line_prev} = $self->{line};
5865     $self->{column_prev} = $self->{column};
5866     $self->{column}++;
5867     $self->{nc}
5868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5869     } else {
5870     $self->{set_nc}->($self);
5871     }
5872    
5873     redo A;
5874 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5875     ($self->{nc} == 0x0059 or # Y
5876     $self->{nc} == 0x0079)) { # y
5877     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5878     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5879     text => 'ENTITY',
5880     line => $self->{line_prev},
5881     column => $self->{column_prev} - 4);
5882     }
5883     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5884 wakaba 1.14 line => $self->{line_prev},
5885     column => $self->{column_prev} - 6};
5886     $self->{state} = DOCTYPE_MD_STATE;
5887    
5888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5889     $self->{line_prev} = $self->{line};
5890     $self->{column_prev} = $self->{column};
5891     $self->{column}++;
5892     $self->{nc}
5893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5894     } else {
5895     $self->{set_nc}->($self);
5896     }
5897    
5898     redo A;
5899     } else {
5900     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5901     line => $self->{line_prev},
5902     column => $self->{column_prev} - 1
5903     - (length $self->{kwd})
5904     + 1 * ($self->{nc} == -1));
5905     $self->{state} = BOGUS_COMMENT_STATE;
5906     ## Reconsume.
5907     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5908     redo A;
5909     }
5910     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5911 wakaba 1.17 if ($self->{nc} == [
5912     undef,
5913     undef,
5914     0x0045, # E
5915     0x004D, # M
5916     0x0045, # E
5917     0x004E, # N
5918     ]->[length $self->{kwd}] or
5919     $self->{nc} == [
5920     undef,
5921     undef,
5922     0x0065, # e
5923     0x006D, # m
5924     0x0065, # e
5925     0x006E, # n
5926     ]->[length $self->{kwd}]) {
5927 wakaba 1.14 ## Stay in the state.
5928     $self->{kwd} .= chr $self->{nc};
5929    
5930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5931     $self->{line_prev} = $self->{line};
5932     $self->{column_prev} = $self->{column};
5933     $self->{column}++;
5934     $self->{nc}
5935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5936     } else {
5937     $self->{set_nc}->($self);
5938     }
5939    
5940     redo A;
5941 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5942     ($self->{nc} == 0x0054 or # T
5943     $self->{nc} == 0x0074)) { # t
5944     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5945     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5946     text => 'ELEMENT',
5947     line => $self->{line_prev},
5948     column => $self->{column_prev} - 5);
5949     }
5950 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5951     line => $self->{line_prev},
5952 wakaba 1.23 column => $self->{column_prev} - 7};
5953 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5954    
5955     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5956     $self->{line_prev} = $self->{line};
5957     $self->{column_prev} = $self->{column};
5958     $self->{column}++;
5959     $self->{nc}
5960     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5961     } else {
5962     $self->{set_nc}->($self);
5963     }
5964    
5965     redo A;
5966     } else {
5967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5968     line => $self->{line_prev},
5969     column => $self->{column_prev} - 1
5970     - (length $self->{kwd})
5971     + 1 * ($self->{nc} == -1));
5972     $self->{state} = BOGUS_COMMENT_STATE;
5973     ## Reconsume.
5974     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5975     redo A;
5976     }
5977     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5978 wakaba 1.17 if ($self->{nc} == [
5979     undef,
5980     0x0054, # T
5981     0x0054, # T
5982     0x004C, # L
5983     0x0049, # I
5984     0x0053, # S
5985     ]->[length $self->{kwd}] or
5986     $self->{nc} == [
5987     undef,
5988     0x0074, # t
5989     0x0074, # t
5990     0x006C, # l
5991     0x0069, # i
5992     0x0073, # s
5993     ]->[length $self->{kwd}]) {
5994 wakaba 1.14 ## Stay in the state.
5995     $self->{kwd} .= chr $self->{nc};
5996    
5997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5998     $self->{line_prev} = $self->{line};
5999     $self->{column_prev} = $self->{column};
6000     $self->{column}++;
6001     $self->{nc}
6002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6003     } else {
6004     $self->{set_nc}->($self);
6005     }
6006    
6007     redo A;
6008 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6009     ($self->{nc} == 0x0054 or # T
6010     $self->{nc} == 0x0074)) { # t
6011     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6013     text => 'ATTLIST',
6014     line => $self->{line_prev},
6015     column => $self->{column_prev} - 5);
6016     }
6017 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6018 wakaba 1.15 attrdefs => [],
6019 wakaba 1.14 line => $self->{line_prev},
6020 wakaba 1.23 column => $self->{column_prev} - 7};
6021 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6022    
6023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6024     $self->{line_prev} = $self->{line};
6025     $self->{column_prev} = $self->{column};
6026     $self->{column}++;
6027     $self->{nc}
6028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6029     } else {
6030     $self->{set_nc}->($self);
6031     }
6032    
6033     redo A;
6034     } else {
6035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6036     line => $self->{line_prev},
6037     column => $self->{column_prev} - 1
6038     - (length $self->{kwd})
6039     + 1 * ($self->{nc} == -1));
6040     $self->{state} = BOGUS_COMMENT_STATE;
6041     ## Reconsume.
6042     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6043     redo A;
6044     }
6045     } elsif ($self->{state} == MD_NOTATION_STATE) {
6046 wakaba 1.17 if ($self->{nc} == [
6047     undef,
6048     0x004F, # O
6049     0x0054, # T
6050     0x0041, # A
6051     0x0054, # T
6052     0x0049, # I
6053     0x004F, # O
6054     ]->[length $self->{kwd}] or
6055     $self->{nc} == [
6056     undef,
6057     0x006F, # o
6058     0x0074, # t
6059     0x0061, # a
6060     0x0074, # t
6061     0x0069, # i
6062     0x006F, # o
6063     ]->[length $self->{kwd}]) {
6064 wakaba 1.14 ## Stay in the state.
6065     $self->{kwd} .= chr $self->{nc};
6066    
6067     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6068     $self->{line_prev} = $self->{line};
6069     $self->{column_prev} = $self->{column};
6070     $self->{column}++;
6071     $self->{nc}
6072     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6073     } else {
6074     $self->{set_nc}->($self);
6075     }
6076    
6077     redo A;
6078 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6079     ($self->{nc} == 0x004E or # N
6080     $self->{nc} == 0x006E)) { # n
6081     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6082     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6083     text => 'NOTATION',
6084     line => $self->{line_prev},
6085     column => $self->{column_prev} - 6);
6086     }
6087 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6088     line => $self->{line_prev},
6089 wakaba 1.23 column => $self->{column_prev} - 8};
6090 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6091    
6092     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6093     $self->{line_prev} = $self->{line};
6094     $self->{column_prev} = $self->{column};
6095     $self->{column}++;
6096     $self->{nc}
6097     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6098     } else {
6099     $self->{set_nc}->($self);
6100     }
6101    
6102     redo A;
6103     } else {
6104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6105     line => $self->{line_prev},
6106     column => $self->{column_prev} - 1
6107     - (length $self->{kwd})
6108     + 1 * ($self->{nc} == -1));
6109     $self->{state} = BOGUS_COMMENT_STATE;
6110     ## Reconsume.
6111     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6112     redo A;
6113     }
6114     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6115     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6116     ## "DOCTYPE NOTATION state".
6117    
6118     if ($is_space->{$self->{nc}}) {
6119     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6120     $self->{state} = BEFORE_MD_NAME_STATE;
6121    
6122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6123     $self->{line_prev} = $self->{line};
6124     $self->{column_prev} = $self->{column};
6125     $self->{column}++;
6126     $self->{nc}
6127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6128     } else {
6129     $self->{set_nc}->($self);
6130     }
6131    
6132     redo A;
6133     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6134     $self->{nc} == 0x0025) { # %
6135     ## XML5: Switch to the "DOCTYPE bogus comment state".
6136     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6137     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6138    
6139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140     $self->{line_prev} = $self->{line};
6141     $self->{column_prev} = $self->{column};
6142     $self->{column}++;
6143     $self->{nc}
6144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145     } else {
6146     $self->{set_nc}->($self);
6147     }
6148    
6149     redo A;
6150     } elsif ($self->{nc} == -1) {
6151     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6152     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6153     ## Reconsume.
6154     redo A;
6155     } elsif ($self->{nc} == 0x003E) { # >
6156     ## XML5: Switch to the "DOCTYPE bogus comment state".
6157     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6158     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6159    
6160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6161     $self->{line_prev} = $self->{line};
6162     $self->{column_prev} = $self->{column};
6163     $self->{column}++;
6164     $self->{nc}
6165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6166     } else {
6167     $self->{set_nc}->($self);
6168     }
6169    
6170     redo A;
6171     } else {
6172     ## XML5: Switch to the "DOCTYPE bogus comment state".
6173     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6174     $self->{state} = BEFORE_MD_NAME_STATE;
6175     redo A;
6176     }
6177     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6178     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6179     ## before state", "DOCTYPE ATTLIST name before state".
6180    
6181     if ($is_space->{$self->{nc}}) {
6182     ## Stay in the state.
6183    
6184     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6185     $self->{line_prev} = $self->{line};
6186     $self->{column_prev} = $self->{column};
6187     $self->{column}++;
6188     $self->{nc}
6189     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6190     } else {
6191     $self->{set_nc}->($self);
6192     }
6193    
6194     redo A;
6195     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6196     $self->{nc} == 0x0025) { # %
6197     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6198    
6199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6200     $self->{line_prev} = $self->{line};
6201     $self->{column_prev} = $self->{column};
6202     $self->{column}++;
6203     $self->{nc}
6204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6205     } else {
6206     $self->{set_nc}->($self);
6207     }
6208    
6209     redo A;
6210     } elsif ($self->{nc} == 0x003E) { # >
6211     ## XML5: Same as "Anything else".
6212     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6213     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6214    
6215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6216     $self->{line_prev} = $self->{line};
6217     $self->{column_prev} = $self->{column};
6218     $self->{column}++;
6219     $self->{nc}
6220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6221     } else {
6222     $self->{set_nc}->($self);
6223     }
6224    
6225     redo A;
6226     } elsif ($self->{nc} == -1) {
6227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6228     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6229     ## Reconsume.
6230     redo A;
6231     } else {
6232     ## XML5: [ATTLIST] Not defined yet.
6233     $self->{ct}->{name} .= chr $self->{nc};
6234     $self->{state} = MD_NAME_STATE;
6235    
6236     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6237     $self->{line_prev} = $self->{line};
6238     $self->{column_prev} = $self->{column};
6239     $self->{column}++;
6240     $self->{nc}
6241     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6242     } else {
6243     $self->{set_nc}->($self);
6244     }
6245    
6246     redo A;
6247     }
6248     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6249     if ($is_space->{$self->{nc}}) {
6250     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6251     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6252     $self->{state} = BEFORE_MD_NAME_STATE;
6253 wakaba 1.8
6254 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6255     $self->{line_prev} = $self->{line};
6256     $self->{column_prev} = $self->{column};
6257     $self->{column}++;
6258     $self->{nc}
6259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6260     } else {
6261     $self->{set_nc}->($self);
6262     }
6263    
6264     redo A;
6265     } elsif ($self->{nc} == 0x003E) { # >
6266     ## XML5: Same as "Anything else".
6267     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6268     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6269    
6270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6271     $self->{line_prev} = $self->{line};
6272     $self->{column_prev} = $self->{column};
6273     $self->{column}++;
6274     $self->{nc}
6275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6276     } else {
6277     $self->{set_nc}->($self);
6278     }
6279    
6280     redo A;
6281     } elsif ($self->{nc} == -1) {
6282     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6283     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6284     ## Reconsume.
6285     redo A;
6286     } else {
6287     ## XML5: No parse error.
6288     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6289     $self->{state} = BOGUS_COMMENT_STATE;
6290     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6291     ## Reconsume.
6292     redo A;
6293     }
6294     } elsif ($self->{state} == MD_NAME_STATE) {
6295     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6296    
6297     if ($is_space->{$self->{nc}}) {
6298 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6299     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6300     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6301 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6302 wakaba 1.16 } else { # ENTITY/NOTATION
6303     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6304     }
6305 wakaba 1.14
6306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6307     $self->{line_prev} = $self->{line};
6308     $self->{column_prev} = $self->{column};
6309     $self->{column}++;
6310     $self->{nc}
6311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6312     } else {
6313     $self->{set_nc}->($self);
6314     }
6315    
6316     redo A;
6317     } elsif ($self->{nc} == 0x003E) { # >
6318     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6319     #
6320     } else {
6321 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6322 wakaba 1.14 }
6323     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6324    
6325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6326     $self->{line_prev} = $self->{line};
6327     $self->{column_prev} = $self->{column};
6328     $self->{column}++;
6329     $self->{nc}
6330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6331     } else {
6332     $self->{set_nc}->($self);
6333     }
6334    
6335     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6336     redo A;
6337     } elsif ($self->{nc} == -1) {
6338     ## XML5: [ATTLIST] No parse error.
6339     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6340     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6341     ## Reconsume.
6342     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6343     redo A;
6344     } else {
6345     ## XML5: [ATTLIST] Not defined yet.
6346     $self->{ct}->{name} .= chr $self->{nc};
6347     ## Stay in the state.
6348    
6349     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6350     $self->{line_prev} = $self->{line};
6351     $self->{column_prev} = $self->{column};
6352     $self->{column}++;
6353     $self->{nc}
6354     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6355     } else {
6356     $self->{set_nc}->($self);
6357     }
6358    
6359     redo A;
6360     }
6361     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6362     if ($is_space->{$self->{nc}}) {
6363     ## Stay in the state.
6364    
6365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6366     $self->{line_prev} = $self->{line};
6367     $self->{column_prev} = $self->{column};
6368     $self->{column}++;
6369     $self->{nc}
6370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6371     } else {
6372     $self->{set_nc}->($self);
6373     }
6374    
6375     redo A;
6376     } elsif ($self->{nc} == 0x003E) { # >
6377     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6378    
6379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6380     $self->{line_prev} = $self->{line};
6381     $self->{column_prev} = $self->{column};
6382     $self->{column}++;
6383     $self->{nc}
6384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6385     } else {
6386     $self->{set_nc}->($self);
6387     }
6388    
6389     return ($self->{ct}); # ATTLIST
6390     redo A;
6391     } elsif ($self->{nc} == -1) {
6392     ## XML5: No parse error.
6393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6394     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6395 wakaba 1.15 return ($self->{ct});
6396 wakaba 1.14 redo A;
6397     } else {
6398     ## XML5: Not defined yet.
6399 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6400     tokens => [],
6401     line => $self->{line}, column => $self->{column}};
6402     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6403    
6404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6405     $self->{line_prev} = $self->{line};
6406     $self->{column_prev} = $self->{column};
6407     $self->{column}++;
6408     $self->{nc}
6409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6410     } else {
6411     $self->{set_nc}->($self);
6412     }
6413    
6414     redo A;
6415     }
6416     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6417     if ($is_space->{$self->{nc}}) {
6418     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6419    
6420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6421     $self->{line_prev} = $self->{line};
6422     $self->{column_prev} = $self->{column};
6423     $self->{column}++;
6424     $self->{nc}
6425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6426     } else {
6427     $self->{set_nc}->($self);
6428     }
6429    
6430     redo A;
6431     } elsif ($self->{nc} == 0x003E) { # >
6432     ## XML5: Same as "anything else".
6433     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6434     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6435    
6436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6437     $self->{line_prev} = $self->{line};
6438     $self->{column_prev} = $self->{column};
6439     $self->{column}++;
6440     $self->{nc}
6441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6442     } else {
6443     $self->{set_nc}->($self);
6444     }
6445    
6446     return ($self->{ct}); # ATTLIST
6447     redo A;
6448     } elsif ($self->{nc} == 0x0028) { # (
6449     ## XML5: Same as "anything else".
6450     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6451     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6452    
6453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454     $self->{line_prev} = $self->{line};
6455     $self->{column_prev} = $self->{column};
6456     $self->{column}++;
6457     $self->{nc}
6458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459     } else {
6460     $self->{set_nc}->($self);
6461     }
6462    
6463     redo A;
6464     } elsif ($self->{nc} == -1) {
6465     ## XML5: No parse error.
6466     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6467     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6468    
6469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470     $self->{line_prev} = $self->{line};
6471     $self->{column_prev} = $self->{column};
6472     $self->{column}++;
6473     $self->{nc}
6474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475     } else {
6476     $self->{set_nc}->($self);
6477     }
6478    
6479     return ($self->{ct}); # ATTLIST
6480     redo A;
6481     } else {
6482     ## XML5: Not defined yet.
6483     $self->{ca}->{name} .= chr $self->{nc};
6484     ## Stay in the state.
6485    
6486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6487     $self->{line_prev} = $self->{line};
6488     $self->{column_prev} = $self->{column};
6489     $self->{column}++;
6490     $self->{nc}
6491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6492     } else {
6493     $self->{set_nc}->($self);
6494     }
6495    
6496 wakaba 1.14 redo A;
6497     }
6498 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6499     if ($is_space->{$self->{nc}}) {
6500     ## Stay in the state.
6501    
6502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503     $self->{line_prev} = $self->{line};
6504     $self->{column_prev} = $self->{column};
6505     $self->{column}++;
6506     $self->{nc}
6507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508     } else {
6509     $self->{set_nc}->($self);
6510     }
6511    
6512     redo A;
6513     } elsif ($self->{nc} == 0x003E) { # >
6514     ## XML5: Same as "anything else".
6515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6516     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6517    
6518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6519     $self->{line_prev} = $self->{line};
6520     $self->{column_prev} = $self->{column};
6521     $self->{column}++;
6522     $self->{nc}
6523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6524     } else {
6525     $self->{set_nc}->($self);
6526     }
6527    
6528     return ($self->{ct}); # ATTLIST
6529     redo A;
6530     } elsif ($self->{nc} == 0x0028) { # (
6531     ## XML5: Same as "anything else".
6532     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6533    
6534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6535     $self->{line_prev} = $self->{line};
6536     $self->{column_prev} = $self->{column};
6537     $self->{column}++;
6538     $self->{nc}
6539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6540     } else {
6541     $self->{set_nc}->($self);
6542     }
6543    
6544     redo A;
6545     } elsif ($self->{nc} == -1) {
6546     ## XML5: No parse error.
6547     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6548     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6549    
6550     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6551     $self->{line_prev} = $self->{line};
6552     $self->{column_prev} = $self->{column};
6553     $self->{column}++;
6554     $self->{nc}
6555     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6556     } else {
6557     $self->{set_nc}->($self);
6558     }
6559    
6560     return ($self->{ct});
6561     redo A;
6562     } else {
6563     ## XML5: Not defined yet.
6564     $self->{ca}->{type} = chr $self->{nc};
6565     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6566    
6567     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6568     $self->{line_prev} = $self->{line};
6569     $self->{column_prev} = $self->{column};
6570     $self->{column}++;
6571     $self->{nc}
6572     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6573     } else {
6574     $self->{set_nc}->($self);
6575     }
6576    
6577     redo A;
6578     }
6579     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6580     if ($is_space->{$self->{nc}}) {
6581     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6582    
6583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6584     $self->{line_prev} = $self->{line};
6585     $self->{column_prev} = $self->{column};
6586     $self->{column}++;
6587     $self->{nc}
6588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6589     } else {
6590     $self->{set_nc}->($self);
6591     }
6592    
6593     redo A;
6594     } elsif ($self->{nc} == 0x0023) { # #
6595     ## XML5: Same as "anything else".
6596     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6597     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6598    
6599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6600     $self->{line_prev} = $self->{line};
6601     $self->{column_prev} = $self->{column};
6602     $self->{column}++;
6603     $self->{nc}
6604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6605     } else {
6606     $self->{set_nc}->($self);
6607     }
6608    
6609     redo A;
6610     } elsif ($self->{nc} == 0x0022) { # "
6611     ## XML5: Same as "anything else".
6612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6613     $self->{ca}->{value} = '';
6614     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6615    
6616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617     $self->{line_prev} = $self->{line};
6618     $self->{column_prev} = $self->{column};
6619     $self->{column}++;
6620     $self->{nc}
6621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622     } else {
6623     $self->{set_nc}->($self);
6624     }
6625    
6626     redo A;
6627     } elsif ($self->{nc} == 0x0027) { # '
6628     ## XML5: Same as "anything else".
6629     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6630     $self->{ca}->{value} = '';
6631     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6632    
6633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6634     $self->{line_prev} = $self->{line};
6635     $self->{column_prev} = $self->{column};
6636     $self->{column}++;
6637     $self->{nc}
6638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6639     } else {
6640     $self->{set_nc}->($self);
6641     }
6642    
6643     redo A;
6644     } elsif ($self->{nc} == 0x003E) { # >
6645     ## XML5: Same as "anything else".
6646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6647     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6648    
6649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6650     $self->{line_prev} = $self->{line};
6651     $self->{column_prev} = $self->{column};
6652     $self->{column}++;
6653     $self->{nc}
6654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6655     } else {
6656     $self->{set_nc}->($self);
6657     }
6658    
6659     return ($self->{ct}); # ATTLIST
6660     redo A;
6661     } elsif ($self->{nc} == 0x0028) { # (
6662     ## XML5: Same as "anything else".
6663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6664     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6665    
6666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6667     $self->{line_prev} = $self->{line};
6668     $self->{column_prev} = $self->{column};
6669     $self->{column}++;
6670     $self->{nc}
6671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6672     } else {
6673     $self->{set_nc}->($self);
6674     }
6675    
6676     redo A;
6677     } elsif ($self->{nc} == -1) {
6678     ## XML5: No parse error.
6679     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6680     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6681    
6682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6683     $self->{line_prev} = $self->{line};
6684     $self->{column_prev} = $self->{column};
6685     $self->{column}++;
6686     $self->{nc}
6687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6688     } else {
6689     $self->{set_nc}->($self);
6690     }
6691    
6692     return ($self->{ct});
6693     redo A;
6694     } else {
6695     ## XML5: Not defined yet.
6696     $self->{ca}->{type} .= chr $self->{nc};
6697     ## Stay in the state.
6698    
6699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6700     $self->{line_prev} = $self->{line};
6701     $self->{column_prev} = $self->{column};
6702     $self->{column}++;
6703     $self->{nc}
6704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6705     } else {
6706     $self->{set_nc}->($self);
6707     }
6708    
6709     redo A;
6710     }
6711     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6712     if ($is_space->{$self->{nc}}) {
6713     ## Stay in the state.
6714    
6715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716     $self->{line_prev} = $self->{line};
6717     $self->{column_prev} = $self->{column};
6718     $self->{column}++;
6719     $self->{nc}
6720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721     } else {
6722     $self->{set_nc}->($self);
6723     }
6724    
6725     redo A;
6726     } elsif ($self->{nc} == 0x0028) { # (
6727     ## XML5: Same as "anything else".
6728     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6729    
6730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731     $self->{line_prev} = $self->{line};
6732     $self->{column_prev} = $self->{column};
6733     $self->{column}++;
6734     $self->{nc}
6735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736     } else {
6737     $self->{set_nc}->($self);
6738     }
6739    
6740     redo A;
6741     } elsif ($self->{nc} == 0x0023) { # #
6742     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6743    
6744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6745     $self->{line_prev} = $self->{line};
6746     $self->{column_prev} = $self->{column};
6747     $self->{column}++;
6748     $self->{nc}
6749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6750     } else {
6751     $self->{set_nc}->($self);
6752     }
6753    
6754     redo A;
6755     } elsif ($self->{nc} == 0x0022) { # "
6756     ## XML5: Same as "anything else".
6757     $self->{ca}->{value} = '';
6758     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6759    
6760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6761     $self->{line_prev} = $self->{line};
6762     $self->{column_prev} = $self->{column};
6763     $self->{column}++;
6764     $self->{nc}
6765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6766     } else {
6767     $self->{set_nc}->($self);
6768     }
6769    
6770     redo A;
6771     } elsif ($self->{nc} == 0x0027) { # '
6772     ## XML5: Same as "anything else".
6773     $self->{ca}->{value} = '';
6774     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6775    
6776     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6777     $self->{line_prev} = $self->{line};
6778     $self->{column_prev} = $self->{column};
6779     $self->{column}++;
6780     $self->{nc}
6781     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6782     } else {
6783     $self->{set_nc}->($self);
6784     }
6785    
6786     redo A;
6787     } elsif ($self->{nc} == 0x003E) { # >
6788     ## XML5: Same as "anything else".
6789     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6790     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6791    
6792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6793     $self->{line_prev} = $self->{line};
6794     $self->{column_prev} = $self->{column};
6795     $self->{column}++;
6796     $self->{nc}
6797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6798     } else {
6799     $self->{set_nc}->($self);
6800     }
6801    
6802     return ($self->{ct}); # ATTLIST
6803     redo A;
6804     } elsif ($self->{nc} == -1) {
6805     ## XML5: No parse error.
6806     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6807     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6808    
6809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6810     $self->{line_prev} = $self->{line};
6811     $self->{column_prev} = $self->{column};
6812     $self->{column}++;
6813     $self->{nc}
6814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6815     } else {
6816     $self->{set_nc}->($self);
6817     }
6818    
6819     return ($self->{ct});
6820     redo A;
6821     } else {
6822     ## XML5: Switch to the "DOCTYPE bogus comment state".
6823     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6824     $self->{ca}->{value} = '';
6825     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6826     ## Reconsume.
6827     redo A;
6828     }
6829     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6830     if ($is_space->{$self->{nc}}) {
6831     ## Stay in the state.
6832    
6833     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6834     $self->{line_prev} = $self->{line};
6835     $self->{column_prev} = $self->{column};
6836     $self->{column}++;
6837     $self->{nc}
6838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6839     } else {
6840     $self->{set_nc}->($self);
6841     }
6842    
6843     redo A;
6844     } elsif ($self->{nc} == 0x007C) { # |
6845     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6846     ## Stay in the state.
6847    
6848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6849     $self->{line_prev} = $self->{line};
6850     $self->{column_prev} = $self->{column};
6851     $self->{column}++;
6852     $self->{nc}
6853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6854     } else {
6855     $self->{set_nc}->($self);
6856     }
6857    
6858     redo A;
6859     } elsif ($self->{nc} == 0x0029) { # )
6860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6861     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6862    
6863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6864     $self->{line_prev} = $self->{line};
6865     $self->{column_prev} = $self->{column};
6866     $self->{column}++;
6867     $self->{nc}
6868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6869     } else {
6870     $self->{set_nc}->($self);
6871     }
6872    
6873     redo A;
6874     } elsif ($self->{nc} == 0x003E) { # >
6875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6876     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6877    
6878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6879     $self->{line_prev} = $self->{line};
6880     $self->{column_prev} = $self->{column};
6881     $self->{column}++;
6882     $self->{nc}
6883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6884     } else {
6885     $self->{set_nc}->($self);
6886     }
6887    
6888     return ($self->{ct}); # ATTLIST
6889     redo A;
6890     } elsif ($self->{nc} == -1) {
6891     ## XML5: No parse error.
6892     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6893     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6894    
6895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896     $self->{line_prev} = $self->{line};
6897     $self->{column_prev} = $self->{column};
6898     $self->{column}++;
6899     $self->{nc}
6900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901     } else {
6902     $self->{set_nc}->($self);
6903     }
6904    
6905     return ($self->{ct});
6906     redo A;
6907     } else {
6908     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6909     $self->{state} = ALLOWED_TOKEN_STATE;
6910    
6911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912     $self->{line_prev} = $self->{line};
6913     $self->{column_prev} = $self->{column};
6914     $self->{column}++;
6915     $self->{nc}
6916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917     } else {
6918     $self->{set_nc}->($self);
6919     }
6920    
6921     redo A;
6922     }
6923     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6924     if ($is_space->{$self->{nc}}) {
6925     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6926    
6927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928     $self->{line_prev} = $self->{line};
6929     $self->{column_prev} = $self->{column};
6930     $self->{column}++;
6931     $self->{nc}
6932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933     } else {
6934     $self->{set_nc}->($self);
6935     }
6936    
6937     redo A;
6938     } elsif ($self->{nc} == 0x007C) { # |
6939     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6940    
6941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942     $self->{line_prev} = $self->{line};
6943     $self->{column_prev} = $self->{column};
6944     $self->{column}++;
6945     $self->{nc}
6946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947     } else {
6948     $self->{set_nc}->($self);
6949     }
6950    
6951     redo A;
6952     } elsif ($self->{nc} == 0x0029) { # )
6953     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6954    
6955     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6956     $self->{line_prev} = $self->{line};
6957     $self->{column_prev} = $self->{column};
6958     $self->{column}++;
6959     $self->{nc}
6960     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6961     } else {
6962     $self->{set_nc}->($self);
6963     }
6964    
6965     redo A;
6966     } elsif ($self->{nc} == 0x003E) { # >
6967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6968     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6969    
6970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6971     $self->{line_prev} = $self->{line};
6972     $self->{column_prev} = $self->{column};
6973     $self->{column}++;
6974     $self->{nc}
6975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6976     } else {
6977     $self->{set_nc}->($self);
6978     }
6979    
6980     return ($self->{ct}); # ATTLIST
6981     redo A;
6982     } elsif ($self->{nc} == -1) {
6983     ## XML5: No parse error.
6984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6985     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6986    
6987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6988     $self->{line_prev} = $self->{line};
6989     $self->{column_prev} = $self->{column};
6990     $self->{column}++;
6991     $self->{nc}
6992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6993     } else {
6994     $self->{set_nc}->($self);
6995     }
6996    
6997     return ($self->{ct});
6998     redo A;
6999     } else {
7000     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7001     ## Stay in the state.
7002    
7003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7004     $self->{line_prev} = $self->{line};
7005     $self->{column_prev} = $self->{column};
7006     $self->{column}++;
7007     $self->{nc}
7008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7009     } else {
7010     $self->{set_nc}->($self);
7011     }
7012    
7013     redo A;
7014     }
7015     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7016     if ($is_space->{$self->{nc}}) {
7017     ## Stay in the state.
7018    
7019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020     $self->{line_prev} = $self->{line};
7021     $self->{column_prev} = $self->{column};
7022     $self->{column}++;
7023     $self->{nc}
7024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025     } else {
7026     $self->{set_nc}->($self);
7027     }
7028    
7029     redo A;
7030     } elsif ($self->{nc} == 0x007C) { # |
7031     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7032    
7033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034     $self->{line_prev} = $self->{line};
7035     $self->{column_prev} = $self->{column};
7036     $self->{column}++;
7037     $self->{nc}
7038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039     } else {
7040     $self->{set_nc}->($self);
7041     }
7042    
7043     redo A;
7044     } elsif ($self->{nc} == 0x0029) { # )
7045     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7046    
7047     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7048     $self->{line_prev} = $self->{line};
7049     $self->{column_prev} = $self->{column};
7050     $self->{column}++;
7051     $self->{nc}
7052     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7053     } else {
7054     $self->{set_nc}->($self);
7055     }
7056    
7057     redo A;
7058     } elsif ($self->{nc} == 0x003E) { # >
7059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7061    
7062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063     $self->{line_prev} = $self->{line};
7064     $self->{column_prev} = $self->{column};
7065     $self->{column}++;
7066     $self->{nc}
7067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068     } else {
7069     $self->{set_nc}->($self);
7070     }
7071    
7072     return ($self->{ct}); # ATTLIST
7073     redo A;
7074     } elsif ($self->{nc} == -1) {
7075     ## XML5: No parse error.
7076     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7077     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7078    
7079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7080     $self->{line_prev} = $self->{line};
7081     $self->{column_prev} = $self->{column};
7082     $self->{column}++;
7083     $self->{nc}
7084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7085     } else {
7086     $self->{set_nc}->($self);
7087     }
7088    
7089     return ($self->{ct});
7090     redo A;
7091     } else {
7092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7093     line => $self->{line_prev},
7094     column => $self->{column_prev});
7095     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7096     $self->{state} = ALLOWED_TOKEN_STATE;
7097    
7098     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7099     $self->{line_prev} = $self->{line};
7100     $self->{column_prev} = $self->{column};
7101     $self->{column}++;
7102     $self->{nc}
7103     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7104     } else {
7105     $self->{set_nc}->($self);
7106     }
7107    
7108     redo A;
7109     }
7110     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7111     if ($is_space->{$self->{nc}}) {
7112     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7113    
7114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115     $self->{line_prev} = $self->{line};
7116     $self->{column_prev} = $self->{column};
7117     $self->{column}++;
7118     $self->{nc}
7119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120     } else {
7121     $self->{set_nc}->($self);
7122     }
7123    
7124     redo A;
7125     } elsif ($self->{nc} == 0x0023) { # #
7126     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7127     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7128    
7129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7130     $self->{line_prev} = $self->{line};
7131     $self->{column_prev} = $self->{column};
7132     $self->{column}++;
7133     $self->{nc}
7134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7135     } else {
7136     $self->{set_nc}->($self);
7137     }
7138    
7139     redo A;
7140     } elsif ($self->{nc} == 0x0022) { # "
7141     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7142     $self->{ca}->{value} = '';
7143     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7144    
7145     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7146     $self->{line_prev} = $self->{line};
7147     $self->{column_prev} = $self->{column};
7148     $self->{column}++;
7149     $self->{nc}
7150     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7151     } else {
7152     $self->{set_nc}->($self);
7153     }
7154    
7155     redo A;
7156     } elsif ($self->{nc} == 0x0027) { # '
7157     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7158     $self->{ca}->{value} = '';
7159     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7160    
7161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7162     $self->{line_prev} = $self->{line};
7163     $self->{column_prev} = $self->{column};
7164     $self->{column}++;
7165     $self->{nc}
7166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7167     } else {
7168     $self->{set_nc}->($self);
7169     }
7170    
7171     redo A;
7172     } elsif ($self->{nc} == 0x003E) { # >
7173     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7175    
7176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177     $self->{line_prev} = $self->{line};
7178     $self->{column_prev} = $self->{column};
7179     $self->{column}++;
7180     $self->{nc}
7181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182     } else {
7183     $self->{set_nc}->($self);
7184     }
7185    
7186     return ($self->{ct}); # ATTLIST
7187     redo A;
7188     } elsif ($self->{nc} == -1) {
7189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7190     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7191    
7192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193     $self->{line_prev} = $self->{line};
7194     $self->{column_prev} = $self->{column};
7195     $self->{column}++;
7196     $self->{nc}
7197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198     } else {
7199     $self->{set_nc}->($self);
7200     }
7201    
7202     return ($self->{ct});
7203     redo A;
7204     } else {
7205     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7206     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7207     ## Reconsume.
7208     redo A;
7209     }
7210     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7211     if ($is_space->{$self->{nc}}) {
7212     ## Stay in the state.
7213    
7214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7215     $self->{line_prev} = $self->{line};
7216     $self->{column_prev} = $self->{column};
7217     $self->{column}++;
7218     $self->{nc}
7219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7220     } else {
7221     $self->{set_nc}->($self);
7222     }
7223    
7224     redo A;
7225     } elsif ($self->{nc} == 0x0023) { # #
7226     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7227    
7228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229     $self->{line_prev} = $self->{line};
7230     $self->{column_prev} = $self->{column};
7231     $self->{column}++;
7232     $self->{nc}
7233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234     } else {
7235     $self->{set_nc}->($self);
7236     }
7237    
7238     redo A;
7239     } elsif ($self->{nc} == 0x0022) { # "
7240     $self->{ca}->{value} = '';
7241     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7242    
7243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244     $self->{line_prev} = $self->{line};
7245     $self->{column_prev} = $self->{column};
7246     $self->{column}++;
7247     $self->{nc}
7248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249     } else {
7250     $self->{set_nc}->($self);
7251     }
7252    
7253     redo A;
7254     } elsif ($self->{nc} == 0x0027) { # '
7255     $self->{ca}->{value} = '';
7256     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7257    
7258     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7259     $self->{line_prev} = $self->{line};
7260     $self->{column_prev} = $self->{column};
7261     $self->{column}++;
7262     $self->{nc}
7263     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7264     } else {
7265     $self->{set_nc}->($self);
7266     }
7267    
7268     redo A;
7269     } elsif ($self->{nc} == 0x003E) { # >
7270     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7271     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7272    
7273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7274     $self->{line_prev} = $self->{line};
7275     $self->{column_prev} = $self->{column};
7276     $self->{column}++;
7277     $self->{nc}
7278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7279     } else {
7280     $self->{set_nc}->($self);
7281     }
7282    
7283     return ($self->{ct}); # ATTLIST
7284     redo A;
7285     } elsif ($self->{nc} == -1) {
7286     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7287     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7288    
7289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7290     $self->{line_prev} = $self->{line};
7291     $self->{column_prev} = $self->{column};
7292     $self->{column}++;
7293     $self->{nc}
7294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7295     } else {
7296     $self->{set_nc}->($self);
7297     }
7298    
7299     return ($self->{ct});
7300     redo A;
7301     } else {
7302     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7303     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7304     ## Reconsume.
7305     redo A;
7306     }
7307     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7308     if ($is_space->{$self->{nc}}) {
7309     ## XML5: No parse error.
7310     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7311 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7312 wakaba 1.15 ## Reconsume.
7313     redo A;
7314     } elsif ($self->{nc} == 0x0022) { # "
7315     ## XML5: Same as "anything else".
7316     $self->{ca}->{value} = '';
7317     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7318    
7319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7320     $self->{line_prev} = $self->{line};
7321     $self->{column_prev} = $self->{column};
7322     $self->{column}++;
7323     $self->{nc}
7324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7325     } else {
7326     $self->{set_nc}->($self);
7327     }
7328    
7329     redo A;
7330     } elsif ($self->{nc} == 0x0027) { # '
7331     ## XML5: Same as "anything else".
7332     $self->{ca}->{value} = '';
7333     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7334    
7335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7336     $self->{line_prev} = $self->{line};
7337     $self->{column_prev} = $self->{column};
7338     $self->{column}++;
7339     $self->{nc}
7340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7341     } else {
7342     $self->{set_nc}->($self);
7343     }
7344    
7345     redo A;
7346     } elsif ($self->{nc} == 0x003E) { # >
7347     ## XML5: Same as "anything else".
7348     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7349     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7350    
7351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7352     $self->{line_prev} = $self->{line};
7353     $self->{column_prev} = $self->{column};
7354     $self->{column}++;
7355     $self->{nc}
7356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7357     } else {
7358     $self->{set_nc}->($self);
7359     }
7360    
7361     return ($self->{ct}); # ATTLIST
7362     redo A;
7363     } elsif ($self->{nc} == -1) {
7364     ## XML5: No parse error.
7365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7367    
7368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369     $self->{line_prev} = $self->{line};
7370     $self->{column_prev} = $self->{column};
7371     $self->{column}++;
7372     $self->{nc}
7373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374     } else {
7375     $self->{set_nc}->($self);
7376     }
7377    
7378     return ($self->{ct});
7379     redo A;
7380     } else {
7381     $self->{ca}->{default} = chr $self->{nc};
7382     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7383    
7384     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385     $self->{line_prev} = $self->{line};
7386     $self->{column_prev} = $self->{column};
7387     $self->{column}++;
7388     $self->{nc}
7389     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390     } else {
7391     $self->{set_nc}->($self);
7392     }
7393    
7394     redo A;
7395     }
7396     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7397     if ($is_space->{$self->{nc}}) {
7398     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7399    
7400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401     $self->{line_prev} = $self->{line};
7402     $self->{column_prev} = $self->{column};
7403     $self->{column}++;
7404     $self->{nc}
7405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406     } else {
7407     $self->{set_nc}->($self);
7408     }
7409    
7410     redo A;
7411     } elsif ($self->{nc} == 0x0022) { # "
7412     ## XML5: Same as "anything else".
7413     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7414     $self->{ca}->{value} = '';
7415     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7416    
7417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7418     $self->{line_prev} = $self->{line};
7419     $self->{column_prev} = $self->{column};
7420     $self->{column}++;
7421     $self->{nc}
7422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7423     } else {
7424     $self->{set_nc}->($self);
7425     }
7426    
7427     redo A;
7428     } elsif ($self->{nc} == 0x0027) { # '
7429     ## XML5: Same as "anything else".
7430     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7431     $self->{ca}->{value} = '';
7432     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7433    
7434     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7435     $self->{line_prev} = $self->{line};
7436     $self->{column_prev} = $self->{column};
7437     $self->{column}++;
7438     $self->{nc}
7439     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7440     } else {
7441     $self->{set_nc}->($self);
7442     }
7443    
7444     redo A;
7445     } elsif ($self->{nc} == 0x003E) { # >
7446     ## XML5: Same as "anything else".
7447     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7448     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7449    
7450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7451     $self->{line_prev} = $self->{line};
7452     $self->{column_prev} = $self->{column};
7453     $self->{column}++;
7454     $self->{nc}
7455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7456     } else {
7457     $self->{set_nc}->($self);
7458     }
7459    
7460     return ($self->{ct}); # ATTLIST
7461     redo A;
7462     } elsif ($self->{nc} == -1) {
7463     ## XML5: No parse error.
7464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7465     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7466     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7467    
7468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7469     $self->{line_prev} = $self->{line};
7470     $self->{column_prev} = $self->{column};
7471     $self->{column}++;
7472     $self->{nc}
7473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7474     } else {
7475     $self->{set_nc}->($self);
7476     }
7477    
7478     return ($self->{ct});
7479     redo A;
7480     } else {
7481     $self->{ca}->{default} .= chr $self->{nc};
7482     ## Stay in the state.
7483    
7484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485     $self->{line_prev} = $self->{line};
7486     $self->{column_prev} = $self->{column};
7487     $self->{column}++;
7488     $self->{nc}
7489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490     } else {
7491     $self->{set_nc}->($self);
7492     }
7493    
7494     redo A;
7495     }
7496     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7497     if ($is_space->{$self->{nc}}) {
7498     ## Stay in the state.
7499    
7500     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7501     $self->{line_prev} = $self->{line};
7502     $self->{column_prev} = $self->{column};
7503     $self->{column}++;
7504     $self->{nc}
7505     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7506     } else {
7507     $self->{set_nc}->($self);
7508     }
7509    
7510     redo A;
7511     } elsif ($self->{nc} == 0x0022) { # "
7512     $self->{ca}->{value} = '';
7513     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7514    
7515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516     $self->{line_prev} = $self->{line};
7517     $self->{column_prev} = $self->{column};
7518     $self->{column}++;
7519     $self->{nc}
7520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521     } else {
7522     $self->{set_nc}->($self);
7523     }
7524    
7525     redo A;
7526     } elsif ($self->{nc} == 0x0027) { # '
7527     $self->{ca}->{value} = '';
7528     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7529    
7530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531     $self->{line_prev} = $self->{line};
7532     $self->{column_prev} = $self->{column};
7533     $self->{column}++;
7534     $self->{nc}
7535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536     } else {
7537     $self->{set_nc}->($self);
7538     }
7539    
7540     redo A;
7541     } elsif ($self->{nc} == 0x003E) { # >
7542     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7543     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7544    
7545     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7546     $self->{line_prev} = $self->{line};
7547     $self->{column_prev} = $self->{column};
7548     $self->{column}++;
7549     $self->{nc}
7550     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7551     } else {
7552     $self->{set_nc}->($self);
7553     }
7554    
7555     return ($self->{ct}); # ATTLIST
7556     redo A;
7557     } elsif ($self->{nc} == -1) {
7558     ## XML5: No parse error.
7559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7560     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7561     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7562    
7563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7564     $self->{line_prev} = $self->{line};
7565     $self->{column_prev} = $self->{column};
7566     $self->{column}++;
7567     $self->{nc}
7568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7569     } else {
7570     $self->{set_nc}->($self);
7571     }
7572    
7573     return ($self->{ct});
7574     redo A;
7575     } else {
7576     ## XML5: Not defined yet.
7577     if ($self->{ca}->{default} eq 'FIXED') {
7578     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7579     } else {
7580     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7581     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7582     }
7583     ## Reconsume.
7584     redo A;
7585     }
7586     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7587     if ($is_space->{$self->{nc}} or
7588     $self->{nc} == -1 or
7589     $self->{nc} == 0x003E) { # >
7590     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7591     ## Reconsume.
7592     redo A;
7593     } else {
7594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7595     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7596     ## Reconsume.
7597     redo A;
7598 wakaba 1.16 }
7599 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7600     ## ASCII case-insensitive
7601     if ($self->{nc} == [
7602     undef,
7603     0x0044, # D
7604     0x0041, # A
7605     0x0054, # T
7606     ]->[length $self->{kwd}] or
7607     $self->{nc} == [
7608     undef,
7609     0x0064, # d
7610     0x0061, # a
7611     0x0074, # t
7612     ]->[length $self->{kwd}]) {
7613    
7614     ## Stay in the state.
7615     $self->{kwd} .= chr $self->{nc};
7616    
7617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7618     $self->{line_prev} = $self->{line};
7619     $self->{column_prev} = $self->{column};
7620     $self->{column}++;
7621     $self->{nc}
7622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7623     } else {
7624     $self->{set_nc}->($self);
7625     }
7626    
7627     redo A;
7628     } elsif ((length $self->{kwd}) == 4 and
7629     ($self->{nc} == 0x0041 or # A
7630     $self->{nc} == 0x0061)) { # a
7631     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7632    
7633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7634     text => 'NDATA',
7635     line => $self->{line_prev},
7636     column => $self->{column_prev} - 4);
7637     } else {
7638    
7639     }
7640     $self->{state} = AFTER_NDATA_STATE;
7641    
7642     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643     $self->{line_prev} = $self->{line};
7644     $self->{column_prev} = $self->{column};
7645     $self->{column}++;
7646     $self->{nc}
7647     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648     } else {
7649     $self->{set_nc}->($self);
7650     }
7651    
7652     redo A;
7653     } else {
7654     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7655     line => $self->{line_prev},
7656     column => $self->{column_prev} + 1
7657     - length $self->{kwd});
7658    
7659     $self->{state} = BOGUS_MD_STATE;
7660     ## Reconsume.
7661     redo A;
7662     }
7663     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7664     if ($is_space->{$self->{nc}}) {
7665     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7666    
7667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7668     $self->{line_prev} = $self->{line};
7669     $self->{column_prev} = $self->{column};
7670     $self->{column}++;
7671     $self->{nc}
7672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7673     } else {
7674     $self->{set_nc}->($self);
7675     }
7676    
7677     redo A;
7678     } elsif ($self->{nc} == 0x003E) { # >
7679     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7680     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7681    
7682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7683     $self->{line_prev} = $self->{line};
7684     $self->{column_prev} = $self->{column};
7685     $self->{column}++;
7686     $self->{nc}
7687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7688     } else {
7689     $self->{set_nc}->($self);
7690     }
7691    
7692     return ($self->{ct}); # ENTITY
7693     redo A;
7694     } elsif ($self->{nc} == -1) {
7695     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7696     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7697    
7698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7699     $self->{line_prev} = $self->{line};
7700     $self->{column_prev} = $self->{column};
7701     $self->{column}++;
7702     $self->{nc}
7703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7704     } else {
7705     $self->{set_nc}->($self);
7706     }
7707    
7708     return ($self->{ct}); # ENTITY
7709     redo A;
7710     } else {
7711     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7712     line => $self->{line_prev},
7713     column => $self->{column_prev} + 1
7714     - length $self->{kwd});
7715     $self->{state} = BOGUS_MD_STATE;
7716     ## Reconsume.
7717     redo A;
7718     }
7719     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7720     if ($is_space->{$self->{nc}}) {
7721     ## Stay in the state.
7722    
7723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7724     $self->{line_prev} = $self->{line};
7725     $self->{column_prev} = $self->{column};
7726     $self->{column}++;
7727     $self->{nc}
7728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7729     } else {
7730     $self->{set_nc}->($self);
7731     }
7732    
7733     redo A;
7734     } elsif ($self->{nc} == 0x003E) { # >
7735     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7736     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7737    
7738     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7739     $self->{line_prev} = $self->{line};
7740     $self->{column_prev} = $self->{column};
7741     $self->{column}++;
7742     $self->{nc}
7743     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7744     } else {
7745     $self->{set_nc}->($self);
7746     }
7747    
7748     return ($self->{ct}); # ENTITY
7749     redo A;
7750     } elsif ($self->{nc} == -1) {
7751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753    
7754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755     $self->{line_prev} = $self->{line};
7756     $self->{column_prev} = $self->{column};
7757     $self->{column}++;
7758     $self->{nc}
7759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760     } else {
7761     $self->{set_nc}->($self);
7762     }
7763    
7764     return ($self->{ct}); # ENTITY
7765     redo A;
7766     } else {
7767     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7768     $self->{state} = NOTATION_NAME_STATE;
7769    
7770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771     $self->{line_prev} = $self->{line};
7772     $self->{column_prev} = $self->{column};
7773     $self->{column}++;
7774     $self->{nc}
7775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776     } else {
7777     $self->{set_nc}->($self);
7778     }
7779    
7780     redo A;
7781     }
7782     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7783     if ($is_space->{$self->{nc}}) {
7784 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7785 wakaba 1.18
7786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787     $self->{line_prev} = $self->{line};
7788     $self->{column_prev} = $self->{column};
7789     $self->{column}++;
7790     $self->{nc}
7791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792     } else {
7793     $self->{set_nc}->($self);
7794     }
7795    
7796     redo A;
7797     } elsif ($self->{nc} == 0x003E) { # >
7798     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7799    
7800     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7801     $self->{line_prev} = $self->{line};
7802     $self->{column_prev} = $self->{column};
7803     $self->{column}++;
7804     $self->{nc}
7805     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7806     } else {
7807     $self->{set_nc}->($self);
7808     }
7809    
7810     return ($self->{ct}); # ENTITY
7811     redo A;
7812     } elsif ($self->{nc} == -1) {
7813     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7814     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7815    
7816     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7817     $self->{line_prev} = $self->{line};
7818     $self->{column_prev} = $self->{column};
7819     $self->{column}++;
7820     $self->{nc}
7821     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7822     } else {
7823     $self->{set_nc}->($self);
7824     }
7825    
7826     return ($self->{ct}); # ENTITY
7827     redo A;
7828     } else {
7829     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7830     ## Stay in the state.
7831    
7832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7833     $self->{line_prev} = $self->{line};
7834     $self->{column_prev} = $self->{column};
7835     $self->{column}++;
7836     $self->{nc}
7837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7838     } else {
7839     $self->{set_nc}->($self);
7840     }
7841    
7842     redo A;
7843     }
7844 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7845     if ($self->{nc} == 0x0022) { # "
7846 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7847 wakaba 1.19
7848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7849     $self->{line_prev} = $self->{line};
7850     $self->{column_prev} = $self->{column};
7851     $self->{column}++;
7852     $self->{nc}
7853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7854     } else {
7855     $self->{set_nc}->($self);
7856     }
7857    
7858     redo A;
7859     } elsif ($self->{nc} == 0x0026) { # &
7860     $self->{prev_state} = $self->{state};
7861     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7862     $self->{entity_add} = 0x0022; # "
7863    
7864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7865     $self->{line_prev} = $self->{line};
7866     $self->{column_prev} = $self->{column};
7867     $self->{column}++;
7868     $self->{nc}
7869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7870     } else {
7871     $self->{set_nc}->($self);
7872     }
7873    
7874     redo A;
7875     ## TODO: %
7876     } elsif ($self->{nc} == -1) {
7877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7878     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7879     ## Reconsume.
7880     return ($self->{ct}); # ENTITY
7881     redo A;
7882     } else {
7883     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7884    
7885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7886     $self->{line_prev} = $self->{line};
7887     $self->{column_prev} = $self->{column};
7888     $self->{column}++;
7889     $self->{nc}
7890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7891     } else {
7892     $self->{set_nc}->($self);
7893     }
7894    
7895     redo A;
7896     }
7897     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7898     if ($self->{nc} == 0x0027) { # '
7899 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7900 wakaba 1.19
7901     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7902     $self->{line_prev} = $self->{line};
7903     $self->{column_prev} = $self->{column};
7904     $self->{column}++;
7905     $self->{nc}
7906     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7907     } else {
7908     $self->{set_nc}->($self);
7909     }
7910    
7911     redo A;
7912     } elsif ($self->{nc} == 0x0026) { # &
7913     $self->{prev_state} = $self->{state};
7914     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7915     $self->{entity_add} = 0x0027; # '
7916    
7917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7918     $self->{line_prev} = $self->{line};
7919     $self->{column_prev} = $self->{column};
7920     $self->{column}++;
7921     $self->{nc}
7922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7923     } else {
7924     $self->{set_nc}->($self);
7925     }
7926    
7927     redo A;
7928     ## TODO: %
7929     } elsif ($self->{nc} == -1) {
7930     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7931     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7932     ## Reconsume.
7933     return ($self->{ct}); # ENTITY
7934     redo A;
7935     } else {
7936     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7937    
7938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7939     $self->{line_prev} = $self->{line};
7940     $self->{column_prev} = $self->{column};
7941     $self->{column}++;
7942     $self->{nc}
7943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7944     } else {
7945     $self->{set_nc}->($self);
7946     }
7947    
7948     redo A;
7949     }
7950     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7951     if ($is_space->{$self->{nc}} or
7952     {
7953     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7954     $self->{entity_add} => 1,
7955     }->{$self->{nc}}) {
7956 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7957     line => $self->{line_prev},
7958     column => $self->{column_prev}
7959     + ($self->{nc} == -1 ? 1 : 0));
7960 wakaba 1.19 ## Don't consume
7961     ## Return nothing.
7962     #
7963     } elsif ($self->{nc} == 0x0023) { # #
7964     $self->{ca} = $self->{ct};
7965     $self->{state} = ENTITY_HASH_STATE;
7966     $self->{kwd} = '#';
7967    
7968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7969     $self->{line_prev} = $self->{line};
7970     $self->{column_prev} = $self->{column};
7971     $self->{column}++;
7972     $self->{nc}
7973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7974     } else {
7975     $self->{set_nc}->($self);
7976     }
7977    
7978     redo A;
7979     } else {
7980     #
7981     }
7982    
7983     $self->{ct}->{value} .= '&';
7984     $self->{state} = $self->{prev_state};
7985     ## Reconsume.
7986     redo A;
7987 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7988     if ($is_space->{$self->{nc}}) {
7989     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7990    
7991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7992     $self->{line_prev} = $self->{line};
7993     $self->{column_prev} = $self->{column};
7994     $self->{column}++;
7995     $self->{nc}
7996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7997     } else {
7998     $self->{set_nc}->($self);
7999     }
8000    
8001     redo A;
8002     } elsif ($self->{nc} == 0x0028) { # (
8003     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8004     $self->{ct}->{content} = ['('];
8005     $self->{group_depth} = 1;
8006    
8007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8008     $self->{line_prev} = $self->{line};
8009     $self->{column_prev} = $self->{column};
8010     $self->{column}++;
8011     $self->{nc}
8012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8013     } else {
8014     $self->{set_nc}->($self);
8015     }
8016    
8017     redo A;
8018     } elsif ($self->{nc} == 0x003E) { # >
8019     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8020     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8021    
8022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8023     $self->{line_prev} = $self->{line};
8024     $self->{column_prev} = $self->{column};
8025     $self->{column}++;
8026     $self->{nc}
8027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8028     } else {
8029     $self->{set_nc}->($self);
8030     }
8031    
8032     return ($self->{ct}); # ELEMENT
8033     redo A;
8034     } elsif ($self->{nc} == -1) {
8035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8036     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8037    
8038     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039     $self->{line_prev} = $self->{line};
8040     $self->{column_prev} = $self->{column};
8041     $self->{column}++;
8042     $self->{nc}
8043     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044     } else {
8045     $self->{set_nc}->($self);
8046     }
8047    
8048     return ($self->{ct}); # ELEMENT
8049     redo A;
8050     } else {
8051     $self->{ct}->{content} = [chr $self->{nc}];
8052     $self->{state} = CONTENT_KEYWORD_STATE;
8053    
8054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055     $self->{line_prev} = $self->{line};
8056     $self->{column_prev} = $self->{column};
8057     $self->{column}++;
8058     $self->{nc}
8059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060     } else {
8061     $self->{set_nc}->($self);
8062     }
8063    
8064     redo A;
8065     }
8066     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8067     if ($is_space->{$self->{nc}}) {
8068     $self->{state} = AFTER_MD_DEF_STATE;
8069    
8070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071     $self->{line_prev} = $self->{line};
8072     $self->{column_prev} = $self->{column};
8073     $self->{column}++;
8074     $self->{nc}
8075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076     } else {
8077     $self->{set_nc}->($self);
8078     }
8079    
8080     redo A;
8081     } elsif ($self->{nc} == 0x003E) { # >
8082     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8083    
8084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8085     $self->{line_prev} = $self->{line};
8086     $self->{column_prev} = $self->{column};
8087     $self->{column}++;
8088     $self->{nc}
8089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8090     } else {
8091     $self->{set_nc}->($self);
8092     }
8093    
8094     return ($self->{ct}); # ELEMENT
8095     redo A;
8096     } elsif ($self->{nc} == -1) {
8097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8099    
8100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8101     $self->{line_prev} = $self->{line};
8102     $self->{column_prev} = $self->{column};
8103     $self->{column}++;
8104     $self->{nc}
8105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8106     } else {
8107     $self->{set_nc}->($self);
8108     }
8109    
8110     return ($self->{ct}); # ELEMENT
8111     redo A;
8112     } else {
8113     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8114     ## Stay in the state.
8115    
8116     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8117     $self->{line_prev} = $self->{line};
8118     $self->{column_prev} = $self->{column};
8119     $self->{column}++;
8120     $self->{nc}
8121     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8122     } else {
8123     $self->{set_nc}->($self);
8124     }
8125    
8126     redo A;
8127     }
8128     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8129     if ($is_space->{$self->{nc}}) {
8130     ## Stay in the state.
8131    
8132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8133     $self->{line_prev} = $self->{line};
8134     $self->{column_prev} = $self->{column};
8135     $self->{column}++;
8136     $self->{nc}
8137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8138     } else {
8139     $self->{set_nc}->($self);
8140     }
8141    
8142     redo A;
8143     } elsif ($self->{nc} == 0x0028) { # (
8144     $self->{group_depth}++;
8145     push @{$self->{ct}->{content}}, chr $self->{nc};
8146     ## Stay in the state.
8147    
8148     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8149     $self->{line_prev} = $self->{line};
8150     $self->{column_prev} = $self->{column};
8151     $self->{column}++;
8152     $self->{nc}
8153     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8154     } else {
8155     $self->{set_nc}->($self);
8156     }
8157    
8158     redo A;
8159     } elsif ($self->{nc} == 0x007C or # |
8160     $self->{nc} == 0x002C) { # ,
8161     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8162     ## Stay in the state.
8163    
8164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8165     $self->{line_prev} = $self->{line};
8166     $self->{column_prev} = $self->{column};
8167     $self->{column}++;
8168     $self->{nc}
8169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8170     } else {
8171     $self->{set_nc}->($self);
8172     }
8173    
8174     redo A;
8175     } elsif ($self->{nc} == 0x0029) { # )
8176     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8177     push @{$self->{ct}->{content}}, chr $self->{nc};
8178     $self->{group_depth}--;
8179     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8180    
8181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8182     $self->{line_prev} = $self->{line};
8183     $self->{column_prev} = $self->{column};
8184     $self->{column}++;
8185     $self->{nc}
8186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8187     } else {
8188     $self->{set_nc}->($self);
8189     }
8190    
8191     redo A;
8192     } elsif ($self->{nc} == 0x003E) { # >
8193     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8194     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8195     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8196    
8197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8198     $self->{line_prev} = $self->{line};
8199     $self->{column_prev} = $self->{column};
8200     $self->{column}++;
8201     $self->{nc}
8202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8203     } else {
8204     $self->{set_nc}->($self);
8205     }
8206    
8207     return ($self->{ct}); # ELEMENT
8208     redo A;
8209     } elsif ($self->{nc} == -1) {
8210     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8211     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8212     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8213    
8214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8215     $self->{line_prev} = $self->{line};
8216     $self->{column_prev} = $self->{column};
8217     $self->{column}++;
8218     $self->{nc}
8219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8220     } else {
8221     $self->{set_nc}->($self);
8222     }
8223    
8224     return ($self->{ct}); # ELEMENT
8225     redo A;
8226     } else {
8227     push @{$self->{ct}->{content}}, chr $self->{nc};
8228     $self->{state} = CM_ELEMENT_NAME_STATE;
8229    
8230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231     $self->{line_prev} = $self->{line};
8232     $self->{column_prev} = $self->{column};
8233     $self->{column}++;
8234     $self->{nc}
8235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236     } else {
8237     $self->{set_nc}->($self);
8238     }
8239    
8240     redo A;
8241     }
8242     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8243     if ($is_space->{$self->{nc}}) {
8244     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8245    
8246     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247     $self->{line_prev} = $self->{line};
8248     $self->{column_prev} = $self->{column};
8249     $self->{column}++;
8250     $self->{nc}
8251     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252     } else {
8253     $self->{set_nc}->($self);
8254     }
8255    
8256     redo A;
8257     } elsif ($self->{nc} == 0x002A or # *
8258     $self->{nc} == 0x002B or # +
8259     $self->{nc} == 0x003F) { # ?
8260     push @{$self->{ct}->{content}}, chr $self->{nc};
8261     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8262    
8263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264     $self->{line_prev} = $self->{line};
8265     $self->{column_prev} = $self->{column};
8266     $self->{column}++;
8267     $self->{nc}
8268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269     } else {
8270     $self->{set_nc}->($self);
8271     }
8272    
8273     redo A;
8274     } elsif ($self->{nc} == 0x007C or # |
8275     $self->{nc} == 0x002C) { # ,
8276     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8277     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8278    
8279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280     $self->{line_prev} = $self->{line};
8281     $self->{column_prev} = $self->{column};
8282     $self->{column}++;
8283     $self->{nc}
8284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285     } else {
8286     $self->{set_nc}->($self);
8287     }
8288    
8289     redo A;
8290     } elsif ($self->{nc} == 0x0029) { # )
8291     $self->{group_depth}--;
8292     push @{$self->{ct}->{content}}, chr $self->{nc};
8293     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8294    
8295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8296     $self->{line_prev} = $self->{line};
8297     $self->{column_prev} = $self->{column};
8298     $self->{column}++;
8299     $self->{nc}
8300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8301     } else {
8302     $self->{set_nc}->($self);
8303     }
8304    
8305     redo A;
8306     } elsif ($self->{nc} == 0x003E) { # >
8307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8308     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8309     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8310    
8311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8312     $self->{line_prev} = $self->{line};
8313     $self->{column_prev} = $self->{column};
8314     $self->{column}++;
8315     $self->{nc}
8316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8317     } else {
8318     $self->{set_nc}->($self);
8319     }
8320    
8321     return ($self->{ct}); # ELEMENT
8322     redo A;
8323     } elsif ($self->{nc} == -1) {
8324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8325     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8326     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8327    
8328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8329     $self->{line_prev} = $self->{line};
8330     $self->{column_prev} = $self->{column};
8331     $self->{column}++;
8332     $self->{nc}
8333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8334     } else {
8335     $self->{set_nc}->($self);
8336     }
8337    
8338     return ($self->{ct}); # ELEMENT
8339     redo A;
8340     } else {
8341     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8342     ## Stay in the state.
8343    
8344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8345     $self->{line_prev} = $self->{line};
8346     $self->{column_prev} = $self->{column};
8347     $self->{column}++;
8348     $self->{nc}
8349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8350     } else {
8351     $self->{set_nc}->($self);
8352     }
8353    
8354     redo A;
8355     }
8356     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8357     if ($is_space->{$self->{nc}}) {
8358     ## Stay in the state.
8359    
8360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361     $self->{line_prev} = $self->{line};
8362     $self->{column_prev} = $self->{column};
8363     $self->{column}++;
8364     $self->{nc}
8365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366     } else {
8367     $self->{set_nc}->($self);
8368     }
8369    
8370     redo A;
8371     } elsif ($self->{nc} == 0x007C or # |
8372     $self->{nc} == 0x002C) { # ,
8373     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8374     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8375    
8376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8377     $self->{line_prev} = $self->{line};
8378     $self->{column_prev} = $self->{column};
8379     $self->{column}++;
8380     $self->{nc}
8381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8382     } else {
8383     $self->{set_nc}->($self);
8384     }
8385    
8386     redo A;
8387     } elsif ($self->{nc} == 0x0029) { # )
8388     $self->{group_depth}--;
8389     push @{$self->{ct}->{content}}, chr $self->{nc};
8390     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8391    
8392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8393     $self->{line_prev} = $self->{line};
8394     $self->{column_prev} = $self->{column};
8395     $self->{column}++;
8396     $self->{nc}
8397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8398     } else {
8399     $self->{set_nc}->($self);
8400     }
8401    
8402     redo A;
8403     } elsif ($self->{nc} == 0x003E) { # >
8404     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8405     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8406     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8407    
8408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8409     $self->{line_prev} = $self->{line};
8410     $self->{column_prev} = $self->{column};
8411     $self->{column}++;
8412     $self->{nc}
8413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8414     } else {
8415     $self->{set_nc}->($self);
8416     }
8417    
8418     return ($self->{ct}); # ELEMENT
8419     redo A;
8420     } elsif ($self->{nc} == -1) {
8421     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8422     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8423     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8424    
8425     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8426     $self->{line_prev} = $self->{line};
8427     $self->{column_prev} = $self->{column};
8428     $self->{column}++;
8429     $self->{nc}
8430     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8431     } else {
8432     $self->{set_nc}->($self);
8433     }
8434    
8435     return ($self->{ct}); # ELEMENT
8436     redo A;
8437     } else {
8438     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8439     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8440     $self->{state} = BOGUS_MD_STATE;
8441    
8442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8443     $self->{line_prev} = $self->{line};
8444     $self->{column_prev} = $self->{column};
8445     $self->{column}++;
8446     $self->{nc}
8447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8448     } else {
8449     $self->{set_nc}->($self);
8450     }
8451    
8452     redo A;
8453     }
8454     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8455     if ($is_space->{$self->{nc}}) {
8456     if ($self->{group_depth}) {
8457     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8458     } else {
8459     $self->{state} = AFTER_MD_DEF_STATE;
8460     }
8461    
8462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8463     $self->{line_prev} = $self->{line};
8464     $self->{column_prev} = $self->{column};
8465     $self->{column}++;
8466     $self->{nc}
8467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8468     } else {
8469     $self->{set_nc}->($self);
8470     }
8471    
8472     redo A;
8473     } elsif ($self->{nc} == 0x002A or # *
8474     $self->{nc} == 0x002B or # +
8475     $self->{nc} == 0x003F) { # ?
8476     push @{$self->{ct}->{content}}, chr $self->{nc};
8477     if ($self->{group_depth}) {
8478     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8479     } else {
8480     $self->{state} = AFTER_MD_DEF_STATE;
8481     }
8482    
8483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8484     $self->{line_prev} = $self->{line};
8485     $self->{column_prev} = $self->{column};
8486     $self->{column}++;
8487     $self->{nc}
8488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8489     } else {
8490     $self->{set_nc}->($self);
8491     }
8492    
8493     redo A;
8494     } elsif ($self->{nc} == 0x0029) { # )
8495     if ($self->{group_depth}) {
8496     $self->{group_depth}--;
8497     push @{$self->{ct}->{content}}, chr $self->{nc};
8498     ## Stay in the state.
8499    
8500     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8501     $self->{line_prev} = $self->{line};
8502     $self->{column_prev} = $self->{column};
8503     $self->{column}++;
8504     $self->{nc}
8505     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8506     } else {
8507     $self->{set_nc}->($self);
8508     }
8509    
8510     redo A;
8511     } else {
8512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8513     $self->{state} = BOGUS_MD_STATE;
8514     ## Reconsume.
8515     redo A;
8516     }
8517     } elsif ($self->{nc} == 0x003E) { # >
8518     if ($self->{group_depth}) {
8519     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8520     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8521     }
8522     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8523    
8524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8525     $self->{line_prev} = $self->{line};
8526     $self->{column_prev} = $self->{column};
8527     $self->{column}++;
8528     $self->{nc}
8529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8530     } else {
8531     $self->{set_nc}->($self);
8532     }
8533    
8534     return ($self->{ct}); # ELEMENT
8535     redo A;
8536     } elsif ($self->{nc} == -1) {
8537     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8538     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8540    
8541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8542     $self->{line_prev} = $self->{line};
8543     $self->{column_prev} = $self->{column};
8544     $self->{column}++;
8545     $self->{nc}
8546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8547     } else {
8548     $self->{set_nc}->($self);
8549     }
8550    
8551     return ($self->{ct}); # ELEMENT
8552     redo A;
8553     } else {
8554     if ($self->{group_depth}) {
8555     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8556     } else {
8557     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8558     $self->{state} = BOGUS_MD_STATE;
8559     }
8560     ## Reconsume.
8561     redo A;
8562     }
8563     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8564 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8565     ## Stay in the state.
8566    
8567     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8568     $self->{line_prev} = $self->{line};
8569     $self->{column_prev} = $self->{column};
8570     $self->{column}++;
8571     $self->{nc}
8572     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8573     } else {
8574     $self->{set_nc}->($self);
8575     }
8576    
8577     redo A;
8578     } elsif ($self->{nc} == 0x003E) { # >
8579     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580    
8581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582     $self->{line_prev} = $self->{line};
8583     $self->{column_prev} = $self->{column};
8584     $self->{column}++;
8585     $self->{nc}
8586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587     } else {
8588     $self->{set_nc}->($self);
8589     }
8590    
8591 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8592 wakaba 1.18 redo A;
8593     } elsif ($self->{nc} == -1) {
8594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8595     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8596    
8597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8598     $self->{line_prev} = $self->{line};
8599     $self->{column_prev} = $self->{column};
8600     $self->{column}++;
8601     $self->{nc}
8602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8603     } else {
8604     $self->{set_nc}->($self);
8605     }
8606    
8607 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8608 wakaba 1.18 redo A;
8609     } else {
8610 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8611 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8612     ## Reconsume.
8613     redo A;
8614     }
8615 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8616     if ($self->{nc} == 0x003E) { # >
8617     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8618    
8619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8620     $self->{line_prev} = $self->{line};
8621     $self->{column_prev} = $self->{column};
8622     $self->{column}++;
8623     $self->{nc}
8624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8625     } else {
8626     $self->{set_nc}->($self);
8627     }
8628    
8629     return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8630     redo A;
8631     } elsif ($self->{nc} == -1) {
8632     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633     ## Reconsume.
8634     return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8635     redo A;
8636     } else {
8637     ## Stay in the state.
8638    
8639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8640     $self->{line_prev} = $self->{line};
8641     $self->{column_prev} = $self->{column};
8642     $self->{column}++;
8643     $self->{nc}
8644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8645     } else {
8646     $self->{set_nc}->($self);
8647     }
8648    
8649     redo A;
8650     }
8651 wakaba 1.1 } else {
8652     die "$0: $self->{state}: Unknown state";
8653     }
8654     } # A
8655    
8656     die "$0: _get_next_token: unexpected case";
8657     } # _get_next_token
8658    
8659     1;
8660 wakaba 1.25 ## $Date: 2008/10/19 14:05:20 $
8661 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24