/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.20 - (hide annotations) (download)
Sun Oct 19 08:20:29 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.19: +598 -16 lines
++ whatpm/t/ChangeLog	19 Oct 2008 08:20:14 -0000
	* XML-Parser.t: "xml/eldecls-1.dat" added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 08:17:10 -0000
	* eldecls-1.dat: New test data file.

	* entities-1.dat: Test result updated.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 08:18:56 -0000
	* Tokenizer.pm.src: Support for <!ELEMENT>.
	(AFTER_NOTATION_NAME_STATE): Renamed as |AFTER_MD_DEF_STATE| (i.e.
	after markup declaration definition state).

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 08:19:52 -0000
	* Parser.pm.src: Set |content_model_text| attribute for
	ElementTypeDefinition nodes.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.20 our $VERSION=do{my @r=(q$Revision: 1.19 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
BEGIN {
  ## Make this package an Exporter subclass.  |@ISA| is appended to
  ## rather than overwritten, so anything already in it is kept.
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## Names of the token type constants.  They are listed once and
  ## shared by both export tables below so the two cannot drift apart.
  my @token_constants = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  ## Each constant can be imported by name ...
  our @EXPORT_OK = @token_constants;

  ## ... or all of them at once with the |:token| tag.
  our %EXPORT_TAGS = (
    token => [@token_constants],
  );
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric codes stored in the |type| field of a token.  Each one is a
## sub with an empty prototype and a constant body, which Perl can
## inline at compile time.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The three CM_* constants are single-bit masks; the *_CONTENT_MODEL
## constants are combinations of them and are tested with bitwise |&|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < markup
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & and limited < markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & and any < markup
87    
88     ## Tokenizer states
89    
## Each constant below is a distinct integer naming one tokenizer state;
## |$self->{state}| is compared against them with |==|.  The values 1
## and 12 are not assigned: the two states that used them are commented
## out.  Trailing comments give the name of the state in the spec where
## one spec state is implemented by several states here.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states (44 to 49):
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188     sub AFTER_ELEMENT_NAME_STATE () { 93 }
189     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190     sub CONTENT_KEYWORD_STATE () { 95 }
191     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192     sub CM_ELEMENT_NAME_STATE () { 97 }
193     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195     sub AFTER_MD_DEF_STATE () { 100 } # "after markup declaration definition state"
196     sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## NOTE(review): both literals below evaluate to the same number, 0x800
## (the underscore in the second one is only a digit separator).
## Presumably they are bits of two different bit sets -- confirm against
## the full list in Whatpm::HTML.
201     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202     sub FOREIGN_EL () { 0b1_00000000000 }
203    
204     ## Character reference mappings
205    
## Maps the code point named by a numeric character reference to the
## code point that replaces it.  Code points that are not keys of the
## map have no entry here.
my $charref_map = {0x0D => 0x000A};
{
  ## Replacements for 0x80..0x9F, in code point order.
  ## NOTE(review): the values look like the Windows-1252 repertoire with
  ## U+FFFD in the unassigned slots -- confirm against the spec.
  my @c1_replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  @{$charref_map}{0x80 .. 0x9F} = @c1_replacement;

  ## Code points that are replaced by U+FFFD outright.
  my @replaced = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF, 0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
  );

  ## The last two code points (U+xxFFFE and U+xxFFFF) of each of the
  ## planes 0 to 16.
  push @replaced, map {
    my $plane_base = $_ << 16;
    ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
  } 0x00 .. 0x10;

  @{$charref_map}{@replaced} = (0xFFFD) x @replaced;
} # $charref_map
249    
250     ## Implementations MUST act as if they used the state machine described in the spec.
251    
## Resets the tokenizer-related fields of the parser object to their
## initial values and requests the first input character.
##
## Argument: the parser object, a hash reference.  Its |set_nc| field
## must already hold a code reference; it is invoked with the object as
## its only argument.
## Called for its side effects on |$self|.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to take here; the first character always comes
  ## from the input callback.  (The former "read from |char_buffer|"
  ## branch could not be reached and has been removed.)
  ## Presumably the callback stores the character in |$self->{nc}| --
  ## confirm against its definition.
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens waiting to be returned
  # $self->{escape}
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
312     ## while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## Lookup table of the code points that the tokenizer treats as space
## characters: |$is_space->{$code_point}| is true for them.
my $is_space = {
  map { $_ => 1 }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    #0x000B is not listed: LINE TABULATION (VT)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    #0x000D is not listed: CARRIAGE RETURN (CR)
    0x0020, # SPACE (SP)
};
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not
926     ## explicitly state that the next input character is to be
927     ## reconsumed, it will be included in the |data| of the comment
928     ## token generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
## Finalizes the attribute currently being built (|$self->{ca}|).
## Invoked in this state just before the tokenizer leaves the
## attribute name.  The attribute is attached to the current token
## (|$self->{ct}|, a start tag or end tag) unless an attribute of the
## same name is already present there.
my $before_leave = sub {
  my $name = $self->{ca}->{name};

  if (not exists $self->{ct}->{attributes}->{$name}) {
    ## First occurrence of this name: register the attribute on the
    ## token and record its ordinal position among the token's
    ## attributes.
    $self->{ct}->{attributes}->{$name} = $self->{ca};
    $self->{ca}->{index} = ++$self->{ct}->{last_index};
  } else {
    ## Duplicate name: report it at the position recorded for the
    ## attribute.  |$self->{ca}| is not attached to the token, i.e. it
    ## is discarded (MUST).
    $self->{parse_error}->(
      level => $self->{level}->{must},
      type => 'duplicate attribute',
      text => $name,
      line => $self->{ca}->{line},
      column => $self->{ca}->{column},
    );
  }
}; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743     if ($self->{nc} == 0x003D) { # =
1744    
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819     } elsif ($self->{nc} == -1) {
1820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1821     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822    
1823     $self->{last_stag_name} = $self->{ct}->{tag_name};
1824 wakaba 1.15
1825     $self->{state} = DATA_STATE;
1826     $self->{s_kwd} = '';
1827     ## reconsume
1828     return ($self->{ct}); # start tag
1829     redo A;
1830 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832     if ($self->{ct}->{attributes}) {
1833    
1834     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1835     } else {
1836     ## NOTE: This state should never be reached.
1837    
1838     }
1839 wakaba 1.15
1840     $self->{state} = DATA_STATE;
1841     $self->{s_kwd} = '';
1842     ## reconsume
1843     return ($self->{ct}); # end tag
1844     redo A;
1845     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846     ## XML5: No parse error above; not defined yet.
1847     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849     ## Reconsume.
1850     return ($self->{ct}); # ATTLIST
1851     redo A;
1852 wakaba 1.1 } else {
1853     die "$0: $self->{ct}->{type}: Unknown token type";
1854     }
1855     } else {
1856 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1857 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858    
1859     ## XML5: Not a parse error.
1860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861     } else {
1862    
1863     }
1864 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1865     $self->{read_until}->($self->{ca}->{value},
1866 wakaba 1.11 q["&<],
1867 wakaba 1.1 length $self->{ca}->{value});
1868    
1869     ## Stay in the state
1870    
1871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1872     $self->{line_prev} = $self->{line};
1873     $self->{column_prev} = $self->{column};
1874     $self->{column}++;
1875     $self->{nc}
1876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1877     } else {
1878     $self->{set_nc}->($self);
1879     }
1880    
1881     redo A;
1882     }
1883     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885     ## ATTLIST attribute value single quoted state".
1886 wakaba 1.11
1887 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1888 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889    
1890     ## XML5: "DOCTYPE ATTLIST name after state".
1891     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893     } else {
1894    
1895     ## XML5: "Before attribute name state" (sic).
1896     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897     }
1898 wakaba 1.1
1899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900     $self->{line_prev} = $self->{line};
1901     $self->{column_prev} = $self->{column};
1902     $self->{column}++;
1903     $self->{nc}
1904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1905     } else {
1906     $self->{set_nc}->($self);
1907     }
1908    
1909     redo A;
1910     } elsif ($self->{nc} == 0x0026) { # &
1911    
1912 wakaba 1.11 ## XML5: Not defined yet.
1913    
1914 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1915     ## "entity in attribute value state". In this implementation, the
1916     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1917     ## implementation of the "consume a character reference" algorithm.
1918     $self->{entity_add} = 0x0027; # '
1919     $self->{prev_state} = $self->{state};
1920     $self->{state} = ENTITY_STATE;
1921    
1922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923     $self->{line_prev} = $self->{line};
1924     $self->{column_prev} = $self->{column};
1925     $self->{column}++;
1926     $self->{nc}
1927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928     } else {
1929     $self->{set_nc}->($self);
1930     }
1931    
1932     redo A;
1933     } elsif ($self->{nc} == -1) {
1934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1935     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936    
1937     $self->{last_stag_name} = $self->{ct}->{tag_name};
1938 wakaba 1.15
1939     $self->{state} = DATA_STATE;
1940     $self->{s_kwd} = '';
1941     ## reconsume
1942     return ($self->{ct}); # start tag
1943     redo A;
1944 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946     if ($self->{ct}->{attributes}) {
1947    
1948     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1949     } else {
1950     ## NOTE: This state should never be reached.
1951    
1952     }
1953 wakaba 1.15
1954     $self->{state} = DATA_STATE;
1955     $self->{s_kwd} = '';
1956     ## reconsume
1957     return ($self->{ct}); # end tag
1958     redo A;
1959     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960     ## XML5: No parse error above; not defined yet.
1961     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963     ## Reconsume.
1964     return ($self->{ct}); # ATTLIST
1965     redo A;
1966 wakaba 1.1 } else {
1967     die "$0: $self->{ct}->{type}: Unknown token type";
1968     }
1969     } else {
1970 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1971 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972    
1973     ## XML5: Not a parse error.
1974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975     } else {
1976    
1977     }
1978 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1979     $self->{read_until}->($self->{ca}->{value},
1980 wakaba 1.11 q['&<],
1981 wakaba 1.1 length $self->{ca}->{value});
1982    
1983     ## Stay in the state
1984    
1985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986     $self->{line_prev} = $self->{line};
1987     $self->{column_prev} = $self->{column};
1988     $self->{column}++;
1989     $self->{nc}
1990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991     } else {
1992     $self->{set_nc}->($self);
1993     }
1994    
1995     redo A;
1996     }
1997     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1999    
2000 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2001 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002    
2003     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005     } else {
2006    
2007     ## XML5: "Tag attribute name before state".
2008     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009     }
2010 wakaba 1.1
2011     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012     $self->{line_prev} = $self->{line};
2013     $self->{column_prev} = $self->{column};
2014     $self->{column}++;
2015     $self->{nc}
2016     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017     } else {
2018     $self->{set_nc}->($self);
2019     }
2020    
2021     redo A;
2022     } elsif ($self->{nc} == 0x0026) { # &
2023    
2024 wakaba 1.11
2025     ## XML5: Not defined yet.
2026    
2027 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2028     ## "entity in attribute value state". In this implementation, the
2029     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2030     ## implementation of the "consume a character reference" algorithm.
2031     $self->{entity_add} = -1;
2032     $self->{prev_state} = $self->{state};
2033     $self->{state} = ENTITY_STATE;
2034    
2035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036     $self->{line_prev} = $self->{line};
2037     $self->{column_prev} = $self->{column};
2038     $self->{column}++;
2039     $self->{nc}
2040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041     } else {
2042     $self->{set_nc}->($self);
2043     }
2044    
2045     redo A;
2046     } elsif ($self->{nc} == 0x003E) { # >
2047     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048    
2049     $self->{last_stag_name} = $self->{ct}->{tag_name};
2050 wakaba 1.15
2051     $self->{state} = DATA_STATE;
2052     $self->{s_kwd} = '';
2053    
2054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055     $self->{line_prev} = $self->{line};
2056     $self->{column_prev} = $self->{column};
2057     $self->{column}++;
2058     $self->{nc}
2059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060     } else {
2061     $self->{set_nc}->($self);
2062     }
2063    
2064     return ($self->{ct}); # start tag
2065     redo A;
2066 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068     if ($self->{ct}->{attributes}) {
2069    
2070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2071     } else {
2072     ## NOTE: This state should never be reached.
2073    
2074     }
2075 wakaba 1.15
2076     $self->{state} = DATA_STATE;
2077     $self->{s_kwd} = '';
2078    
2079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080     $self->{line_prev} = $self->{line};
2081     $self->{column_prev} = $self->{column};
2082     $self->{column}++;
2083     $self->{nc}
2084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085     } else {
2086     $self->{set_nc}->($self);
2087     }
2088    
2089     return ($self->{ct}); # end tag
2090     redo A;
2091     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094    
2095 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096     $self->{line_prev} = $self->{line};
2097     $self->{column_prev} = $self->{column};
2098     $self->{column}++;
2099     $self->{nc}
2100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101     } else {
2102     $self->{set_nc}->($self);
2103     }
2104    
2105 wakaba 1.15 return ($self->{ct}); # ATTLIST
2106     redo A;
2107     } else {
2108     die "$0: $self->{ct}->{type}: Unknown token type";
2109     }
2110 wakaba 1.1 } elsif ($self->{nc} == -1) {
2111     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112    
2113 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2115 wakaba 1.15
2116     $self->{state} = DATA_STATE;
2117     $self->{s_kwd} = '';
2118     ## reconsume
2119     return ($self->{ct}); # start tag
2120     redo A;
2121 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124     if ($self->{ct}->{attributes}) {
2125    
2126     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2127     } else {
2128     ## NOTE: This state should never be reached.
2129    
2130     }
2131 wakaba 1.15
2132     $self->{state} = DATA_STATE;
2133     $self->{s_kwd} = '';
2134     ## reconsume
2135     return ($self->{ct}); # end tag
2136     redo A;
2137     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141     ## Reconsume.
2142     return ($self->{ct}); # ATTLIST
2143     redo A;
2144 wakaba 1.1 } else {
2145     die "$0: $self->{ct}->{type}: Unknown token type";
2146     }
2147     } else {
2148     if ({
2149     0x0022 => 1, # "
2150     0x0027 => 1, # '
2151     0x003D => 1, # =
2152     }->{$self->{nc}}) {
2153    
2154 wakaba 1.11 ## XML5: Not a parse error.
2155 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156     } else {
2157    
2158     }
2159     $self->{ca}->{value} .= chr ($self->{nc});
2160     $self->{read_until}->($self->{ca}->{value},
2161     q["'=& >],
2162     length $self->{ca}->{value});
2163    
2164     ## Stay in the state
2165    
2166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167     $self->{line_prev} = $self->{line};
2168     $self->{column_prev} = $self->{column};
2169     $self->{column}++;
2170     $self->{nc}
2171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172     } else {
2173     $self->{set_nc}->($self);
2174     }
2175    
2176     redo A;
2177     }
2178     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2179     if ($is_space->{$self->{nc}}) {
2180    
2181     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182    
2183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2184     $self->{line_prev} = $self->{line};
2185     $self->{column_prev} = $self->{column};
2186     $self->{column}++;
2187     $self->{nc}
2188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2189     } else {
2190     $self->{set_nc}->($self);
2191     }
2192    
2193     redo A;
2194     } elsif ($self->{nc} == 0x003E) { # >
2195     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2196    
2197     $self->{last_stag_name} = $self->{ct}->{tag_name};
2198     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2199     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2200     if ($self->{ct}->{attributes}) {
2201    
2202     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2203     } else {
2204     ## NOTE: This state should never be reached.
2205    
2206     }
2207     } else {
2208     die "$0: $self->{ct}->{type}: Unknown token type";
2209     }
2210     $self->{state} = DATA_STATE;
2211 wakaba 1.5 $self->{s_kwd} = '';
2212 wakaba 1.1
2213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214     $self->{line_prev} = $self->{line};
2215     $self->{column_prev} = $self->{column};
2216     $self->{column}++;
2217     $self->{nc}
2218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2219     } else {
2220     $self->{set_nc}->($self);
2221     }
2222    
2223    
2224     return ($self->{ct}); # start tag or end tag
2225    
2226     redo A;
2227     } elsif ($self->{nc} == 0x002F) { # /
2228    
2229     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2230    
2231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2232     $self->{line_prev} = $self->{line};
2233     $self->{column_prev} = $self->{column};
2234     $self->{column}++;
2235     $self->{nc}
2236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2237     } else {
2238     $self->{set_nc}->($self);
2239     }
2240    
2241     redo A;
2242     } elsif ($self->{nc} == -1) {
2243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2244     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2245    
2246     $self->{last_stag_name} = $self->{ct}->{tag_name};
2247     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2248     if ($self->{ct}->{attributes}) {
2249    
2250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2251     } else {
2252     ## NOTE: This state should never be reached.
2253    
2254     }
2255     } else {
2256     die "$0: $self->{ct}->{type}: Unknown token type";
2257     }
2258     $self->{state} = DATA_STATE;
2259 wakaba 1.5 $self->{s_kwd} = '';
2260 wakaba 1.1 ## Reconsume.
2261     return ($self->{ct}); # start tag or end tag
2262     redo A;
2263     } else {
2264    
2265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2266     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2267     ## reconsume
2268     redo A;
2269     }
2270     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271 wakaba 1.11 ## XML5: "Empty tag state".
2272    
2273 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2274     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275    
2276     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2277     ## TODO: Different type than slash in start tag
2278     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2279     if ($self->{ct}->{attributes}) {
2280    
2281     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2282     } else {
2283    
2284     }
2285     ## TODO: Test |<title></title/>|
2286     } else {
2287    
2288     $self->{self_closing} = 1;
2289     }
2290    
2291     $self->{state} = DATA_STATE;
2292 wakaba 1.5 $self->{s_kwd} = '';
2293 wakaba 1.1
2294     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295     $self->{line_prev} = $self->{line};
2296     $self->{column_prev} = $self->{column};
2297     $self->{column}++;
2298     $self->{nc}
2299     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300     } else {
2301     $self->{set_nc}->($self);
2302     }
2303    
2304    
2305     return ($self->{ct}); # start tag or end tag
2306    
2307     redo A;
2308     } elsif ($self->{nc} == -1) {
2309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2310     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2311    
2312     $self->{last_stag_name} = $self->{ct}->{tag_name};
2313     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318     ## NOTE: This state should never be reached.
2319    
2320     }
2321     } else {
2322     die "$0: $self->{ct}->{type}: Unknown token type";
2323     }
2324 wakaba 1.11 ## XML5: "Tag attribute name before state".
2325 wakaba 1.1 $self->{state} = DATA_STATE;
2326 wakaba 1.5 $self->{s_kwd} = '';
2327 wakaba 1.1 ## Reconsume.
2328     return ($self->{ct}); # start tag or end tag
2329     redo A;
2330     } else {
2331    
2332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2333     ## TODO: This error type is wrong.
2334     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2335     ## Reconsume.
2336     redo A;
2337     }
2338     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340    
2341 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2342     ## consumes characters on a one-by-one basis.
2343    
2344     if ($self->{nc} == 0x003E) { # >
2345 wakaba 1.13 if ($self->{in_subset}) {
2346    
2347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348     } else {
2349    
2350     $self->{state} = DATA_STATE;
2351     $self->{s_kwd} = '';
2352     }
2353 wakaba 1.1
2354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355     $self->{line_prev} = $self->{line};
2356     $self->{column_prev} = $self->{column};
2357     $self->{column}++;
2358     $self->{nc}
2359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2360     } else {
2361     $self->{set_nc}->($self);
2362     }
2363    
2364    
2365     return ($self->{ct}); # comment
2366     redo A;
2367     } elsif ($self->{nc} == -1) {
2368 wakaba 1.13 if ($self->{in_subset}) {
2369    
2370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371     } else {
2372    
2373     $self->{state} = DATA_STATE;
2374     $self->{s_kwd} = '';
2375     }
2376 wakaba 1.1 ## reconsume
2377    
2378     return ($self->{ct}); # comment
2379     redo A;
2380     } else {
2381    
2382     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2383     $self->{read_until}->($self->{ct}->{data},
2384     q[>],
2385     length $self->{ct}->{data});
2386    
2387     ## Stay in the state.
2388    
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399     redo A;
2400     }
2401     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402 wakaba 1.14 ## XML5: "Markup declaration state".
2403 wakaba 1.1
2404     if ($self->{nc} == 0x002D) { # -
2405    
2406     $self->{state} = MD_HYPHEN_STATE;
2407    
2408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409     $self->{line_prev} = $self->{line};
2410     $self->{column_prev} = $self->{column};
2411     $self->{column}++;
2412     $self->{nc}
2413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414     } else {
2415     $self->{set_nc}->($self);
2416     }
2417    
2418     redo A;
2419     } elsif ($self->{nc} == 0x0044 or # D
2420     $self->{nc} == 0x0064) { # d
2421     ## ASCII case-insensitive.
2422    
2423     $self->{state} = MD_DOCTYPE_STATE;
2424 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2425 wakaba 1.1
2426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427     $self->{line_prev} = $self->{line};
2428     $self->{column_prev} = $self->{column};
2429     $self->{column}++;
2430     $self->{nc}
2431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432     } else {
2433     $self->{set_nc}->($self);
2434     }
2435    
2436     redo A;
2437 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2438     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2439     $self->{is_xml}) and
2440 wakaba 1.1 $self->{nc} == 0x005B) { # [
2441    
2442     $self->{state} = MD_CDATA_STATE;
2443 wakaba 1.12 $self->{kwd} = '[';
2444 wakaba 1.1
2445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446     $self->{line_prev} = $self->{line};
2447     $self->{column_prev} = $self->{column};
2448     $self->{column}++;
2449     $self->{nc}
2450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2451     } else {
2452     $self->{set_nc}->($self);
2453     }
2454    
2455     redo A;
2456     } else {
2457    
2458     }
2459    
2460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461     line => $self->{line_prev},
2462     column => $self->{column_prev} - 1);
2463     ## Reconsume.
2464     $self->{state} = BOGUS_COMMENT_STATE;
2465     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2466     line => $self->{line_prev},
2467     column => $self->{column_prev} - 1,
2468     };
2469     redo A;
2470     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2471     if ($self->{nc} == 0x002D) { # -
2472    
2473     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2474     line => $self->{line_prev},
2475     column => $self->{column_prev} - 2,
2476     };
2477 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478 wakaba 1.1
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     } else {
2491    
2492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2493     line => $self->{line_prev},
2494     column => $self->{column_prev} - 2);
2495     $self->{state} = BOGUS_COMMENT_STATE;
2496     ## Reconsume.
2497     $self->{ct} = {type => COMMENT_TOKEN,
2498     data => '-',
2499     line => $self->{line_prev},
2500     column => $self->{column_prev} - 2,
2501     };
2502     redo A;
2503     }
2504     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2505     ## ASCII case-insensitive.
2506     if ($self->{nc} == [
2507     undef,
2508     0x004F, # O
2509     0x0043, # C
2510     0x0054, # T
2511     0x0059, # Y
2512     0x0050, # P
2513 wakaba 1.12 ]->[length $self->{kwd}] or
2514 wakaba 1.1 $self->{nc} == [
2515     undef,
2516     0x006F, # o
2517     0x0063, # c
2518     0x0074, # t
2519     0x0079, # y
2520     0x0070, # p
2521 wakaba 1.12 ]->[length $self->{kwd}]) {
2522 wakaba 1.1
2523     ## Stay in the state.
2524 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2525 wakaba 1.1
2526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527     $self->{line_prev} = $self->{line};
2528     $self->{column_prev} = $self->{column};
2529     $self->{column}++;
2530     $self->{nc}
2531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532     } else {
2533     $self->{set_nc}->($self);
2534     }
2535    
2536     redo A;
2537 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2538 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2539     $self->{nc} == 0x0065)) { # e
2540 wakaba 1.12 if ($self->{is_xml} and
2541     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542 wakaba 1.10
2543     ## XML5: case-sensitive.
2544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545     text => 'DOCTYPE',
2546     line => $self->{line_prev},
2547     column => $self->{column_prev} - 5);
2548     } else {
2549    
2550     }
2551 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2552     $self->{ct} = {type => DOCTYPE_TOKEN,
2553     quirks => 1,
2554     line => $self->{line_prev},
2555     column => $self->{column_prev} - 7,
2556     };
2557    
2558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559     $self->{line_prev} = $self->{line};
2560     $self->{column_prev} = $self->{column};
2561     $self->{column}++;
2562     $self->{nc}
2563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564     } else {
2565     $self->{set_nc}->($self);
2566     }
2567    
2568     redo A;
2569     } else {
2570    
2571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572     line => $self->{line_prev},
2573 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2574 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2575     ## Reconsume.
2576     $self->{ct} = {type => COMMENT_TOKEN,
2577 wakaba 1.12 data => $self->{kwd},
2578 wakaba 1.1 line => $self->{line_prev},
2579 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2580 wakaba 1.1 };
2581     redo A;
2582     }
2583     } elsif ($self->{state} == MD_CDATA_STATE) {
2584     if ($self->{nc} == {
2585     '[' => 0x0043, # C
2586     '[C' => 0x0044, # D
2587     '[CD' => 0x0041, # A
2588     '[CDA' => 0x0054, # T
2589     '[CDAT' => 0x0041, # A
2590 wakaba 1.12 }->{$self->{kwd}}) {
2591 wakaba 1.1
2592     ## Stay in the state.
2593 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2594 wakaba 1.1
2595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596     $self->{line_prev} = $self->{line};
2597     $self->{column_prev} = $self->{column};
2598     $self->{column}++;
2599     $self->{nc}
2600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601     } else {
2602     $self->{set_nc}->($self);
2603     }
2604    
2605     redo A;
2606 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2607 wakaba 1.1 $self->{nc} == 0x005B) { # [
2608 wakaba 1.6 if ($self->{is_xml} and
2609     not $self->{tainted} and
2610     @{$self->{open_elements} or []} == 0) {
2611 wakaba 1.8
2612 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613     line => $self->{line_prev},
2614     column => $self->{column_prev} - 7);
2615     $self->{tainted} = 1;
2616 wakaba 1.8 } else {
2617    
2618 wakaba 1.6 }
2619    
2620 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2621     data => '',
2622     line => $self->{line_prev},
2623     column => $self->{column_prev} - 7};
2624     $self->{state} = CDATA_SECTION_STATE;
2625    
2626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2627     $self->{line_prev} = $self->{line};
2628     $self->{column_prev} = $self->{column};
2629     $self->{column}++;
2630     $self->{nc}
2631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2632     } else {
2633     $self->{set_nc}->($self);
2634     }
2635    
2636     redo A;
2637     } else {
2638    
2639     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640     line => $self->{line_prev},
2641 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2642 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2643     ## Reconsume.
2644     $self->{ct} = {type => COMMENT_TOKEN,
2645 wakaba 1.12 data => $self->{kwd},
2646 wakaba 1.1 line => $self->{line_prev},
2647 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2648 wakaba 1.1 };
2649     redo A;
2650     }
2651     } elsif ($self->{state} == COMMENT_START_STATE) {
2652     if ($self->{nc} == 0x002D) { # -
2653    
2654     $self->{state} = COMMENT_START_DASH_STATE;
2655    
2656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657     $self->{line_prev} = $self->{line};
2658     $self->{column_prev} = $self->{column};
2659     $self->{column}++;
2660     $self->{nc}
2661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662     } else {
2663     $self->{set_nc}->($self);
2664     }
2665    
2666     redo A;
2667     } elsif ($self->{nc} == 0x003E) { # >
2668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669 wakaba 1.13 if ($self->{in_subset}) {
2670    
2671     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672     } else {
2673    
2674     $self->{state} = DATA_STATE;
2675     $self->{s_kwd} = '';
2676     }
2677 wakaba 1.1
2678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679     $self->{line_prev} = $self->{line};
2680     $self->{column_prev} = $self->{column};
2681     $self->{column}++;
2682     $self->{nc}
2683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684     } else {
2685     $self->{set_nc}->($self);
2686     }
2687    
2688    
2689     return ($self->{ct}); # comment
2690    
2691     redo A;
2692     } elsif ($self->{nc} == -1) {
2693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694 wakaba 1.13 if ($self->{in_subset}) {
2695    
2696     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697     } else {
2698    
2699     $self->{state} = DATA_STATE;
2700     $self->{s_kwd} = '';
2701     }
2702 wakaba 1.1 ## reconsume
2703    
2704     return ($self->{ct}); # comment
2705    
2706     redo A;
2707     } else {
2708    
2709     $self->{ct}->{data} # comment
2710     .= chr ($self->{nc});
2711     $self->{state} = COMMENT_STATE;
2712    
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723     redo A;
2724     }
2725     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726     if ($self->{nc} == 0x002D) { # -
2727    
2728     $self->{state} = COMMENT_END_STATE;
2729    
2730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731     $self->{line_prev} = $self->{line};
2732     $self->{column_prev} = $self->{column};
2733     $self->{column}++;
2734     $self->{nc}
2735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736     } else {
2737     $self->{set_nc}->($self);
2738     }
2739    
2740     redo A;
2741     } elsif ($self->{nc} == 0x003E) { # >
2742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 wakaba 1.13 if ($self->{in_subset}) {
2744    
2745     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746     } else {
2747    
2748     $self->{state} = DATA_STATE;
2749     $self->{s_kwd} = '';
2750     }
2751 wakaba 1.1
2752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753     $self->{line_prev} = $self->{line};
2754     $self->{column_prev} = $self->{column};
2755     $self->{column}++;
2756     $self->{nc}
2757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758     } else {
2759     $self->{set_nc}->($self);
2760     }
2761    
2762    
2763     return ($self->{ct}); # comment
2764    
2765     redo A;
2766     } elsif ($self->{nc} == -1) {
2767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 wakaba 1.13 if ($self->{in_subset}) {
2769    
2770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771     } else {
2772    
2773     $self->{state} = DATA_STATE;
2774     $self->{s_kwd} = '';
2775     }
2776 wakaba 1.1 ## reconsume
2777    
2778     return ($self->{ct}); # comment
2779    
2780     redo A;
2781     } else {
2782    
2783     $self->{ct}->{data} # comment
2784     .= '-' . chr ($self->{nc});
2785     $self->{state} = COMMENT_STATE;
2786    
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797     redo A;
2798     }
2799     } elsif ($self->{state} == COMMENT_STATE) {
2800 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2801    
2802 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2803    
2804     $self->{state} = COMMENT_END_DASH_STATE;
2805    
2806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807     $self->{line_prev} = $self->{line};
2808     $self->{column_prev} = $self->{column};
2809     $self->{column}++;
2810     $self->{nc}
2811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812     } else {
2813     $self->{set_nc}->($self);
2814     }
2815    
2816     redo A;
2817     } elsif ($self->{nc} == -1) {
2818     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 wakaba 1.13 if ($self->{in_subset}) {
2820    
2821     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822     } else {
2823    
2824     $self->{state} = DATA_STATE;
2825     $self->{s_kwd} = '';
2826     }
2827 wakaba 1.1 ## reconsume
2828    
2829     return ($self->{ct}); # comment
2830    
2831     redo A;
2832     } else {
2833    
2834     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835     $self->{read_until}->($self->{ct}->{data},
2836     q[-],
2837     length $self->{ct}->{data});
2838    
2839     ## Stay in the state
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     }
2853     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855 wakaba 1.10
2856 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2857    
2858     $self->{state} = COMMENT_END_STATE;
2859    
2860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861     $self->{line_prev} = $self->{line};
2862     $self->{column_prev} = $self->{column};
2863     $self->{column}++;
2864     $self->{nc}
2865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866     } else {
2867     $self->{set_nc}->($self);
2868     }
2869    
2870     redo A;
2871     } elsif ($self->{nc} == -1) {
2872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 wakaba 1.13 if ($self->{in_subset}) {
2874    
2875     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876     } else {
2877    
2878     $self->{state} = DATA_STATE;
2879     $self->{s_kwd} = '';
2880     }
2881 wakaba 1.1 ## reconsume
2882    
2883     return ($self->{ct}); # comment
2884    
2885     redo A;
2886     } else {
2887    
2888     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889     $self->{state} = COMMENT_STATE;
2890    
2891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892     $self->{line_prev} = $self->{line};
2893     $self->{column_prev} = $self->{column};
2894     $self->{column}++;
2895     $self->{nc}
2896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897     } else {
2898     $self->{set_nc}->($self);
2899     }
2900    
2901     redo A;
2902     }
2903     } elsif ($self->{state} == COMMENT_END_STATE) {
2904 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905    
2906 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2907 wakaba 1.13 if ($self->{in_subset}) {
2908    
2909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910     } else {
2911    
2912     $self->{state} = DATA_STATE;
2913     $self->{s_kwd} = '';
2914     }
2915 wakaba 1.1
2916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917     $self->{line_prev} = $self->{line};
2918     $self->{column_prev} = $self->{column};
2919     $self->{column}++;
2920     $self->{nc}
2921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922     } else {
2923     $self->{set_nc}->($self);
2924     }
2925    
2926    
2927     return ($self->{ct}); # comment
2928    
2929     redo A;
2930     } elsif ($self->{nc} == 0x002D) { # -
2931    
2932 wakaba 1.10 ## XML5: Not a parse error.
2933 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934     line => $self->{line_prev},
2935     column => $self->{column_prev});
2936     $self->{ct}->{data} .= '-'; # comment
2937     ## Stay in the state
2938    
2939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940     $self->{line_prev} = $self->{line};
2941     $self->{column_prev} = $self->{column};
2942     $self->{column}++;
2943     $self->{nc}
2944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945     } else {
2946     $self->{set_nc}->($self);
2947     }
2948    
2949     redo A;
2950     } elsif ($self->{nc} == -1) {
2951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 wakaba 1.13 if ($self->{in_subset}) {
2953    
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     } else {
2956    
2957     $self->{state} = DATA_STATE;
2958     $self->{s_kwd} = '';
2959     }
2960 wakaba 1.1 ## reconsume
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } else {
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972     $self->{state} = COMMENT_STATE;
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     }
2986     } elsif ($self->{state} == DOCTYPE_STATE) {
2987     if ($is_space->{$self->{nc}}) {
2988    
2989     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990    
2991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992     $self->{line_prev} = $self->{line};
2993     $self->{column_prev} = $self->{column};
2994     $self->{column}++;
2995     $self->{nc}
2996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997     } else {
2998     $self->{set_nc}->($self);
2999     }
3000    
3001     redo A;
3002     } else {
3003    
3004 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3005 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007     ## reconsume
3008     redo A;
3009     }
3010     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3012    
3013 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3014    
3015     ## Stay in the state
3016    
3017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018     $self->{line_prev} = $self->{line};
3019     $self->{column_prev} = $self->{column};
3020     $self->{column}++;
3021     $self->{nc}
3022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023     } else {
3024     $self->{set_nc}->($self);
3025     }
3026    
3027     redo A;
3028     } elsif ($self->{nc} == 0x003E) { # >
3029    
3030 wakaba 1.12 ## XML5: No parse error.
3031 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032     $self->{state} = DATA_STATE;
3033 wakaba 1.5 $self->{s_kwd} = '';
3034 wakaba 1.1
3035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036     $self->{line_prev} = $self->{line};
3037     $self->{column_prev} = $self->{column};
3038     $self->{column}++;
3039     $self->{nc}
3040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041     } else {
3042     $self->{set_nc}->($self);
3043     }
3044    
3045    
3046     return ($self->{ct}); # DOCTYPE (quirks)
3047    
3048     redo A;
3049     } elsif ($self->{nc} == -1) {
3050    
3051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052     $self->{state} = DATA_STATE;
3053 wakaba 1.5 $self->{s_kwd} = '';
3054 wakaba 1.1 ## reconsume
3055    
3056     return ($self->{ct}); # DOCTYPE (quirks)
3057    
3058     redo A;
3059 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060    
3061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064     $self->{in_subset} = 1;
3065 wakaba 1.12
3066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067     $self->{line_prev} = $self->{line};
3068     $self->{column_prev} = $self->{column};
3069     $self->{column}++;
3070     $self->{nc}
3071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072     } else {
3073     $self->{set_nc}->($self);
3074     }
3075    
3076 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3077 wakaba 1.12 redo A;
3078 wakaba 1.1 } else {
3079    
3080     $self->{ct}->{name} = chr $self->{nc};
3081     delete $self->{ct}->{quirks};
3082     $self->{state} = DOCTYPE_NAME_STATE;
3083    
3084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085     $self->{line_prev} = $self->{line};
3086     $self->{column_prev} = $self->{column};
3087     $self->{column}++;
3088     $self->{nc}
3089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090     } else {
3091     $self->{set_nc}->($self);
3092     }
3093    
3094     redo A;
3095     }
3096     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3098    
3099     ## ISSUE: Redundant "First," in the spec.
3100    
3101 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3102    
3103     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == 0x003E) { # >
3117    
3118     $self->{state} = DATA_STATE;
3119 wakaba 1.5 $self->{s_kwd} = '';
3120 wakaba 1.1
3121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122     $self->{line_prev} = $self->{line};
3123     $self->{column_prev} = $self->{column};
3124     $self->{column}++;
3125     $self->{nc}
3126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127     } else {
3128     $self->{set_nc}->($self);
3129     }
3130    
3131    
3132     return ($self->{ct}); # DOCTYPE
3133    
3134     redo A;
3135     } elsif ($self->{nc} == -1) {
3136    
3137     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138     $self->{state} = DATA_STATE;
3139 wakaba 1.5 $self->{s_kwd} = '';
3140 wakaba 1.1 ## reconsume
3141    
3142     $self->{ct}->{quirks} = 1;
3143     return ($self->{ct}); # DOCTYPE
3144    
3145     redo A;
3146 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147    
3148     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150     $self->{in_subset} = 1;
3151 wakaba 1.12
3152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153     $self->{line_prev} = $self->{line};
3154     $self->{column_prev} = $self->{column};
3155     $self->{column}++;
3156     $self->{nc}
3157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158     } else {
3159     $self->{set_nc}->($self);
3160     }
3161    
3162 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3163 wakaba 1.12 redo A;
3164 wakaba 1.1 } else {
3165    
3166     $self->{ct}->{name}
3167     .= chr ($self->{nc}); # DOCTYPE
3168     ## Stay in the state
3169    
3170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171     $self->{line_prev} = $self->{line};
3172     $self->{column_prev} = $self->{column};
3173     $self->{column}++;
3174     $self->{nc}
3175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176     } else {
3177     $self->{set_nc}->($self);
3178     }
3179    
3180     redo A;
3181     }
3182     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184     ## state", but implemented differently.
3185    
3186 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3187    
3188     ## Stay in the state
3189    
3190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191     $self->{line_prev} = $self->{line};
3192     $self->{column_prev} = $self->{column};
3193     $self->{column}++;
3194     $self->{nc}
3195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196     } else {
3197     $self->{set_nc}->($self);
3198     }
3199    
3200     redo A;
3201     } elsif ($self->{nc} == 0x003E) { # >
3202 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203    
3204     $self->{state} = DATA_STATE;
3205     $self->{s_kwd} = '';
3206     } else {
3207    
3208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210     }
3211 wakaba 1.1
3212    
3213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214     $self->{line_prev} = $self->{line};
3215     $self->{column_prev} = $self->{column};
3216     $self->{column}++;
3217     $self->{nc}
3218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219     } else {
3220     $self->{set_nc}->($self);
3221     }
3222    
3223 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 wakaba 1.1 redo A;
3225     } elsif ($self->{nc} == -1) {
3226 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227    
3228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229     $self->{state} = DATA_STATE;
3230     $self->{s_kwd} = '';
3231     $self->{ct}->{quirks} = 1;
3232     } else {
3233    
3234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236     }
3237 wakaba 1.1
3238 wakaba 1.16 ## Reconsume.
3239     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 wakaba 1.1 redo A;
3241     } elsif ($self->{nc} == 0x0050 or # P
3242     $self->{nc} == 0x0070) { # p
3243 wakaba 1.12
3244 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3245 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3246 wakaba 1.1
3247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248     $self->{line_prev} = $self->{line};
3249     $self->{column_prev} = $self->{column};
3250     $self->{column}++;
3251     $self->{nc}
3252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253     } else {
3254     $self->{set_nc}->($self);
3255     }
3256    
3257     redo A;
3258     } elsif ($self->{nc} == 0x0053 or # S
3259     $self->{nc} == 0x0073) { # s
3260 wakaba 1.12
3261 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3262 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3263    
3264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265     $self->{line_prev} = $self->{line};
3266     $self->{column_prev} = $self->{column};
3267     $self->{column}++;
3268     $self->{nc}
3269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270     } else {
3271     $self->{set_nc}->($self);
3272     }
3273    
3274     redo A;
3275 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3276     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278    
3279     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280     $self->{ct}->{value} = ''; # ENTITY
3281    
3282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283     $self->{line_prev} = $self->{line};
3284     $self->{column_prev} = $self->{column};
3285     $self->{column}++;
3286     $self->{nc}
3287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288     } else {
3289     $self->{set_nc}->($self);
3290     }
3291    
3292     redo A;
3293     } elsif ($self->{nc} == 0x0027 and # '
3294     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296    
3297     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298     $self->{ct}->{value} = ''; # ENTITY
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310     redo A;
3311 wakaba 1.16 } elsif ($self->{is_xml} and
3312     $self->{ct}->{type} == DOCTYPE_TOKEN and
3313     $self->{nc} == 0x005B) { # [
3314 wakaba 1.12
3315     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 wakaba 1.13 $self->{in_subset} = 1;
3318 wakaba 1.1
3319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320     $self->{line_prev} = $self->{line};
3321     $self->{column_prev} = $self->{column};
3322     $self->{column}++;
3323     $self->{nc}
3324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325     } else {
3326     $self->{set_nc}->($self);
3327     }
3328    
3329 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3330 wakaba 1.1 redo A;
3331     } else {
3332 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333    
3334     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335    
3336     $self->{ct}->{quirks} = 1;
3337     $self->{state} = BOGUS_DOCTYPE_STATE;
3338     } else {
3339    
3340     $self->{state} = BOGUS_MD_STATE;
3341     }
3342 wakaba 1.1
3343    
3344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345     $self->{line_prev} = $self->{line};
3346     $self->{column_prev} = $self->{column};
3347     $self->{column}++;
3348     $self->{nc}
3349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350     } else {
3351     $self->{set_nc}->($self);
3352     }
3353    
3354     redo A;
3355     }
3356     } elsif ($self->{state} == PUBLIC_STATE) {
3357     ## ASCII case-insensitive
3358     if ($self->{nc} == [
3359     undef,
3360     0x0055, # U
3361     0x0042, # B
3362     0x004C, # L
3363     0x0049, # I
3364 wakaba 1.12 ]->[length $self->{kwd}] or
3365 wakaba 1.1 $self->{nc} == [
3366     undef,
3367     0x0075, # u
3368     0x0062, # b
3369     0x006C, # l
3370     0x0069, # i
3371 wakaba 1.12 ]->[length $self->{kwd}]) {
3372 wakaba 1.1
3373     ## Stay in the state.
3374 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3375 wakaba 1.1
3376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377     $self->{line_prev} = $self->{line};
3378     $self->{column_prev} = $self->{column};
3379     $self->{column}++;
3380     $self->{nc}
3381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382     } else {
3383     $self->{set_nc}->($self);
3384     }
3385    
3386     redo A;
3387 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3388 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3389     $self->{nc} == 0x0063)) { # c
3390 wakaba 1.12 if ($self->{is_xml} and
3391     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392    
3393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394     text => 'PUBLIC',
3395     line => $self->{line_prev},
3396     column => $self->{column_prev} - 4);
3397     } else {
3398    
3399     }
3400 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401    
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412     redo A;
3413     } else {
3414 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 wakaba 1.1 line => $self->{line_prev},
3416 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418    
3419     $self->{ct}->{quirks} = 1;
3420     $self->{state} = BOGUS_DOCTYPE_STATE;
3421     } else {
3422    
3423     $self->{state} = BOGUS_MD_STATE;
3424     }
3425 wakaba 1.1 ## Reconsume.
3426     redo A;
3427     }
3428     } elsif ($self->{state} == SYSTEM_STATE) {
3429     ## ASCII case-insensitive
3430     if ($self->{nc} == [
3431     undef,
3432     0x0059, # Y
3433     0x0053, # S
3434     0x0054, # T
3435     0x0045, # E
3436 wakaba 1.12 ]->[length $self->{kwd}] or
3437 wakaba 1.1 $self->{nc} == [
3438     undef,
3439     0x0079, # y
3440     0x0073, # s
3441     0x0074, # t
3442     0x0065, # e
3443 wakaba 1.12 ]->[length $self->{kwd}]) {
3444 wakaba 1.1
3445     ## Stay in the state.
3446 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3447 wakaba 1.1
3448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449     $self->{line_prev} = $self->{line};
3450     $self->{column_prev} = $self->{column};
3451     $self->{column}++;
3452     $self->{nc}
3453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454     } else {
3455     $self->{set_nc}->($self);
3456     }
3457    
3458     redo A;
3459 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3460 wakaba 1.1 ($self->{nc} == 0x004D or # M
3461     $self->{nc} == 0x006D)) { # m
3462 wakaba 1.12 if ($self->{is_xml} and
3463     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464    
3465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466     text => 'SYSTEM',
3467     line => $self->{line_prev},
3468     column => $self->{column_prev} - 4);
3469     } else {
3470    
3471     }
3472 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473    
3474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475     $self->{line_prev} = $self->{line};
3476     $self->{column_prev} = $self->{column};
3477     $self->{column}++;
3478     $self->{nc}
3479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480     } else {
3481     $self->{set_nc}->($self);
3482     }
3483    
3484     redo A;
3485     } else {
3486 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 wakaba 1.1 line => $self->{line_prev},
3488 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490    
3491     $self->{ct}->{quirks} = 1;
3492     $self->{state} = BOGUS_DOCTYPE_STATE;
3493     } else {
3494    
3495     $self->{state} = BOGUS_MD_STATE;
3496     }
3497 wakaba 1.1 ## Reconsume.
3498     redo A;
3499     }
3500     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501     if ($is_space->{$self->{nc}}) {
3502    
3503     ## Stay in the state
3504    
3505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506     $self->{line_prev} = $self->{line};
3507     $self->{column_prev} = $self->{column};
3508     $self->{column}++;
3509     $self->{nc}
3510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511     } else {
3512     $self->{set_nc}->($self);
3513     }
3514    
3515     redo A;
3516     } elsif ($self->{nc} eq 0x0022) { # "
3517    
3518     $self->{ct}->{pubid} = ''; # DOCTYPE
3519     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520    
3521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522     $self->{line_prev} = $self->{line};
3523     $self->{column_prev} = $self->{column};
3524     $self->{column}++;
3525     $self->{nc}
3526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527     } else {
3528     $self->{set_nc}->($self);
3529     }
3530    
3531     redo A;
3532     } elsif ($self->{nc} eq 0x0027) { # '
3533    
3534     $self->{ct}->{pubid} = ''; # DOCTYPE
3535     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536    
3537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538     $self->{line_prev} = $self->{line};
3539     $self->{column_prev} = $self->{column};
3540     $self->{column}++;
3541     $self->{nc}
3542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543     } else {
3544     $self->{set_nc}->($self);
3545     }
3546    
3547     redo A;
3548     } elsif ($self->{nc} eq 0x003E) { # >
3549 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550    
3551     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552    
3553     $self->{state} = DATA_STATE;
3554     $self->{s_kwd} = '';
3555     $self->{ct}->{quirks} = 1;
3556     } else {
3557    
3558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559     }
3560 wakaba 1.1
3561    
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 wakaba 1.1 redo A;
3574     } elsif ($self->{nc} == -1) {
3575 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576    
3577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578     $self->{state} = DATA_STATE;
3579     $self->{s_kwd} = '';
3580     $self->{ct}->{quirks} = 1;
3581     } else {
3582    
3583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585     }
3586 wakaba 1.1
3587     ## reconsume
3588     return ($self->{ct}); # DOCTYPE
3589     redo A;
3590 wakaba 1.16 } elsif ($self->{is_xml} and
3591     $self->{ct}->{type} == DOCTYPE_TOKEN and
3592     $self->{nc} == 0x005B) { # [
3593 wakaba 1.12
3594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 wakaba 1.13 $self->{in_subset} = 1;
3598 wakaba 1.12
3599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600     $self->{line_prev} = $self->{line};
3601     $self->{column_prev} = $self->{column};
3602     $self->{column}++;
3603     $self->{nc}
3604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605     } else {
3606     $self->{set_nc}->($self);
3607     }
3608    
3609 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3610 wakaba 1.12 redo A;
3611 wakaba 1.1 } else {
3612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613    
3614 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615    
3616     $self->{ct}->{quirks} = 1;
3617     $self->{state} = BOGUS_DOCTYPE_STATE;
3618     } else {
3619    
3620     $self->{state} = BOGUS_MD_STATE;
3621     }
3622    
3623 wakaba 1.1
3624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625     $self->{line_prev} = $self->{line};
3626     $self->{column_prev} = $self->{column};
3627     $self->{column}++;
3628     $self->{nc}
3629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630     } else {
3631     $self->{set_nc}->($self);
3632     }
3633    
3634     redo A;
3635     }
3636     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637     if ($self->{nc} == 0x0022) { # "
3638    
3639     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640    
3641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642     $self->{line_prev} = $self->{line};
3643     $self->{column_prev} = $self->{column};
3644     $self->{column}++;
3645     $self->{nc}
3646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647     } else {
3648     $self->{set_nc}->($self);
3649     }
3650    
3651     redo A;
3652     } elsif ($self->{nc} == 0x003E) { # >
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654    
3655 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656    
3657     $self->{state} = DATA_STATE;
3658     $self->{s_kwd} = '';
3659     $self->{ct}->{quirks} = 1;
3660     } else {
3661    
3662     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663     }
3664    
3665 wakaba 1.1
3666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667     $self->{line_prev} = $self->{line};
3668     $self->{column_prev} = $self->{column};
3669     $self->{column}++;
3670     $self->{nc}
3671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672     } else {
3673     $self->{set_nc}->($self);
3674     }
3675    
3676 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 wakaba 1.1 redo A;
3678     } elsif ($self->{nc} == -1) {
3679     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680    
3681 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682    
3683     $self->{state} = DATA_STATE;
3684     $self->{s_kwd} = '';
3685     $self->{ct}->{quirks} = 1;
3686     } else {
3687    
3688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689     }
3690    
3691     ## Reconsume.
3692 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3693     redo A;
3694     } else {
3695    
3696 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698     length $self->{ct}->{pubid});
3699    
3700     ## Stay in the state
3701    
3702     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703     $self->{line_prev} = $self->{line};
3704     $self->{column_prev} = $self->{column};
3705     $self->{column}++;
3706     $self->{nc}
3707     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708     } else {
3709     $self->{set_nc}->($self);
3710     }
3711    
3712     redo A;
3713     }
3714     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715     if ($self->{nc} == 0x0027) { # '
3716    
3717     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718    
3719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720     $self->{line_prev} = $self->{line};
3721     $self->{column_prev} = $self->{column};
3722     $self->{column}++;
3723     $self->{nc}
3724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725     } else {
3726     $self->{set_nc}->($self);
3727     }
3728    
3729     redo A;
3730     } elsif ($self->{nc} == 0x003E) { # >
3731     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732    
3733 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734    
3735     $self->{state} = DATA_STATE;
3736     $self->{s_kwd} = '';
3737     $self->{ct}->{quirks} = 1;
3738     } else {
3739    
3740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741     }
3742    
3743 wakaba 1.1
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 wakaba 1.1 redo A;
3756     } elsif ($self->{nc} == -1) {
3757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758    
3759 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760    
3761     $self->{state} = DATA_STATE;
3762     $self->{s_kwd} = '';
3763     $self->{ct}->{quirks} = 1;
3764     } else {
3765    
3766     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767     }
3768    
3769 wakaba 1.1 ## reconsume
3770 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 wakaba 1.1 redo A;
3772     } else {
3773    
3774 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776     length $self->{ct}->{pubid});
3777    
3778     ## Stay in the state
3779    
3780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781     $self->{line_prev} = $self->{line};
3782     $self->{column_prev} = $self->{column};
3783     $self->{column}++;
3784     $self->{nc}
3785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786     } else {
3787     $self->{set_nc}->($self);
3788     }
3789    
3790     redo A;
3791     }
3792     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793     if ($is_space->{$self->{nc}}) {
3794    
3795     ## Stay in the state
3796    
3797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798     $self->{line_prev} = $self->{line};
3799     $self->{column_prev} = $self->{column};
3800     $self->{column}++;
3801     $self->{nc}
3802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803     } else {
3804     $self->{set_nc}->($self);
3805     }
3806    
3807     redo A;
3808     } elsif ($self->{nc} == 0x0022) { # "
3809    
3810 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812    
3813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814     $self->{line_prev} = $self->{line};
3815     $self->{column_prev} = $self->{column};
3816     $self->{column}++;
3817     $self->{nc}
3818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819     } else {
3820     $self->{set_nc}->($self);
3821     }
3822    
3823     redo A;
3824     } elsif ($self->{nc} == 0x0027) { # '
3825    
3826 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828    
3829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830     $self->{line_prev} = $self->{line};
3831     $self->{column_prev} = $self->{column};
3832     $self->{column}++;
3833     $self->{nc}
3834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835     } else {
3836     $self->{set_nc}->($self);
3837     }
3838    
3839     redo A;
3840     } elsif ($self->{nc} == 0x003E) { # >
3841 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842     if ($self->{is_xml}) {
3843    
3844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845     } else {
3846    
3847     }
3848     $self->{state} = DATA_STATE;
3849     $self->{s_kwd} = '';
3850 wakaba 1.12 } else {
3851 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852    
3853     } else {
3854    
3855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856     }
3857     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 wakaba 1.12 }
3859 wakaba 1.16
3860 wakaba 1.1
3861     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862     $self->{line_prev} = $self->{line};
3863     $self->{column_prev} = $self->{column};
3864     $self->{column}++;
3865     $self->{nc}
3866     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867     } else {
3868     $self->{set_nc}->($self);
3869     }
3870    
3871 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 wakaba 1.1 redo A;
3873     } elsif ($self->{nc} == -1) {
3874 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875    
3876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877    
3878     $self->{state} = DATA_STATE;
3879     $self->{s_kwd} = '';
3880     $self->{ct}->{quirks} = 1;
3881     } else {
3882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884     }
3885 wakaba 1.1
3886     ## reconsume
3887 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 wakaba 1.1 redo A;
3889 wakaba 1.16 } elsif ($self->{is_xml} and
3890     $self->{ct}->{type} == DOCTYPE_TOKEN and
3891     $self->{nc} == 0x005B) { # [
3892 wakaba 1.12
3893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 wakaba 1.13 $self->{in_subset} = 1;
3897 wakaba 1.12
3898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899     $self->{line_prev} = $self->{line};
3900     $self->{column_prev} = $self->{column};
3901     $self->{column}++;
3902     $self->{nc}
3903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904     } else {
3905     $self->{set_nc}->($self);
3906     }
3907    
3908 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3909 wakaba 1.12 redo A;
3910 wakaba 1.1 } else {
3911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912    
3913 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914    
3915     $self->{ct}->{quirks} = 1;
3916     $self->{state} = BOGUS_DOCTYPE_STATE;
3917     } else {
3918    
3919     $self->{state} = BOGUS_MD_STATE;
3920     }
3921    
3922 wakaba 1.1
3923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924     $self->{line_prev} = $self->{line};
3925     $self->{column_prev} = $self->{column};
3926     $self->{column}++;
3927     $self->{nc}
3928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929     } else {
3930     $self->{set_nc}->($self);
3931     }
3932    
3933     redo A;
3934     }
3935     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936     if ($is_space->{$self->{nc}}) {
3937    
3938     ## Stay in the state
3939    
3940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941     $self->{line_prev} = $self->{line};
3942     $self->{column_prev} = $self->{column};
3943     $self->{column}++;
3944     $self->{nc}
3945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946     } else {
3947     $self->{set_nc}->($self);
3948     }
3949    
3950     redo A;
3951     } elsif ($self->{nc} == 0x0022) { # "
3952    
3953     $self->{ct}->{sysid} = ''; # DOCTYPE
3954     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955    
3956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957     $self->{line_prev} = $self->{line};
3958     $self->{column_prev} = $self->{column};
3959     $self->{column}++;
3960     $self->{nc}
3961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962     } else {
3963     $self->{set_nc}->($self);
3964     }
3965    
3966     redo A;
3967     } elsif ($self->{nc} == 0x0027) { # '
3968    
3969     $self->{ct}->{sysid} = ''; # DOCTYPE
3970     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971    
3972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973     $self->{line_prev} = $self->{line};
3974     $self->{column_prev} = $self->{column};
3975     $self->{column}++;
3976     $self->{nc}
3977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978     } else {
3979     $self->{set_nc}->($self);
3980     }
3981    
3982     redo A;
3983     } elsif ($self->{nc} == 0x003E) { # >
3984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
3986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987     $self->{line_prev} = $self->{line};
3988     $self->{column_prev} = $self->{column};
3989     $self->{column}++;
3990     $self->{nc}
3991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992     } else {
3993     $self->{set_nc}->($self);
3994     }
3995    
3996    
3997 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998    
3999     $self->{state} = DATA_STATE;
4000     $self->{s_kwd} = '';
4001     $self->{ct}->{quirks} = 1;
4002     } else {
4003    
4004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005     }
4006 wakaba 1.1
4007 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 wakaba 1.1 redo A;
4009     } elsif ($self->{nc} == -1) {
4010 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011    
4012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013     $self->{state} = DATA_STATE;
4014     $self->{s_kwd} = '';
4015     $self->{ct}->{quirks} = 1;
4016     } else {
4017    
4018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020     }
4021 wakaba 1.1
4022     ## reconsume
4023 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 wakaba 1.1 redo A;
4025 wakaba 1.16 } elsif ($self->{is_xml} and
4026     $self->{ct}->{type} == DOCTYPE_TOKEN and
4027     $self->{nc} == 0x005B) { # [
4028 wakaba 1.12
4029     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030    
4031     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 wakaba 1.13 $self->{in_subset} = 1;
4034 wakaba 1.12
4035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036     $self->{line_prev} = $self->{line};
4037     $self->{column_prev} = $self->{column};
4038     $self->{column}++;
4039     $self->{nc}
4040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041     } else {
4042     $self->{set_nc}->($self);
4043     }
4044    
4045 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4046 wakaba 1.12 redo A;
4047 wakaba 1.1 } else {
4048     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049    
4050 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051    
4052     $self->{ct}->{quirks} = 1;
4053     $self->{state} = BOGUS_DOCTYPE_STATE;
4054     } else {
4055    
4056     $self->{state} = BOGUS_MD_STATE;
4057     }
4058    
4059 wakaba 1.1
4060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061     $self->{line_prev} = $self->{line};
4062     $self->{column_prev} = $self->{column};
4063     $self->{column}++;
4064     $self->{nc}
4065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066     } else {
4067     $self->{set_nc}->($self);
4068     }
4069    
4070     redo A;
4071     }
4072     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073     if ($self->{nc} == 0x0022) { # "
4074    
4075     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076    
4077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078     $self->{line_prev} = $self->{line};
4079     $self->{column_prev} = $self->{column};
4080     $self->{column}++;
4081     $self->{nc}
4082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083     } else {
4084     $self->{set_nc}->($self);
4085     }
4086    
4087     redo A;
4088 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090    
4091 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092    
4093     $self->{state} = DATA_STATE;
4094     $self->{s_kwd} = '';
4095     $self->{ct}->{quirks} = 1;
4096     } else {
4097    
4098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099     }
4100    
4101 wakaba 1.1
4102     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103     $self->{line_prev} = $self->{line};
4104     $self->{column_prev} = $self->{column};
4105     $self->{column}++;
4106     $self->{nc}
4107     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108     } else {
4109     $self->{set_nc}->($self);
4110     }
4111    
4112 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 wakaba 1.1 redo A;
4114     } elsif ($self->{nc} == -1) {
4115     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116    
4117 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118    
4119     $self->{state} = DATA_STATE;
4120     $self->{s_kwd} = '';
4121     $self->{ct}->{quirks} = 1;
4122     } else {
4123    
4124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125     }
4126    
4127 wakaba 1.1 ## reconsume
4128 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 wakaba 1.1 redo A;
4130     } else {
4131    
4132 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134     length $self->{ct}->{sysid});
4135    
4136     ## Stay in the state
4137    
4138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139     $self->{line_prev} = $self->{line};
4140     $self->{column_prev} = $self->{column};
4141     $self->{column}++;
4142     $self->{nc}
4143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144     } else {
4145     $self->{set_nc}->($self);
4146     }
4147    
4148     redo A;
4149     }
4150     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151     if ($self->{nc} == 0x0027) { # '
4152    
4153     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154    
4155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156     $self->{line_prev} = $self->{line};
4157     $self->{column_prev} = $self->{column};
4158     $self->{column}++;
4159     $self->{nc}
4160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161     } else {
4162     $self->{set_nc}->($self);
4163     }
4164    
4165     redo A;
4166 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167 wakaba 1.1
4168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169    
4170     $self->{state} = DATA_STATE;
4171 wakaba 1.5 $self->{s_kwd} = '';
4172 wakaba 1.1
4173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174     $self->{line_prev} = $self->{line};
4175     $self->{column_prev} = $self->{column};
4176     $self->{column}++;
4177     $self->{nc}
4178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179     } else {
4180     $self->{set_nc}->($self);
4181     }
4182    
4183    
4184     $self->{ct}->{quirks} = 1;
4185     return ($self->{ct}); # DOCTYPE
4186    
4187     redo A;
4188     } elsif ($self->{nc} == -1) {
4189     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190    
4191 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192    
4193     $self->{state} = DATA_STATE;
4194     $self->{s_kwd} = '';
4195     $self->{ct}->{quirks} = 1;
4196     } else {
4197    
4198     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199     }
4200    
4201 wakaba 1.1 ## reconsume
4202 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 wakaba 1.1 redo A;
4204     } else {
4205    
4206 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208     length $self->{ct}->{sysid});
4209    
4210     ## Stay in the state
4211    
4212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213     $self->{line_prev} = $self->{line};
4214     $self->{column_prev} = $self->{column};
4215     $self->{column}++;
4216     $self->{nc}
4217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218     } else {
4219     $self->{set_nc}->($self);
4220     }
4221    
4222     redo A;
4223     }
4224     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225     if ($is_space->{$self->{nc}}) {
4226 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227    
4228     $self->{state} = BEFORE_NDATA_STATE;
4229     } else {
4230    
4231     ## Stay in the state
4232     }
4233 wakaba 1.1
4234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235     $self->{line_prev} = $self->{line};
4236     $self->{column_prev} = $self->{column};
4237     $self->{column}++;
4238     $self->{nc}
4239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240     } else {
4241     $self->{set_nc}->($self);
4242     }
4243    
4244     redo A;
4245     } elsif ($self->{nc} == 0x003E) { # >
4246 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247    
4248     $self->{state} = DATA_STATE;
4249     $self->{s_kwd} = '';
4250     } else {
4251    
4252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253     }
4254    
4255 wakaba 1.1
4256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257     $self->{line_prev} = $self->{line};
4258     $self->{column_prev} = $self->{column};
4259     $self->{column}++;
4260     $self->{nc}
4261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262     } else {
4263     $self->{set_nc}->($self);
4264     }
4265    
4266 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 wakaba 1.1 redo A;
4268 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269     ($self->{nc} == 0x004E or # N
4270     $self->{nc} == 0x006E)) { # n
4271    
4272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273     $self->{state} = NDATA_STATE;
4274     $self->{kwd} = chr $self->{nc};
4275    
4276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277     $self->{line_prev} = $self->{line};
4278     $self->{column_prev} = $self->{column};
4279     $self->{column}++;
4280     $self->{nc}
4281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282     } else {
4283     $self->{set_nc}->($self);
4284     }
4285    
4286     redo A;
4287 wakaba 1.1 } elsif ($self->{nc} == -1) {
4288 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289    
4290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291     $self->{state} = DATA_STATE;
4292     $self->{s_kwd} = '';
4293     $self->{ct}->{quirks} = 1;
4294     } else {
4295    
4296     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298     }
4299    
4300 wakaba 1.1 ## reconsume
4301 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 wakaba 1.1 redo A;
4303 wakaba 1.16 } elsif ($self->{is_xml} and
4304     $self->{ct}->{type} == DOCTYPE_TOKEN and
4305     $self->{nc} == 0x005B) { # [
4306 wakaba 1.12
4307     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 wakaba 1.13 $self->{in_subset} = 1;
4310 wakaba 1.12
4311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312     $self->{line_prev} = $self->{line};
4313     $self->{column_prev} = $self->{column};
4314     $self->{column}++;
4315     $self->{nc}
4316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317     } else {
4318     $self->{set_nc}->($self);
4319     }
4320    
4321 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4322 wakaba 1.12 redo A;
4323 wakaba 1.1 } else {
4324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325    
4326 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327    
4328     #$self->{ct}->{quirks} = 1;
4329     $self->{state} = BOGUS_DOCTYPE_STATE;
4330     } else {
4331    
4332     $self->{state} = BOGUS_MD_STATE;
4333     }
4334    
4335 wakaba 1.1
4336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337     $self->{line_prev} = $self->{line};
4338     $self->{column_prev} = $self->{column};
4339     $self->{column}++;
4340     $self->{nc}
4341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342     } else {
4343     $self->{set_nc}->($self);
4344     }
4345    
4346     redo A;
4347     }
4348 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349     if ($is_space->{$self->{nc}}) {
4350    
4351     ## Stay in the state.
4352    
4353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354     $self->{line_prev} = $self->{line};
4355     $self->{column_prev} = $self->{column};
4356     $self->{column}++;
4357     $self->{nc}
4358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359     } else {
4360     $self->{set_nc}->($self);
4361     }
4362    
4363     redo A;
4364     } elsif ($self->{nc} == 0x003E) { # >
4365    
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367    
4368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369     $self->{line_prev} = $self->{line};
4370     $self->{column_prev} = $self->{column};
4371     $self->{column}++;
4372     $self->{nc}
4373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374     } else {
4375     $self->{set_nc}->($self);
4376     }
4377    
4378     return ($self->{ct}); # ENTITY
4379     redo A;
4380     } elsif ($self->{nc} == 0x004E or # N
4381     $self->{nc} == 0x006E) { # n
4382    
4383     $self->{state} = NDATA_STATE;
4384     $self->{kwd} = chr $self->{nc};
4385    
4386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387     $self->{line_prev} = $self->{line};
4388     $self->{column_prev} = $self->{column};
4389     $self->{column}++;
4390     $self->{nc}
4391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392     } else {
4393     $self->{set_nc}->($self);
4394     }
4395    
4396     redo A;
4397     } elsif ($self->{nc} == -1) {
4398    
4399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401     ## reconsume
4402     return ($self->{ct}); # ENTITY
4403     redo A;
4404     } else {
4405    
4406     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407     $self->{state} = BOGUS_MD_STATE;
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     redo A;
4420     }
4421 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422     if ($self->{nc} == 0x003E) { # >
4423    
4424     $self->{state} = DATA_STATE;
4425 wakaba 1.5 $self->{s_kwd} = '';
4426 wakaba 1.1
4427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428     $self->{line_prev} = $self->{line};
4429     $self->{column_prev} = $self->{column};
4430     $self->{column}++;
4431     $self->{nc}
4432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433     } else {
4434     $self->{set_nc}->($self);
4435     }
4436    
4437    
4438     return ($self->{ct}); # DOCTYPE
4439    
4440     redo A;
4441 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442 wakaba 1.13
4443     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445     $self->{in_subset} = 1;
4446    
4447 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448     $self->{line_prev} = $self->{line};
4449     $self->{column_prev} = $self->{column};
4450     $self->{column}++;
4451     $self->{nc}
4452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453     } else {
4454     $self->{set_nc}->($self);
4455     }
4456    
4457 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4458     redo A;
4459 wakaba 1.1 } elsif ($self->{nc} == -1) {
4460    
4461     $self->{state} = DATA_STATE;
4462 wakaba 1.5 $self->{s_kwd} = '';
4463 wakaba 1.1 ## reconsume
4464    
4465     return ($self->{ct}); # DOCTYPE
4466    
4467     redo A;
4468     } else {
4469    
4470     my $s = '';
4471 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4472 wakaba 1.1
4473     ## Stay in the state
4474    
4475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476     $self->{line_prev} = $self->{line};
4477     $self->{column_prev} = $self->{column};
4478     $self->{column}++;
4479     $self->{nc}
4480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481     } else {
4482     $self->{set_nc}->($self);
4483     }
4484    
4485     redo A;
4486     }
4487     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488     ## NOTE: "CDATA section state" in the spec is jointly implemented
4489     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490     ## and |CDATA_SECTION_MSE2_STATE|.
4491 wakaba 1.10
4492     ## XML5: "CDATA state".
4493 wakaba 1.1
4494     if ($self->{nc} == 0x005D) { # ]
4495    
4496     $self->{state} = CDATA_SECTION_MSE1_STATE;
4497    
4498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499     $self->{line_prev} = $self->{line};
4500     $self->{column_prev} = $self->{column};
4501     $self->{column}++;
4502     $self->{nc}
4503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504     } else {
4505     $self->{set_nc}->($self);
4506     }
4507    
4508     redo A;
4509     } elsif ($self->{nc} == -1) {
4510 wakaba 1.6 if ($self->{is_xml}) {
4511 wakaba 1.8
4512 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 wakaba 1.8 } else {
4514    
4515 wakaba 1.6 }
4516    
4517 wakaba 1.1 $self->{state} = DATA_STATE;
4518 wakaba 1.5 $self->{s_kwd} = '';
4519 wakaba 1.10 ## Reconsume.
4520 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4521    
4522     return ($self->{ct}); # character
4523     } else {
4524    
4525     ## No token to emit. $self->{ct} is discarded.
4526     }
4527     redo A;
4528     } else {
4529    
4530     $self->{ct}->{data} .= chr $self->{nc};
4531     $self->{read_until}->($self->{ct}->{data},
4532     q<]>,
4533     length $self->{ct}->{data});
4534    
4535     ## Stay in the state.
4536    
4537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538     $self->{line_prev} = $self->{line};
4539     $self->{column_prev} = $self->{column};
4540     $self->{column}++;
4541     $self->{nc}
4542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543     } else {
4544     $self->{set_nc}->($self);
4545     }
4546    
4547     redo A;
4548     }
4549    
4550     ## ISSUE: "text tokens" in spec.
4551     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 wakaba 1.10 ## XML5: "CDATA bracket state".
4553    
4554 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4555    
4556     $self->{state} = CDATA_SECTION_MSE2_STATE;
4557    
4558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559     $self->{line_prev} = $self->{line};
4560     $self->{column_prev} = $self->{column};
4561     $self->{column}++;
4562     $self->{nc}
4563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564     } else {
4565     $self->{set_nc}->($self);
4566     }
4567    
4568     redo A;
4569     } else {
4570    
4571 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4572 wakaba 1.1 $self->{ct}->{data} .= ']';
4573 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 wakaba 1.1 ## Reconsume.
4575     redo A;
4576     }
4577     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 wakaba 1.10 ## XML5: "CDATA end state".
4579    
4580 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4581     $self->{state} = DATA_STATE;
4582 wakaba 1.5 $self->{s_kwd} = '';
4583 wakaba 1.1
4584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585     $self->{line_prev} = $self->{line};
4586     $self->{column_prev} = $self->{column};
4587     $self->{column}++;
4588     $self->{nc}
4589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590     } else {
4591     $self->{set_nc}->($self);
4592     }
4593    
4594     if (length $self->{ct}->{data}) { # character
4595    
4596     return ($self->{ct}); # character
4597     } else {
4598    
4599     ## No token to emit. $self->{ct} is discarded.
4600     }
4601     redo A;
4602     } elsif ($self->{nc} == 0x005D) { # ]
4603     # character
4604     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605     ## Stay in the state.
4606    
4607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608     $self->{line_prev} = $self->{line};
4609     $self->{column_prev} = $self->{column};
4610     $self->{column}++;
4611     $self->{nc}
4612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613     } else {
4614     $self->{set_nc}->($self);
4615     }
4616    
4617     redo A;
4618     } else {
4619    
4620     $self->{ct}->{data} .= ']]'; # character
4621     $self->{state} = CDATA_SECTION_STATE;
4622 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4623 wakaba 1.1 redo A;
4624     }
4625     } elsif ($self->{state} == ENTITY_STATE) {
4626     if ($is_space->{$self->{nc}} or
4627     {
4628     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4629     $self->{entity_add} => 1,
4630     }->{$self->{nc}}) {
4631    
4632     ## Don't consume
4633     ## No error
4634     ## Return nothing.
4635     #
4636     } elsif ($self->{nc} == 0x0023) { # #
4637    
4638     $self->{state} = ENTITY_HASH_STATE;
4639 wakaba 1.12 $self->{kwd} = '#';
4640 wakaba 1.1
4641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642     $self->{line_prev} = $self->{line};
4643     $self->{column_prev} = $self->{column};
4644     $self->{column}++;
4645     $self->{nc}
4646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647     } else {
4648     $self->{set_nc}->($self);
4649     }
4650    
4651     redo A;
4652     } elsif ((0x0041 <= $self->{nc} and
4653     $self->{nc} <= 0x005A) or # A..Z
4654     (0x0061 <= $self->{nc} and
4655     $self->{nc} <= 0x007A)) { # a..z
4656    
4657     require Whatpm::_NamedEntityList;
4658     $self->{state} = ENTITY_NAME_STATE;
4659 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4660     $self->{entity__value} = $self->{kwd};
4661 wakaba 1.1 $self->{entity__match} = 0;
4662    
4663     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4664     $self->{line_prev} = $self->{line};
4665     $self->{column_prev} = $self->{column};
4666     $self->{column}++;
4667     $self->{nc}
4668     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4669     } else {
4670     $self->{set_nc}->($self);
4671     }
4672    
4673     redo A;
4674     } else {
4675    
4676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4677     ## Return nothing.
4678     #
4679     }
4680    
4681     ## NOTE: No character is consumed by the "consume a character
4682     ## reference" algorithm. In other words, there is an "&" character
4683     ## that does not introduce a character reference, which would be
4684     ## appended to the parent element or the attribute value in later
4685     ## process of the tokenizer.
4686    
4687     if ($self->{prev_state} == DATA_STATE) {
4688    
4689     $self->{state} = $self->{prev_state};
4690 wakaba 1.5 $self->{s_kwd} = '';
4691 wakaba 1.1 ## Reconsume.
4692     return ({type => CHARACTER_TOKEN, data => '&',
4693     line => $self->{line_prev},
4694     column => $self->{column_prev},
4695     });
4696     redo A;
4697     } else {
4698    
4699     $self->{ca}->{value} .= '&';
4700     $self->{state} = $self->{prev_state};
4701 wakaba 1.5 $self->{s_kwd} = '';
4702 wakaba 1.1 ## Reconsume.
4703     redo A;
4704     }
4705     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4706     if ($self->{nc} == 0x0078 or # x
4707     $self->{nc} == 0x0058) { # X
4708    
4709     $self->{state} = HEXREF_X_STATE;
4710 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4711 wakaba 1.1
4712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4713     $self->{line_prev} = $self->{line};
4714     $self->{column_prev} = $self->{column};
4715     $self->{column}++;
4716     $self->{nc}
4717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4718     } else {
4719     $self->{set_nc}->($self);
4720     }
4721    
4722     redo A;
4723     } elsif (0x0030 <= $self->{nc} and
4724     $self->{nc} <= 0x0039) { # 0..9
4725    
4726     $self->{state} = NCR_NUM_STATE;
4727 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4728 wakaba 1.1
4729     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4730     $self->{line_prev} = $self->{line};
4731     $self->{column_prev} = $self->{column};
4732     $self->{column}++;
4733     $self->{nc}
4734     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4735     } else {
4736     $self->{set_nc}->($self);
4737     }
4738    
4739     redo A;
4740     } else {
4741     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4742     line => $self->{line_prev},
4743     column => $self->{column_prev} - 1);
4744    
4745     ## NOTE: According to the spec algorithm, nothing is returned,
4746     ## and then "&#" is appended to the parent element or the attribute
4747     ## value in the later processing.
4748    
4749     if ($self->{prev_state} == DATA_STATE) {
4750    
4751     $self->{state} = $self->{prev_state};
4752 wakaba 1.5 $self->{s_kwd} = '';
4753 wakaba 1.1 ## Reconsume.
4754     return ({type => CHARACTER_TOKEN,
4755     data => '&#',
4756     line => $self->{line_prev},
4757     column => $self->{column_prev} - 1,
4758     });
4759     redo A;
4760     } else {
4761    
4762     $self->{ca}->{value} .= '&#';
4763     $self->{state} = $self->{prev_state};
4764 wakaba 1.5 $self->{s_kwd} = '';
4765 wakaba 1.1 ## Reconsume.
4766     redo A;
4767     }
4768     }
4769     } elsif ($self->{state} == NCR_NUM_STATE) {
4770     if (0x0030 <= $self->{nc} and
4771     $self->{nc} <= 0x0039) { # 0..9
4772    
4773 wakaba 1.12 $self->{kwd} *= 10;
4774     $self->{kwd} += $self->{nc} - 0x0030;
4775 wakaba 1.1
4776     ## Stay in the state.
4777    
4778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4779     $self->{line_prev} = $self->{line};
4780     $self->{column_prev} = $self->{column};
4781     $self->{column}++;
4782     $self->{nc}
4783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4784     } else {
4785     $self->{set_nc}->($self);
4786     }
4787    
4788     redo A;
4789     } elsif ($self->{nc} == 0x003B) { # ;
4790    
4791    
4792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4793     $self->{line_prev} = $self->{line};
4794     $self->{column_prev} = $self->{column};
4795     $self->{column}++;
4796     $self->{nc}
4797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4798     } else {
4799     $self->{set_nc}->($self);
4800     }
4801    
4802     #
4803     } else {
4804    
4805     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4806     ## Reconsume.
4807     #
4808     }
4809    
4810 wakaba 1.12 my $code = $self->{kwd};
4811 wakaba 1.1 my $l = $self->{line_prev};
4812     my $c = $self->{column_prev};
4813     if ($charref_map->{$code}) {
4814    
4815     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4816     text => (sprintf 'U+%04X', $code),
4817     line => $l, column => $c);
4818     $code = $charref_map->{$code};
4819     } elsif ($code > 0x10FFFF) {
4820    
4821     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4822     text => (sprintf 'U-%08X', $code),
4823     line => $l, column => $c);
4824     $code = 0xFFFD;
4825     }
4826    
4827     if ($self->{prev_state} == DATA_STATE) {
4828    
4829     $self->{state} = $self->{prev_state};
4830 wakaba 1.5 $self->{s_kwd} = '';
4831 wakaba 1.1 ## Reconsume.
4832     return ({type => CHARACTER_TOKEN, data => chr $code,
4833 wakaba 1.7 has_reference => 1,
4834 wakaba 1.1 line => $l, column => $c,
4835     });
4836     redo A;
4837     } else {
4838    
4839     $self->{ca}->{value} .= chr $code;
4840     $self->{ca}->{has_reference} = 1;
4841     $self->{state} = $self->{prev_state};
4842 wakaba 1.5 $self->{s_kwd} = '';
4843 wakaba 1.1 ## Reconsume.
4844     redo A;
4845     }
4846     } elsif ($self->{state} == HEXREF_X_STATE) {
4847     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4848     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4849     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4850     # 0..9, A..F, a..f
4851    
4852     $self->{state} = HEXREF_HEX_STATE;
4853 wakaba 1.12 $self->{kwd} = 0;
4854 wakaba 1.1 ## Reconsume.
4855     redo A;
4856     } else {
4857     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4858     line => $self->{line_prev},
4859     column => $self->{column_prev} - 2);
4860    
4861     ## NOTE: According to the spec algorithm, nothing is returned,
4862     ## and then "&#" followed by "X" or "x" is appended to the parent
4863     ## element or the attribute value in the later processing.
4864    
4865     if ($self->{prev_state} == DATA_STATE) {
4866    
4867     $self->{state} = $self->{prev_state};
4868 wakaba 1.5 $self->{s_kwd} = '';
4869 wakaba 1.1 ## Reconsume.
4870     return ({type => CHARACTER_TOKEN,
4871 wakaba 1.12 data => '&' . $self->{kwd},
4872 wakaba 1.1 line => $self->{line_prev},
4873 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4874 wakaba 1.1 });
4875     redo A;
4876     } else {
4877    
4878 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4879 wakaba 1.1 $self->{state} = $self->{prev_state};
4880 wakaba 1.5 $self->{s_kwd} = '';
4881 wakaba 1.1 ## Reconsume.
4882     redo A;
4883     }
4884     }
4885     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4886     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4887     # 0..9
4888    
4889 wakaba 1.12 $self->{kwd} *= 0x10;
4890     $self->{kwd} += $self->{nc} - 0x0030;
4891 wakaba 1.1 ## Stay in the state.
4892    
4893     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4894     $self->{line_prev} = $self->{line};
4895     $self->{column_prev} = $self->{column};
4896     $self->{column}++;
4897     $self->{nc}
4898     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4899     } else {
4900     $self->{set_nc}->($self);
4901     }
4902    
4903     redo A;
4904     } elsif (0x0061 <= $self->{nc} and
4905     $self->{nc} <= 0x0066) { # a..f
4906    
4907 wakaba 1.12 $self->{kwd} *= 0x10;
4908     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4909 wakaba 1.1 ## Stay in the state.
4910    
4911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4912     $self->{line_prev} = $self->{line};
4913     $self->{column_prev} = $self->{column};
4914     $self->{column}++;
4915     $self->{nc}
4916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4917     } else {
4918     $self->{set_nc}->($self);
4919     }
4920    
4921     redo A;
4922     } elsif (0x0041 <= $self->{nc} and
4923     $self->{nc} <= 0x0046) { # A..F
4924    
4925 wakaba 1.12 $self->{kwd} *= 0x10;
4926     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4927 wakaba 1.1 ## Stay in the state.
4928    
4929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4930     $self->{line_prev} = $self->{line};
4931     $self->{column_prev} = $self->{column};
4932     $self->{column}++;
4933     $self->{nc}
4934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4935     } else {
4936     $self->{set_nc}->($self);
4937     }
4938    
4939     redo A;
4940     } elsif ($self->{nc} == 0x003B) { # ;
4941    
4942    
4943     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4944     $self->{line_prev} = $self->{line};
4945     $self->{column_prev} = $self->{column};
4946     $self->{column}++;
4947     $self->{nc}
4948     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4949     } else {
4950     $self->{set_nc}->($self);
4951     }
4952    
4953     #
4954     } else {
4955    
4956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4957     line => $self->{line},
4958     column => $self->{column});
4959     ## Reconsume.
4960     #
4961     }
4962    
4963 wakaba 1.12 my $code = $self->{kwd};
4964 wakaba 1.1 my $l = $self->{line_prev};
4965     my $c = $self->{column_prev};
4966     if ($charref_map->{$code}) {
4967    
4968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4969     text => (sprintf 'U+%04X', $code),
4970     line => $l, column => $c);
4971     $code = $charref_map->{$code};
4972     } elsif ($code > 0x10FFFF) {
4973    
4974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4975     text => (sprintf 'U-%08X', $code),
4976     line => $l, column => $c);
4977     $code = 0xFFFD;
4978     }
4979    
4980     if ($self->{prev_state} == DATA_STATE) {
4981    
4982     $self->{state} = $self->{prev_state};
4983 wakaba 1.5 $self->{s_kwd} = '';
4984 wakaba 1.1 ## Reconsume.
4985     return ({type => CHARACTER_TOKEN, data => chr $code,
4986 wakaba 1.7 has_reference => 1,
4987 wakaba 1.1 line => $l, column => $c,
4988     });
4989     redo A;
4990     } else {
4991    
4992     $self->{ca}->{value} .= chr $code;
4993     $self->{ca}->{has_reference} = 1;
4994     $self->{state} = $self->{prev_state};
4995 wakaba 1.5 $self->{s_kwd} = '';
4996 wakaba 1.1 ## Reconsume.
4997     redo A;
4998     }
4999     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5000 wakaba 1.12 if (length $self->{kwd} < 30 and
5001 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
5002     ((0x0041 <= $self->{nc} and # A
5003     $self->{nc} <= 0x005A) or # Z
5004     (0x0061 <= $self->{nc} and # a
5005     $self->{nc} <= 0x007A) or # z
5006     (0x0030 <= $self->{nc} and # 0
5007     $self->{nc} <= 0x0039) or # 9
5008     $self->{nc} == 0x003B)) { # ;
5009     our $EntityChar;
5010 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5011     if (defined $EntityChar->{$self->{kwd}}) {
5012 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5013    
5014 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5015 wakaba 1.1 $self->{entity__match} = 1;
5016    
5017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5018     $self->{line_prev} = $self->{line};
5019     $self->{column_prev} = $self->{column};
5020     $self->{column}++;
5021     $self->{nc}
5022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5023     } else {
5024     $self->{set_nc}->($self);
5025     }
5026    
5027     #
5028     } else {
5029    
5030 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5031 wakaba 1.1 $self->{entity__match} = -1;
5032     ## Stay in the state.
5033    
5034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5035     $self->{line_prev} = $self->{line};
5036     $self->{column_prev} = $self->{column};
5037     $self->{column}++;
5038     $self->{nc}
5039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5040     } else {
5041     $self->{set_nc}->($self);
5042     }
5043    
5044     redo A;
5045     }
5046     } else {
5047    
5048     $self->{entity__value} .= chr $self->{nc};
5049     $self->{entity__match} *= 2;
5050     ## Stay in the state.
5051    
5052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5053     $self->{line_prev} = $self->{line};
5054     $self->{column_prev} = $self->{column};
5055     $self->{column}++;
5056     $self->{nc}
5057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5058     } else {
5059     $self->{set_nc}->($self);
5060     }
5061    
5062     redo A;
5063     }
5064     }
5065    
5066     my $data;
5067     my $has_ref;
5068     if ($self->{entity__match} > 0) {
5069    
5070     $data = $self->{entity__value};
5071     $has_ref = 1;
5072     #
5073     } elsif ($self->{entity__match} < 0) {
5074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5075     if ($self->{prev_state} != DATA_STATE and # in attribute
5076     $self->{entity__match} < -1) {
5077    
5078 wakaba 1.12 $data = '&' . $self->{kwd};
5079 wakaba 1.1 #
5080     } else {
5081    
5082     $data = $self->{entity__value};
5083     $has_ref = 1;
5084     #
5085     }
5086     } else {
5087    
5088     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5089     line => $self->{line_prev},
5090 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5091     $data = '&' . $self->{kwd};
5092 wakaba 1.1 #
5093     }
5094    
5095     ## NOTE: In these cases, when a character reference is found,
5096     ## it is consumed and a character token is returned, or, otherwise,
5097     ## nothing is consumed and returned, according to the spec algorithm.
5098     ## In this implementation, anything that has been examined by the
5099     ## tokenizer is appended to the parent element or the attribute value
5100     ## as string, either literal string when no character reference or
5101     ## entity-replaced string otherwise, in this stage, since any characters
5102     ## that would not be consumed are appended in the data state or in an
5103     ## appropriate attribute value state anyway.
5104    
5105     if ($self->{prev_state} == DATA_STATE) {
5106    
5107     $self->{state} = $self->{prev_state};
5108 wakaba 1.5 $self->{s_kwd} = '';
5109 wakaba 1.1 ## Reconsume.
5110     return ({type => CHARACTER_TOKEN,
5111     data => $data,
5112 wakaba 1.7 has_reference => $has_ref,
5113 wakaba 1.1 line => $self->{line_prev},
5114 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5115 wakaba 1.1 });
5116     redo A;
5117     } else {
5118    
5119     $self->{ca}->{value} .= $data;
5120     $self->{ca}->{has_reference} = 1 if $has_ref;
5121     $self->{state} = $self->{prev_state};
5122 wakaba 1.5 $self->{s_kwd} = '';
5123 wakaba 1.1 ## Reconsume.
5124     redo A;
5125     }
5126 wakaba 1.8
5127     ## XML-only states
5128    
5129     } elsif ($self->{state} == PI_STATE) {
5130 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5131    
5132 wakaba 1.8 if ($is_space->{$self->{nc}} or
5133 wakaba 1.14 $self->{nc} == 0x003F or # ?
5134 wakaba 1.8 $self->{nc} == -1) {
5135 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5136     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5137     ## "DOCTYPE pi state": Parse error, switch to the "data
5138     ## state".
5139 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5140     line => $self->{line_prev},
5141     column => $self->{column_prev}
5142     - 1 * ($self->{nc} != -1));
5143     $self->{state} = BOGUS_COMMENT_STATE;
5144     ## Reconsume.
5145     $self->{ct} = {type => COMMENT_TOKEN,
5146     data => '?',
5147     line => $self->{line_prev},
5148     column => $self->{column_prev}
5149     - 1 * ($self->{nc} != -1),
5150     };
5151     redo A;
5152     } else {
5153 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5154 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5155     target => chr $self->{nc},
5156     data => '',
5157     line => $self->{line_prev},
5158     column => $self->{column_prev} - 1,
5159     };
5160     $self->{state} = PI_TARGET_STATE;
5161    
5162     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5163     $self->{line_prev} = $self->{line};
5164     $self->{column_prev} = $self->{column};
5165     $self->{column}++;
5166     $self->{nc}
5167     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5168     } else {
5169     $self->{set_nc}->($self);
5170     }
5171    
5172     redo A;
5173     }
5174     } elsif ($self->{state} == PI_TARGET_STATE) {
5175     if ($is_space->{$self->{nc}}) {
5176     $self->{state} = PI_TARGET_AFTER_STATE;
5177    
5178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5179     $self->{line_prev} = $self->{line};
5180     $self->{column_prev} = $self->{column};
5181     $self->{column}++;
5182     $self->{nc}
5183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5184     } else {
5185     $self->{set_nc}->($self);
5186     }
5187    
5188     redo A;
5189     } elsif ($self->{nc} == -1) {
5190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5191 wakaba 1.13 if ($self->{in_subset}) {
5192     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5193     } else {
5194     $self->{state} = DATA_STATE;
5195     $self->{s_kwd} = '';
5196     }
5197 wakaba 1.8 ## Reconsume.
5198     return ($self->{ct}); # pi
5199     redo A;
5200     } elsif ($self->{nc} == 0x003F) { # ?
5201     $self->{state} = PI_AFTER_STATE;
5202    
5203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5204     $self->{line_prev} = $self->{line};
5205     $self->{column_prev} = $self->{column};
5206     $self->{column}++;
5207     $self->{nc}
5208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5209     } else {
5210     $self->{set_nc}->($self);
5211     }
5212    
5213     redo A;
5214     } else {
5215     ## XML5: typo ("tag name" -> "target")
5216     $self->{ct}->{target} .= chr $self->{nc}; # pi
5217    
5218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5219     $self->{line_prev} = $self->{line};
5220     $self->{column_prev} = $self->{column};
5221     $self->{column}++;
5222     $self->{nc}
5223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5224     } else {
5225     $self->{set_nc}->($self);
5226     }
5227    
5228     redo A;
5229     }
5230     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5231     if ($is_space->{$self->{nc}}) {
5232     ## Stay in the state.
5233    
5234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5235     $self->{line_prev} = $self->{line};
5236     $self->{column_prev} = $self->{column};
5237     $self->{column}++;
5238     $self->{nc}
5239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5240     } else {
5241     $self->{set_nc}->($self);
5242     }
5243    
5244     redo A;
5245     } else {
5246     $self->{state} = PI_DATA_STATE;
5247     ## Reprocess.
5248     redo A;
5249     }
5250     } elsif ($self->{state} == PI_DATA_STATE) {
5251     if ($self->{nc} == 0x003F) { # ?
5252     $self->{state} = PI_DATA_AFTER_STATE;
5253    
5254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5255     $self->{line_prev} = $self->{line};
5256     $self->{column_prev} = $self->{column};
5257     $self->{column}++;
5258     $self->{nc}
5259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5260     } else {
5261     $self->{set_nc}->($self);
5262     }
5263    
5264     redo A;
5265     } elsif ($self->{nc} == -1) {
5266     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5267 wakaba 1.13 if ($self->{in_subset}) {
5268 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5269 wakaba 1.13 } else {
5270     $self->{state} = DATA_STATE;
5271     $self->{s_kwd} = '';
5272     }
5273 wakaba 1.8 ## Reprocess.
5274     return ($self->{ct}); # pi
5275     redo A;
5276     } else {
5277     $self->{ct}->{data} .= chr $self->{nc}; # pi
5278     $self->{read_until}->($self->{ct}->{data}, q[?],
5279     length $self->{ct}->{data});
5280     ## Stay in the state.
5281    
5282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5283     $self->{line_prev} = $self->{line};
5284     $self->{column_prev} = $self->{column};
5285     $self->{column}++;
5286     $self->{nc}
5287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5288     } else {
5289     $self->{set_nc}->($self);
5290     }
5291    
5292     ## Reprocess.
5293     redo A;
5294     }
5295     } elsif ($self->{state} == PI_AFTER_STATE) {
5296 wakaba 1.14 ## XML5: Part of "Pi after state".
5297    
5298 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5299 wakaba 1.13 if ($self->{in_subset}) {
5300     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5301     } else {
5302     $self->{state} = DATA_STATE;
5303     $self->{s_kwd} = '';
5304     }
5305 wakaba 1.8
5306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5307     $self->{line_prev} = $self->{line};
5308     $self->{column_prev} = $self->{column};
5309     $self->{column}++;
5310     $self->{nc}
5311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5312     } else {
5313     $self->{set_nc}->($self);
5314     }
5315    
5316     return ($self->{ct}); # pi
5317     redo A;
5318     } elsif ($self->{nc} == 0x003F) { # ?
5319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5320     line => $self->{line_prev},
5321     column => $self->{column_prev}); ## XML5: no error
5322     $self->{ct}->{data} .= '?';
5323     $self->{state} = PI_DATA_AFTER_STATE;
5324    
5325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326     $self->{line_prev} = $self->{line};
5327     $self->{column_prev} = $self->{column};
5328     $self->{column}++;
5329     $self->{nc}
5330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331     } else {
5332     $self->{set_nc}->($self);
5333     }
5334    
5335     redo A;
5336     } else {
5337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5338     line => $self->{line_prev},
5339     column => $self->{column_prev}
5340     + 1 * ($self->{nc} == -1)); ## XML5: no error
5341     $self->{ct}->{data} .= '?'; ## XML5: not appended
5342     $self->{state} = PI_DATA_STATE;
5343     ## Reprocess.
5344     redo A;
5345     }
5346     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5347 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5348    
5349 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5350 wakaba 1.13 if ($self->{in_subset}) {
5351     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5352     } else {
5353     $self->{state} = DATA_STATE;
5354     $self->{s_kwd} = '';
5355     }
5356 wakaba 1.8
5357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5358     $self->{line_prev} = $self->{line};
5359     $self->{column_prev} = $self->{column};
5360     $self->{column}++;
5361     $self->{nc}
5362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5363     } else {
5364     $self->{set_nc}->($self);
5365     }
5366    
5367     return ($self->{ct}); # pi
5368     redo A;
5369     } elsif ($self->{nc} == 0x003F) { # ?
5370     $self->{ct}->{data} .= '?';
5371     ## Stay in the state.
5372    
5373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5374     $self->{line_prev} = $self->{line};
5375     $self->{column_prev} = $self->{column};
5376     $self->{column}++;
5377     $self->{nc}
5378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5379     } else {
5380     $self->{set_nc}->($self);
5381     }
5382    
5383     redo A;
5384     } else {
5385     $self->{ct}->{data} .= '?'; ## XML5: not appended
5386     $self->{state} = PI_DATA_STATE;
5387     ## Reprocess.
5388     redo A;
5389     }
5390 wakaba 1.12
5391     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5392     if ($self->{nc} == 0x003C) { # <
5393 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5394 wakaba 1.12
5395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5396     $self->{line_prev} = $self->{line};
5397     $self->{column_prev} = $self->{column};
5398     $self->{column}++;
5399     $self->{nc}
5400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5401     } else {
5402     $self->{set_nc}->($self);
5403     }
5404    
5405     redo A;
5406     } elsif ($self->{nc} == 0x0025) { # %
5407     ## XML5: Not defined yet.
5408    
5409     ## TODO:
5410    
5411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5412     $self->{line_prev} = $self->{line};
5413     $self->{column_prev} = $self->{column};
5414     $self->{column}++;
5415     $self->{nc}
5416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5417     } else {
5418     $self->{set_nc}->($self);
5419     }
5420    
5421     redo A;
5422     } elsif ($self->{nc} == 0x005D) { # ]
5423 wakaba 1.13 delete $self->{in_subset};
5424 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5425    
5426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5427     $self->{line_prev} = $self->{line};
5428     $self->{column_prev} = $self->{column};
5429     $self->{column}++;
5430     $self->{nc}
5431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5432     } else {
5433     $self->{set_nc}->($self);
5434     }
5435    
5436     redo A;
5437     } elsif ($is_space->{$self->{nc}}) {
5438     ## Stay in the state.
5439    
5440     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5441     $self->{line_prev} = $self->{line};
5442     $self->{column_prev} = $self->{column};
5443     $self->{column}++;
5444     $self->{nc}
5445     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5446     } else {
5447     $self->{set_nc}->($self);
5448     }
5449    
5450     redo A;
5451     } elsif ($self->{nc} == -1) {
5452     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5453 wakaba 1.13 delete $self->{in_subset};
5454 wakaba 1.12 $self->{state} = DATA_STATE;
5455     $self->{s_kwd} = '';
5456     ## Reconsume.
5457 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5458 wakaba 1.12 redo A;
5459     } else {
5460     unless ($self->{internal_subset_tainted}) {
5461     ## XML5: No parse error.
5462     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5463     $self->{internal_subset_tainted} = 1;
5464     }
5465     ## Stay in the state.
5466    
5467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5468     $self->{line_prev} = $self->{line};
5469     $self->{column_prev} = $self->{column};
5470     $self->{column}++;
5471     $self->{nc}
5472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5473     } else {
5474     $self->{set_nc}->($self);
5475     }
5476    
5477     redo A;
5478     }
5479     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { ## State entered once "]" has ended the internal subset; only ">" is expected here.
5480     if ($self->{nc} == 0x003E) { # >
5481     $self->{state} = DATA_STATE;
5482     $self->{s_kwd} = '';
5483     ## Consume ">": take the next character from char_buffer, or call set_nc when the buffer is exhausted.
5484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5485     $self->{line_prev} = $self->{line};
5486     $self->{column_prev} = $self->{column};
5487     $self->{column}++;
5488     $self->{nc}
5489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5490     } else {
5491     $self->{set_nc}->($self);
5492     }
5493    
5494 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5495 wakaba 1.12 redo A; ## Not reached; the return above always exits first.
5496     } elsif ($self->{nc} == -1) { ## nc of -1: no more input (reported as an unclosed DOCTYPE).
5497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5498     $self->{state} = DATA_STATE;
5499     $self->{s_kwd} = '';
5500     ## Reconsume.
5501 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5502 wakaba 1.12 redo A; ## Not reached.
5503     } else {
5504     ## XML5: No parse error and stay in the state.
5505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5506    
5507 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; ## The error is reported once here; the bogus state then skips silently.
5508    
5509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5510     $self->{line_prev} = $self->{line};
5511     $self->{column_prev} = $self->{column};
5512     $self->{column}++;
5513     $self->{nc}
5514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5515     } else {
5516     $self->{set_nc}->($self);
5517     }
5518    
5519     redo A;
5520     }
5521     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { ## Skips unexpected characters after the internal subset until ">" or end of input; raises no further errors.
5522     if ($self->{nc} == 0x003E) { # >
5523     $self->{state} = DATA_STATE;
5524     $self->{s_kwd} = '';
5525     ## Consume ">": take the next character from char_buffer, or call set_nc when the buffer is exhausted.
5526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5527     $self->{line_prev} = $self->{line};
5528     $self->{column_prev} = $self->{column};
5529     $self->{column}++;
5530     $self->{nc}
5531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5532     } else {
5533     $self->{set_nc}->($self);
5534     }
5535    
5536     return ({type => END_OF_DOCTYPE_TOKEN});
5537     redo A; ## Not reached; the return above always exits first.
5538     } elsif ($self->{nc} == -1) { ## nc of -1: no more input; no parse error is raised in this branch.
5539     $self->{state} = DATA_STATE;
5540     $self->{s_kwd} = '';
5541     ## Reconsume.
5542     return ({type => END_OF_DOCTYPE_TOKEN});
5543     redo A; ## Not reached.
5544     } else {
5545     ## Stay in the state.
5546     ## The character is discarded.
5547     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5548     $self->{line_prev} = $self->{line};
5549     $self->{column_prev} = $self->{column};
5550     $self->{column}++;
5551     $self->{nc}
5552     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5553     } else {
5554     $self->{set_nc}->($self);
5555     }
5556    
5557     redo A;
5558     }
5559     } elsif ($self->{state} == DOCTYPE_TAG_STATE) { ## Dispatches on the character after a tag opener: "!" starts a markup declaration, "?" a processing instruction. Presumably entered on "<" inside the internal subset -- the entry point is outside this excerpt.
5560     if ($self->{nc} == 0x0021) { # !
5561 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5562 wakaba 1.13
5563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5564     $self->{line_prev} = $self->{line};
5565     $self->{column_prev} = $self->{column};
5566     $self->{column}++;
5567     $self->{nc}
5568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5569     } else {
5570     $self->{set_nc}->($self);
5571     }
5572    
5573     redo A;
5574     } elsif ($self->{nc} == 0x003F) { # ?
5575     $self->{state} = PI_STATE;
5576    
5577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5578     $self->{line_prev} = $self->{line};
5579     $self->{column_prev} = $self->{column};
5580     $self->{column}++;
5581     $self->{nc}
5582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5583     } else {
5584     $self->{set_nc}->($self);
5585     }
5586    
5587     redo A;
5588     } elsif ($self->{nc} == -1) { ## nc of -1: no more input.
5589     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5590     $self->{state} = DATA_STATE; ## NOTE(review): unlike the end-of-input branch of the internal subset state, this neither deletes in_subset nor returns END_OF_DOCTYPE_TOKEN -- confirm this is intended.
5591     $self->{s_kwd} = '';
5592     ## Reconsume.
5593     redo A;
5594     } else {
5595     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5596     line => $self->{line_prev}, ## Reported at the previous character's position.
5597     column => $self->{column_prev});
5598     $self->{state} = BOGUS_COMMENT_STATE;
5599     $self->{ct} = {type => COMMENT_TOKEN,
5600     data => '',
5601     }; ## NOTE: Will be discarded.
5602 wakaba 1.12
5603     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5604     $self->{line_prev} = $self->{line};
5605     $self->{column_prev} = $self->{column};
5606     $self->{column}++;
5607     $self->{nc}
5608     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5609     } else {
5610     $self->{set_nc}->($self);
5611     }
5612    
5613     redo A;
5614     }
5615 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) { ## Picks the keyword state from the first character after "!"; anything unrecognised becomes a bogus comment.
5616     ## XML5: "DOCTYPE markup declaration state".
5617    
5618     if ($self->{nc} == 0x002D) { # -
5619     $self->{state} = MD_HYPHEN_STATE;
5620    
5621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5622     $self->{line_prev} = $self->{line};
5623     $self->{column_prev} = $self->{column};
5624     $self->{column}++;
5625     $self->{nc}
5626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5627     } else {
5628     $self->{set_nc}->($self);
5629     }
5630    
5631     redo A;
5632 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5633     $self->{nc} == 0x0065) { # e
5634 wakaba 1.14 $self->{state} = MD_E_STATE; ## "E" may begin either ENTITY or ELEMENT; MD_E_STATE decides on the next letter.
5635     $self->{kwd} = chr $self->{nc}; ## kwd accumulates the keyword as typed, preserving case.
5636    
5637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5638     $self->{line_prev} = $self->{line};
5639     $self->{column_prev} = $self->{column};
5640     $self->{column}++;
5641     $self->{nc}
5642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5643     } else {
5644     $self->{set_nc}->($self);
5645     }
5646    
5647     redo A;
5648 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5649     $self->{nc} == 0x0061) { # a
5650 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5651     $self->{kwd} = chr $self->{nc};
5652    
5653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654     $self->{line_prev} = $self->{line};
5655     $self->{column_prev} = $self->{column};
5656     $self->{column}++;
5657     $self->{nc}
5658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659     } else {
5660     $self->{set_nc}->($self);
5661     }
5662    
5663     redo A;
5664 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5665     $self->{nc} == 0x006E) { # n
5666 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5667     $self->{kwd} = chr $self->{nc};
5668    
5669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5670     $self->{line_prev} = $self->{line};
5671     $self->{column_prev} = $self->{column};
5672     $self->{column}++;
5673     $self->{nc}
5674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5675     } else {
5676     $self->{set_nc}->($self);
5677     }
5678    
5679     redo A;
5680     } else {
5681     # Fall through to the bogus comment handling below.
5682     }
5683    
5684     ## XML5: No parse error.
5685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5686     line => $self->{line_prev},
5687     column => $self->{column_prev} - 1); ## One character before the previous one, i.e. two before the current character.
5688     ## Reconsume.
5689     $self->{state} = BOGUS_COMMENT_STATE;
5690     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5691     redo A;
5692     } elsif ($self->{state} == MD_E_STATE) { ## Second keyword letter after "E": "N" leads to ENTITY matching, "L" to ELEMENT matching (either case).
5693 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5694     $self->{nc} == 0x006E) { # n
5695 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5696     $self->{kwd} .= chr $self->{nc};
5697    
5698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5699     $self->{line_prev} = $self->{line};
5700     $self->{column_prev} = $self->{column};
5701     $self->{column}++;
5702     $self->{nc}
5703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5704     } else {
5705     $self->{set_nc}->($self);
5706     }
5707    
5708     redo A;
5709 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5710     $self->{nc} == 0x006C) { # l
5711 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5712     $self->{state} = MD_ELEMENT_STATE;
5713     $self->{kwd} .= chr $self->{nc};
5714    
5715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5716     $self->{line_prev} = $self->{line};
5717     $self->{column_prev} = $self->{column};
5718     $self->{column}++;
5719     $self->{nc}
5720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5721     } else {
5722     $self->{set_nc}->($self);
5723     }
5724    
5725     redo A;
5726     } else {
5727     ## XML5: No parse error.
5728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5729     line => $self->{line_prev},
5730     column => $self->{column_prev} - 2
5731     + 1 * ($self->{nc} == -1)); ## The offset is one smaller when nc is -1 (no more input).
5732     ## Reconsume.
5733     $self->{state} = BOGUS_COMMENT_STATE;
5734     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5735     redo A;
5736     }
5737     } elsif ($self->{state} == MD_ENTITY_STATE) { ## Matches the rest of "ENTITY" case-insensitively; kwd already holds the first two letters.
5738 wakaba 1.17 if ($self->{nc} == [ ## Expected uppercase letter, indexed by how many letters kwd holds so far.
5739     undef,
5740     undef,
5741     0x0054, # T
5742     0x0049, # I
5743     0x0054, # T
5744     ]->[length $self->{kwd}] or ## NOTE(review): once kwd has 5 letters the lookup yields undef, which == treats as 0; this only matters if nc can be 0 -- confirm.
5745     $self->{nc} == [ ## Same table for lowercase letters.
5746     undef,
5747     undef,
5748     0x0074, # t
5749     0x0069, # i
5750     0x0074, # t
5751     ]->[length $self->{kwd}]) {
5752 wakaba 1.14 ## Stay in the state.
5753     $self->{kwd} .= chr $self->{nc};
5754    
5755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5756     $self->{line_prev} = $self->{line};
5757     $self->{column_prev} = $self->{column};
5758     $self->{column}++;
5759     $self->{nc}
5760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5761     } else {
5762     $self->{set_nc}->($self);
5763     }
5764    
5765     redo A;
5766 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5767     ($self->{nc} == 0x0059 or # Y
5768     $self->{nc} == 0x0079)) { # y
5769     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) { ## Any lowercase letter in the keyword is an error, but the declaration is still accepted.
5770     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5771     text => 'ENTITY',
5772     line => $self->{line_prev},
5773     column => $self->{column_prev} - 4); ## column_prev is the 5th keyword letter, so this is the keyword's first letter.
5774     }
5775     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', ## May later be retyped to PARAMETER_ENTITY_TOKEN if "%" follows.
5776 wakaba 1.14 line => $self->{line_prev},
5777     column => $self->{column_prev} - 6}; ## Two columns before the keyword's first letter.
5778     $self->{state} = DOCTYPE_MD_STATE;
5779    
5780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5781     $self->{line_prev} = $self->{line};
5782     $self->{column_prev} = $self->{column};
5783     $self->{column}++;
5784     $self->{nc}
5785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5786     } else {
5787     $self->{set_nc}->($self);
5788     }
5789    
5790     redo A;
5791     } else {
5792     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5793     line => $self->{line_prev},
5794     column => $self->{column_prev} - 1
5795     - (length $self->{kwd}) ## Steps back over the keyword letters collected so far.
5796     + 1 * ($self->{nc} == -1)); ## The offset is one smaller when nc is -1 (no more input).
5797     $self->{state} = BOGUS_COMMENT_STATE;
5798     ## Reconsume.
5799     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5800     redo A;
5801     }
5802     } elsif ($self->{state} == MD_ELEMENT_STATE) { ## Matches the rest of "ELEMENT" case-insensitively; kwd already holds the first two letters.
5803 wakaba 1.17 if ($self->{nc} == [ ## Expected uppercase letter, indexed by how many letters kwd holds so far.
5804     undef,
5805     undef,
5806     0x0045, # E
5807     0x004D, # M
5808     0x0045, # E
5809     0x004E, # N
5810     ]->[length $self->{kwd}] or
5811     $self->{nc} == [ ## Same table for lowercase letters.
5812     undef,
5813     undef,
5814     0x0065, # e
5815     0x006D, # m
5816     0x0065, # e
5817     0x006E, # n
5818     ]->[length $self->{kwd}]) {
5819 wakaba 1.14 ## Stay in the state.
5820     $self->{kwd} .= chr $self->{nc};
5821    
5822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5823     $self->{line_prev} = $self->{line};
5824     $self->{column_prev} = $self->{column};
5825     $self->{column}++;
5826     $self->{nc}
5827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5828     } else {
5829     $self->{set_nc}->($self);
5830     }
5831    
5832     redo A;
5833 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5834     ($self->{nc} == 0x0054 or # T
5835     $self->{nc} == 0x0074)) { # t
5836     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) { ## Any lowercase letter in the keyword is an error, but the declaration is still accepted.
5837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5838     text => 'ELEMENT',
5839     line => $self->{line_prev},
5840     column => $self->{column_prev} - 5); ## column_prev is the 6th keyword letter, so this is the keyword's first letter.
5841     }
5842 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5843     line => $self->{line_prev},
5844     column => $self->{column_prev} - 7}; ## Two columns before the keyword's first letter, as for ENTITY; equals 1 + length kwd, the offset the bogus-comment error below uses. (Was 6, which pointed one column too far right.)
5845     $self->{state} = DOCTYPE_MD_STATE;
5846    
5847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5848     $self->{line_prev} = $self->{line};
5849     $self->{column_prev} = $self->{column};
5850     $self->{column}++;
5851     $self->{nc}
5852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5853     } else {
5854     $self->{set_nc}->($self);
5855     }
5856    
5857     redo A;
5858     } else {
5859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5860     line => $self->{line_prev},
5861     column => $self->{column_prev} - 1
5862     - (length $self->{kwd}) ## Steps back over the keyword letters collected so far.
5863     + 1 * ($self->{nc} == -1)); ## The offset is one smaller when nc is -1 (no more input).
5864     $self->{state} = BOGUS_COMMENT_STATE;
5865     ## Reconsume.
5866     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5867     redo A;
5868     }
5869     } elsif ($self->{state} == MD_ATTLIST_STATE) { ## Matches the rest of "ATTLIST" case-insensitively; kwd already holds the first letter.
5870 wakaba 1.17 if ($self->{nc} == [ ## Expected uppercase letter, indexed by how many letters kwd holds so far.
5871     undef,
5872     0x0054, # T
5873     0x0054, # T
5874     0x004C, # L
5875     0x0049, # I
5876     0x0053, # S
5877     ]->[length $self->{kwd}] or
5878     $self->{nc} == [ ## Same table for lowercase letters.
5879     undef,
5880     0x0074, # t
5881     0x0074, # t
5882     0x006C, # l
5883     0x0069, # i
5884     0x0073, # s
5885     ]->[length $self->{kwd}]) {
5886 wakaba 1.14 ## Stay in the state.
5887     $self->{kwd} .= chr $self->{nc};
5888    
5889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5890     $self->{line_prev} = $self->{line};
5891     $self->{column_prev} = $self->{column};
5892     $self->{column}++;
5893     $self->{nc}
5894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5895     } else {
5896     $self->{set_nc}->($self);
5897     }
5898    
5899     redo A;
5900 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5901     ($self->{nc} == 0x0054 or # T
5902     $self->{nc} == 0x0074)) { # t
5903     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) { ## Any lowercase letter in the keyword is an error, but the declaration is still accepted.
5904     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5905     text => 'ATTLIST',
5906     line => $self->{line_prev},
5907     column => $self->{column_prev} - 5); ## column_prev is the 6th keyword letter, so this is the keyword's first letter.
5908     }
5909 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5910 wakaba 1.15 attrdefs => [],
5911 wakaba 1.14 line => $self->{line_prev},
5912     column => $self->{column_prev} - 7}; ## Two columns before the keyword's first letter, as for ENTITY; equals 1 + length kwd, the offset the bogus-comment error below uses. (Was 6, which pointed one column too far right.)
5913     $self->{state} = DOCTYPE_MD_STATE;
5914    
5915     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5916     $self->{line_prev} = $self->{line};
5917     $self->{column_prev} = $self->{column};
5918     $self->{column}++;
5919     $self->{nc}
5920     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5921     } else {
5922     $self->{set_nc}->($self);
5923     }
5924    
5925     redo A;
5926     } else {
5927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5928     line => $self->{line_prev},
5929     column => $self->{column_prev} - 1
5930     - (length $self->{kwd}) ## Steps back over the keyword letters collected so far.
5931     + 1 * ($self->{nc} == -1)); ## The offset is one smaller when nc is -1 (no more input).
5932     $self->{state} = BOGUS_COMMENT_STATE;
5933     ## Reconsume.
5934     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5935     redo A;
5936     }
5937     } elsif ($self->{state} == MD_NOTATION_STATE) { ## Matches the rest of "NOTATION" case-insensitively; kwd already holds the first letter.
5938 wakaba 1.17 if ($self->{nc} == [ ## Expected uppercase letter, indexed by how many letters kwd holds so far.
5939     undef,
5940     0x004F, # O
5941     0x0054, # T
5942     0x0041, # A
5943     0x0054, # T
5944     0x0049, # I
5945     0x004F, # O
5946     ]->[length $self->{kwd}] or
5947     $self->{nc} == [ ## Same table for lowercase letters.
5948     undef,
5949     0x006F, # o
5950     0x0074, # t
5951     0x0061, # a
5952     0x0074, # t
5953     0x0069, # i
5954     0x006F, # o
5955     ]->[length $self->{kwd}]) {
5956 wakaba 1.14 ## Stay in the state.
5957     $self->{kwd} .= chr $self->{nc};
5958    
5959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5960     $self->{line_prev} = $self->{line};
5961     $self->{column_prev} = $self->{column};
5962     $self->{column}++;
5963     $self->{nc}
5964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5965     } else {
5966     $self->{set_nc}->($self);
5967     }
5968    
5969     redo A;
5970 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
5971     ($self->{nc} == 0x004E or # N
5972     $self->{nc} == 0x006E)) { # n
5973     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) { ## Any lowercase letter in the keyword is an error, but the declaration is still accepted.
5974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5975     text => 'NOTATION',
5976     line => $self->{line_prev},
5977     column => $self->{column_prev} - 6); ## column_prev is the 7th keyword letter, so this is the keyword's first letter.
5978     }
5979 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
5980     line => $self->{line_prev},
5981     column => $self->{column_prev} - 8}; ## Two columns before the keyword's first letter, as for ENTITY; equals 1 + length kwd, the offset the bogus-comment error below uses. (Was 6, which pointed at the keyword's first letter instead.)
5982     $self->{state} = DOCTYPE_MD_STATE;
5983    
5984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5985     $self->{line_prev} = $self->{line};
5986     $self->{column_prev} = $self->{column};
5987     $self->{column}++;
5988     $self->{nc}
5989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5990     } else {
5991     $self->{set_nc}->($self);
5992     }
5993    
5994     redo A;
5995     } else {
5996     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5997     line => $self->{line_prev},
5998     column => $self->{column_prev} - 1
5999     - (length $self->{kwd}) ## Steps back over the keyword letters collected so far.
6000     + 1 * ($self->{nc} == -1)); ## The offset is one smaller when nc is -1 (no more input).
6001     $self->{state} = BOGUS_COMMENT_STATE;
6002     ## Reconsume.
6003     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6004     redo A;
6005     }
6006     } elsif ($self->{state} == DOCTYPE_MD_STATE) { ## Right after a complete ENTITY/ELEMENT/ATTLIST/NOTATION keyword: white space is expected before the name.
6007     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6008     ## "DOCTYPE NOTATION state".
6009    
6010     if ($is_space->{$self->{nc}}) {
6011     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6012     $self->{state} = BEFORE_MD_NAME_STATE;
6013    
6014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6015     $self->{line_prev} = $self->{line};
6016     $self->{column_prev} = $self->{column};
6017     $self->{column}++;
6018     $self->{nc}
6019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6020     } else {
6021     $self->{set_nc}->($self);
6022     }
6023    
6024     redo A;
6025     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6026     $self->{nc} == 0x0025) { # %
6027     ## XML5: Switch to the "DOCTYPE bogus comment state".
6028     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6029     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; ## "%" directly after the ENTITY keyword: reported, then handled like a spaced "%".
6030    
6031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032     $self->{line_prev} = $self->{line};
6033     $self->{column_prev} = $self->{column};
6034     $self->{column}++;
6035     $self->{nc}
6036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037     } else {
6038     $self->{set_nc}->($self);
6039     }
6040    
6041     redo A;
6042     } elsif ($self->{nc} == -1) { ## nc of -1: no more input.
6043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6044     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6045     ## Reconsume.
6046     redo A; ## The pending declaration token in ct is not returned here.
6047     } elsif ($self->{nc} == 0x003E) { # >
6048     ## XML5: Switch to the "DOCTYPE bogus comment state".
6049     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6050     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## The nameless declaration is dropped; no token is returned.
6051    
6052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6053     $self->{line_prev} = $self->{line};
6054     $self->{column_prev} = $self->{column};
6055     $self->{column}++;
6056     $self->{nc}
6057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6058     } else {
6059     $self->{set_nc}->($self);
6060     }
6061    
6062     redo A;
6063     } else {
6064     ## XML5: Switch to the "DOCTYPE bogus comment state".
6065     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6066     $self->{state} = BEFORE_MD_NAME_STATE; ## Reconsume: the character is not consumed, so it is processed again in the new state.
6067     redo A;
6068     }
6069     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) { ## Skips white space before the declaration name and starts the name on the first other character.
6070     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6071     ## before state", "DOCTYPE ATTLIST name before state".
6072    
6073     if ($is_space->{$self->{nc}}) {
6074     ## Stay in the state.
6075    
6076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6077     $self->{line_prev} = $self->{line};
6078     $self->{column_prev} = $self->{column};
6079     $self->{column}++;
6080     $self->{nc}
6081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6082     } else {
6083     $self->{set_nc}->($self);
6084     }
6085    
6086     redo A;
6087     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6088     $self->{nc} == 0x0025) { # %
6089     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; ## Only while the token is still a general entity, so a second "%" falls to the name branch below.
6090    
6091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6092     $self->{line_prev} = $self->{line};
6093     $self->{column_prev} = $self->{column};
6094     $self->{column}++;
6095     $self->{nc}
6096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6097     } else {
6098     $self->{set_nc}->($self);
6099     }
6100    
6101     redo A;
6102     } elsif ($self->{nc} == 0x003E) { # >
6103     ## XML5: Same as "Anything else".
6104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6105     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## The nameless declaration is dropped; no token is returned.
6106    
6107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6108     $self->{line_prev} = $self->{line};
6109     $self->{column_prev} = $self->{column};
6110     $self->{column}++;
6111     $self->{nc}
6112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6113     } else {
6114     $self->{set_nc}->($self);
6115     }
6116    
6117     redo A;
6118     } elsif ($self->{nc} == -1) { ## nc of -1: no more input.
6119     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6120     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6121     ## Reconsume.
6122     redo A;
6123     } else {
6124     ## XML5: [ATTLIST] Not defined yet.
6125     $self->{ct}->{name} .= chr $self->{nc}; ## First character of the name; no name-character validation is done here.
6126     $self->{state} = MD_NAME_STATE;
6127    
6128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6129     $self->{line_prev} = $self->{line};
6130     $self->{column_prev} = $self->{column};
6131     $self->{column}++;
6132     $self->{nc}
6133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6134     } else {
6135     $self->{set_nc}->($self);
6136     }
6137    
6138     redo A;
6139     }
6140     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) { ## Right after "%" in an ENTITY declaration: white space must follow for a parameter entity.
6141     if ($is_space->{$self->{nc}}) {
6142     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6143     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN; ## Retypes the pending general entity token in place.
6144     $self->{state} = BEFORE_MD_NAME_STATE;
6145 wakaba 1.8
6146 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6147     $self->{line_prev} = $self->{line};
6148     $self->{column_prev} = $self->{column};
6149     $self->{column}++;
6150     $self->{nc}
6151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6152     } else {
6153     $self->{set_nc}->($self);
6154     }
6155    
6156     redo A;
6157     } elsif ($self->{nc} == 0x003E) { # >
6158     ## XML5: Same as "Anything else".
6159     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6160     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## The declaration is dropped; no token is returned.
6161    
6162     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6163     $self->{line_prev} = $self->{line};
6164     $self->{column_prev} = $self->{column};
6165     $self->{column}++;
6166     $self->{nc}
6167     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6168     } else {
6169     $self->{set_nc}->($self);
6170     }
6171    
6172     redo A;
6173     } elsif ($self->{nc} == -1) { ## nc of -1: no more input.
6174     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6175     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6176     ## Reconsume.
6177     redo A;
6178     } else {
6179     ## XML5: No parse error.
6180     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6181     $self->{state} = BOGUS_COMMENT_STATE;
6182     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6183     ## Reconsume.
6184     redo A;
6185     }
6186     } elsif ($self->{state} == MD_NAME_STATE) { ## Accumulates the declaration name into ct->{name} until white space, ">" or end of input.
6187     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6188    
6189     if ($is_space->{$self->{nc}}) {
6190 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) { ## The state after the name depends on the declaration type.
6191     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6192     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6193 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6194 wakaba 1.16 } else { # ENTITY/NOTATION
6195     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6196     }
6197 wakaba 1.14
6198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6199     $self->{line_prev} = $self->{line};
6200     $self->{column_prev} = $self->{column};
6201     $self->{column}++;
6202     $self->{nc}
6203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6204     } else {
6205     $self->{set_nc}->($self);
6206     }
6207    
6208     redo A;
6209     } elsif ($self->{nc} == 0x003E) { # >
6210     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6211     # ATTLIST alone raises no error when ">" directly follows the name.
6212     } else {
6213 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6214 wakaba 1.14 }
6215     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6216    
6217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6218     $self->{line_prev} = $self->{line};
6219     $self->{column_prev} = $self->{column};
6220     $self->{column}++;
6221     $self->{nc}
6222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6223     } else {
6224     $self->{set_nc}->($self);
6225     }
6226    
6227     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6228     redo A; ## Not reached; the return above always exits first.
6229     } elsif ($self->{nc} == -1) { ## nc of -1: no more input; the token is still returned with the name read so far.
6230     ## XML5: [ATTLIST] No parse error.
6231     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6232     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6233     ## Reconsume.
6234     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6235     redo A; ## Not reached.
6236     } else {
6237     ## XML5: [ATTLIST] Not defined yet.
6238     $self->{ct}->{name} .= chr $self->{nc}; ## Any other character extends the name; no name-character validation is done here.
6239     ## Stay in the state.
6240    
6241     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6242     $self->{line_prev} = $self->{line};
6243     $self->{column_prev} = $self->{column};
6244     $self->{column}++;
6245     $self->{nc}
6246     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6247     } else {
6248     $self->{set_nc}->($self);
6249     }
6250    
6251     redo A;
6252     }
6253     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) { ## After the element name of an ATTLIST declaration: skips white space, then ends the declaration or starts an attribute definition.
6254     if ($is_space->{$self->{nc}}) {
6255     ## Stay in the state.
6256    
6257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6258     $self->{line_prev} = $self->{line};
6259     $self->{column_prev} = $self->{column};
6260     $self->{column}++;
6261     $self->{nc}
6262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6263     } else {
6264     $self->{set_nc}->($self);
6265     }
6266    
6267     redo A;
6268     } elsif ($self->{nc} == 0x003E) { # >
6269     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6270    
6271     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6272     $self->{line_prev} = $self->{line};
6273     $self->{column_prev} = $self->{column};
6274     $self->{column}++;
6275     $self->{nc}
6276     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6277     } else {
6278     $self->{set_nc}->($self);
6279     }
6280    
6281     return ($self->{ct}); # ATTLIST
6282     redo A; ## Not reached; the return above always exits first.
6283     } elsif ($self->{nc} == -1) { ## nc of -1: no more input.
6284     ## XML5: No parse error.
6285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6286     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6287 wakaba 1.15 return ($self->{ct}); ## nc is left untouched, so the next call sees the same -1 again.
6288 wakaba 1.14 redo A; ## Not reached.
6289     } else {
6290     ## XML5: Not defined yet.
6291 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6292     tokens => [],
6293     line => $self->{line}, column => $self->{column}}; ## Position of the current character, i.e. the first character of the attribute name.
6294     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6295    
6296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6297     $self->{line_prev} = $self->{line};
6298     $self->{column_prev} = $self->{column};
6299     $self->{column}++;
6300     $self->{nc}
6301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6302     } else {
6303     $self->{set_nc}->($self);
6304     }
6305    
6306     redo A;
6307     }
6308     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6309     if ($is_space->{$self->{nc}}) {
6310     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6311    
6312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6313     $self->{line_prev} = $self->{line};
6314     $self->{column_prev} = $self->{column};
6315     $self->{column}++;
6316     $self->{nc}
6317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6318     } else {
6319     $self->{set_nc}->($self);
6320     }
6321    
6322     redo A;
6323     } elsif ($self->{nc} == 0x003E) { # >
6324     ## XML5: Same as "anything else".
6325     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6326     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6327    
6328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6329     $self->{line_prev} = $self->{line};
6330     $self->{column_prev} = $self->{column};
6331     $self->{column}++;
6332     $self->{nc}
6333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6334     } else {
6335     $self->{set_nc}->($self);
6336     }
6337    
6338     return ($self->{ct}); # ATTLIST
6339     redo A;
6340     } elsif ($self->{nc} == 0x0028) { # (
6341     ## XML5: Same as "anything else".
6342     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6343     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6344    
6345     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6346     $self->{line_prev} = $self->{line};
6347     $self->{column_prev} = $self->{column};
6348     $self->{column}++;
6349     $self->{nc}
6350     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6351     } else {
6352     $self->{set_nc}->($self);
6353     }
6354    
6355     redo A;
6356     } elsif ($self->{nc} == -1) {
6357     ## XML5: No parse error.
6358     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6359     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6360    
6361     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6362     $self->{line_prev} = $self->{line};
6363     $self->{column_prev} = $self->{column};
6364     $self->{column}++;
6365     $self->{nc}
6366     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6367     } else {
6368     $self->{set_nc}->($self);
6369     }
6370    
6371     return ($self->{ct}); # ATTLIST
6372     redo A;
6373     } else {
6374     ## XML5: Not defined yet.
6375     $self->{ca}->{name} .= chr $self->{nc};
6376     ## Stay in the state.
6377    
6378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6379     $self->{line_prev} = $self->{line};
6380     $self->{column_prev} = $self->{column};
6381     $self->{column}++;
6382     $self->{nc}
6383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6384     } else {
6385     $self->{set_nc}->($self);
6386     }
6387    
6388 wakaba 1.14 redo A;
6389     }
6390 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6391     if ($is_space->{$self->{nc}}) {
6392     ## Stay in the state.
6393    
6394     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6395     $self->{line_prev} = $self->{line};
6396     $self->{column_prev} = $self->{column};
6397     $self->{column}++;
6398     $self->{nc}
6399     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6400     } else {
6401     $self->{set_nc}->($self);
6402     }
6403    
6404     redo A;
6405     } elsif ($self->{nc} == 0x003E) { # >
6406     ## XML5: Same as "anything else".
6407     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6408     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6409    
6410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6411     $self->{line_prev} = $self->{line};
6412     $self->{column_prev} = $self->{column};
6413     $self->{column}++;
6414     $self->{nc}
6415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6416     } else {
6417     $self->{set_nc}->($self);
6418     }
6419    
6420     return ($self->{ct}); # ATTLIST
6421     redo A;
6422     } elsif ($self->{nc} == 0x0028) { # (
6423     ## XML5: Same as "anything else".
6424     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6425    
6426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6427     $self->{line_prev} = $self->{line};
6428     $self->{column_prev} = $self->{column};
6429     $self->{column}++;
6430     $self->{nc}
6431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6432     } else {
6433     $self->{set_nc}->($self);
6434     }
6435    
6436     redo A;
6437     } elsif ($self->{nc} == -1) {
6438     ## XML5: No parse error.
6439     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6440     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6441    
6442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6443     $self->{line_prev} = $self->{line};
6444     $self->{column_prev} = $self->{column};
6445     $self->{column}++;
6446     $self->{nc}
6447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6448     } else {
6449     $self->{set_nc}->($self);
6450     }
6451    
6452     return ($self->{ct});
6453     redo A;
6454     } else {
6455     ## XML5: Not defined yet.
6456     $self->{ca}->{type} = chr $self->{nc};
6457     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6458    
6459     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6460     $self->{line_prev} = $self->{line};
6461     $self->{column_prev} = $self->{column};
6462     $self->{column}++;
6463     $self->{nc}
6464     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6465     } else {
6466     $self->{set_nc}->($self);
6467     }
6468    
6469     redo A;
6470     }
6471     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6472     if ($is_space->{$self->{nc}}) {
6473     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6474    
6475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6476     $self->{line_prev} = $self->{line};
6477     $self->{column_prev} = $self->{column};
6478     $self->{column}++;
6479     $self->{nc}
6480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6481     } else {
6482     $self->{set_nc}->($self);
6483     }
6484    
6485     redo A;
6486     } elsif ($self->{nc} == 0x0023) { # #
6487     ## XML5: Same as "anything else".
6488     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6489     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6490    
6491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6492     $self->{line_prev} = $self->{line};
6493     $self->{column_prev} = $self->{column};
6494     $self->{column}++;
6495     $self->{nc}
6496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6497     } else {
6498     $self->{set_nc}->($self);
6499     }
6500    
6501     redo A;
6502     } elsif ($self->{nc} == 0x0022) { # "
6503     ## XML5: Same as "anything else".
6504     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6505     $self->{ca}->{value} = '';
6506     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6507    
6508     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6509     $self->{line_prev} = $self->{line};
6510     $self->{column_prev} = $self->{column};
6511     $self->{column}++;
6512     $self->{nc}
6513     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6514     } else {
6515     $self->{set_nc}->($self);
6516     }
6517    
6518     redo A;
6519     } elsif ($self->{nc} == 0x0027) { # '
6520     ## XML5: Same as "anything else".
6521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6522     $self->{ca}->{value} = '';
6523     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6524    
6525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6526     $self->{line_prev} = $self->{line};
6527     $self->{column_prev} = $self->{column};
6528     $self->{column}++;
6529     $self->{nc}
6530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6531     } else {
6532     $self->{set_nc}->($self);
6533     }
6534    
6535     redo A;
6536     } elsif ($self->{nc} == 0x003E) { # >
6537     ## XML5: Same as "anything else".
6538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6540    
6541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6542     $self->{line_prev} = $self->{line};
6543     $self->{column_prev} = $self->{column};
6544     $self->{column}++;
6545     $self->{nc}
6546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6547     } else {
6548     $self->{set_nc}->($self);
6549     }
6550    
6551     return ($self->{ct}); # ATTLIST
6552     redo A;
6553     } elsif ($self->{nc} == 0x0028) { # (
6554     ## XML5: Same as "anything else".
6555     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6556     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6557    
6558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6559     $self->{line_prev} = $self->{line};
6560     $self->{column_prev} = $self->{column};
6561     $self->{column}++;
6562     $self->{nc}
6563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6564     } else {
6565     $self->{set_nc}->($self);
6566     }
6567    
6568     redo A;
6569     } elsif ($self->{nc} == -1) {
6570     ## XML5: No parse error.
6571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6572     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6573    
6574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6575     $self->{line_prev} = $self->{line};
6576     $self->{column_prev} = $self->{column};
6577     $self->{column}++;
6578     $self->{nc}
6579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6580     } else {
6581     $self->{set_nc}->($self);
6582     }
6583    
6584     return ($self->{ct});
6585     redo A;
6586     } else {
6587     ## XML5: Not defined yet.
6588     $self->{ca}->{type} .= chr $self->{nc};
6589     ## Stay in the state.
6590    
6591     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6592     $self->{line_prev} = $self->{line};
6593     $self->{column_prev} = $self->{column};
6594     $self->{column}++;
6595     $self->{nc}
6596     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6597     } else {
6598     $self->{set_nc}->($self);
6599     }
6600    
6601     redo A;
6602     }
6603     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6604     if ($is_space->{$self->{nc}}) {
6605     ## Stay in the state.
6606    
6607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6608     $self->{line_prev} = $self->{line};
6609     $self->{column_prev} = $self->{column};
6610     $self->{column}++;
6611     $self->{nc}
6612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6613     } else {
6614     $self->{set_nc}->($self);
6615     }
6616    
6617     redo A;
6618     } elsif ($self->{nc} == 0x0028) { # (
6619     ## XML5: Same as "anything else".
6620     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6621    
6622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6623     $self->{line_prev} = $self->{line};
6624     $self->{column_prev} = $self->{column};
6625     $self->{column}++;
6626     $self->{nc}
6627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6628     } else {
6629     $self->{set_nc}->($self);
6630     }
6631    
6632     redo A;
6633     } elsif ($self->{nc} == 0x0023) { # #
6634     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6635    
6636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6637     $self->{line_prev} = $self->{line};
6638     $self->{column_prev} = $self->{column};
6639     $self->{column}++;
6640     $self->{nc}
6641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6642     } else {
6643     $self->{set_nc}->($self);
6644     }
6645    
6646     redo A;
6647     } elsif ($self->{nc} == 0x0022) { # "
6648     ## XML5: Same as "anything else".
6649     $self->{ca}->{value} = '';
6650     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6651    
6652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6653     $self->{line_prev} = $self->{line};
6654     $self->{column_prev} = $self->{column};
6655     $self->{column}++;
6656     $self->{nc}
6657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6658     } else {
6659     $self->{set_nc}->($self);
6660     }
6661    
6662     redo A;
6663     } elsif ($self->{nc} == 0x0027) { # '
6664     ## XML5: Same as "anything else".
6665     $self->{ca}->{value} = '';
6666     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6667    
6668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6669     $self->{line_prev} = $self->{line};
6670     $self->{column_prev} = $self->{column};
6671     $self->{column}++;
6672     $self->{nc}
6673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6674     } else {
6675     $self->{set_nc}->($self);
6676     }
6677    
6678     redo A;
6679     } elsif ($self->{nc} == 0x003E) { # >
6680     ## XML5: Same as "anything else".
6681     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6682     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6683    
6684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6685     $self->{line_prev} = $self->{line};
6686     $self->{column_prev} = $self->{column};
6687     $self->{column}++;
6688     $self->{nc}
6689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6690     } else {
6691     $self->{set_nc}->($self);
6692     }
6693    
6694     return ($self->{ct}); # ATTLIST
6695     redo A;
6696     } elsif ($self->{nc} == -1) {
6697     ## XML5: No parse error.
6698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6699     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6700    
6701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6702     $self->{line_prev} = $self->{line};
6703     $self->{column_prev} = $self->{column};
6704     $self->{column}++;
6705     $self->{nc}
6706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6707     } else {
6708     $self->{set_nc}->($self);
6709     }
6710    
6711     return ($self->{ct});
6712     redo A;
6713     } else {
6714     ## XML5: Switch to the "DOCTYPE bogus comment state".
6715     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6716     $self->{ca}->{value} = '';
6717     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6718     ## Reconsume.
6719     redo A;
6720     }
6721     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6722     if ($is_space->{$self->{nc}}) {
6723     ## Stay in the state.
6724    
6725     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6726     $self->{line_prev} = $self->{line};
6727     $self->{column_prev} = $self->{column};
6728     $self->{column}++;
6729     $self->{nc}
6730     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6731     } else {
6732     $self->{set_nc}->($self);
6733     }
6734    
6735     redo A;
6736     } elsif ($self->{nc} == 0x007C) { # |
6737     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6738     ## Stay in the state.
6739    
6740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6741     $self->{line_prev} = $self->{line};
6742     $self->{column_prev} = $self->{column};
6743     $self->{column}++;
6744     $self->{nc}
6745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6746     } else {
6747     $self->{set_nc}->($self);
6748     }
6749    
6750     redo A;
6751     } elsif ($self->{nc} == 0x0029) { # )
6752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6753     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6754    
6755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6756     $self->{line_prev} = $self->{line};
6757     $self->{column_prev} = $self->{column};
6758     $self->{column}++;
6759     $self->{nc}
6760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6761     } else {
6762     $self->{set_nc}->($self);
6763     }
6764    
6765     redo A;
6766     } elsif ($self->{nc} == 0x003E) { # >
6767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6768     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6769    
6770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6771     $self->{line_prev} = $self->{line};
6772     $self->{column_prev} = $self->{column};
6773     $self->{column}++;
6774     $self->{nc}
6775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6776     } else {
6777     $self->{set_nc}->($self);
6778     }
6779    
6780     return ($self->{ct}); # ATTLIST
6781     redo A;
6782     } elsif ($self->{nc} == -1) {
6783     ## XML5: No parse error.
6784     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6785     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6786    
6787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6788     $self->{line_prev} = $self->{line};
6789     $self->{column_prev} = $self->{column};
6790     $self->{column}++;
6791     $self->{nc}
6792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6793     } else {
6794     $self->{set_nc}->($self);
6795     }
6796    
6797     return ($self->{ct});
6798     redo A;
6799     } else {
6800     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6801     $self->{state} = ALLOWED_TOKEN_STATE;
6802    
6803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6804     $self->{line_prev} = $self->{line};
6805     $self->{column_prev} = $self->{column};
6806     $self->{column}++;
6807     $self->{nc}
6808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6809     } else {
6810     $self->{set_nc}->($self);
6811     }
6812    
6813     redo A;
6814     }
6815     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6816     if ($is_space->{$self->{nc}}) {
6817     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6818    
6819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6820     $self->{line_prev} = $self->{line};
6821     $self->{column_prev} = $self->{column};
6822     $self->{column}++;
6823     $self->{nc}
6824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6825     } else {
6826     $self->{set_nc}->($self);
6827     }
6828    
6829     redo A;
6830     } elsif ($self->{nc} == 0x007C) { # |
6831     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6832    
6833     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6834     $self->{line_prev} = $self->{line};
6835     $self->{column_prev} = $self->{column};
6836     $self->{column}++;
6837     $self->{nc}
6838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6839     } else {
6840     $self->{set_nc}->($self);
6841     }
6842    
6843     redo A;
6844     } elsif ($self->{nc} == 0x0029) { # )
6845     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6846    
6847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6848     $self->{line_prev} = $self->{line};
6849     $self->{column_prev} = $self->{column};
6850     $self->{column}++;
6851     $self->{nc}
6852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6853     } else {
6854     $self->{set_nc}->($self);
6855     }
6856    
6857     redo A;
6858     } elsif ($self->{nc} == 0x003E) { # >
6859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6860     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6861    
6862     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6863     $self->{line_prev} = $self->{line};
6864     $self->{column_prev} = $self->{column};
6865     $self->{column}++;
6866     $self->{nc}
6867     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6868     } else {
6869     $self->{set_nc}->($self);
6870     }
6871    
6872     return ($self->{ct}); # ATTLIST
6873     redo A;
6874     } elsif ($self->{nc} == -1) {
6875     ## XML5: No parse error.
6876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6877     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6878    
6879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6880     $self->{line_prev} = $self->{line};
6881     $self->{column_prev} = $self->{column};
6882     $self->{column}++;
6883     $self->{nc}
6884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6885     } else {
6886     $self->{set_nc}->($self);
6887     }
6888    
6889     return ($self->{ct});
6890     redo A;
6891     } else {
6892     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6893     ## Stay in the state.
6894    
6895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896     $self->{line_prev} = $self->{line};
6897     $self->{column_prev} = $self->{column};
6898     $self->{column}++;
6899     $self->{nc}
6900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901     } else {
6902     $self->{set_nc}->($self);
6903     }
6904    
6905     redo A;
6906     }
6907     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6908     if ($is_space->{$self->{nc}}) {
6909     ## Stay in the state.
6910    
6911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912     $self->{line_prev} = $self->{line};
6913     $self->{column_prev} = $self->{column};
6914     $self->{column}++;
6915     $self->{nc}
6916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917     } else {
6918     $self->{set_nc}->($self);
6919     }
6920    
6921     redo A;
6922     } elsif ($self->{nc} == 0x007C) { # |
6923     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6924    
6925     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6926     $self->{line_prev} = $self->{line};
6927     $self->{column_prev} = $self->{column};
6928     $self->{column}++;
6929     $self->{nc}
6930     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6931     } else {
6932     $self->{set_nc}->($self);
6933     }
6934    
6935     redo A;
6936     } elsif ($self->{nc} == 0x0029) { # )
6937     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6938    
6939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6940     $self->{line_prev} = $self->{line};
6941     $self->{column_prev} = $self->{column};
6942     $self->{column}++;
6943     $self->{nc}
6944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6945     } else {
6946     $self->{set_nc}->($self);
6947     }
6948    
6949     redo A;
6950     } elsif ($self->{nc} == 0x003E) { # >
6951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6952     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6953    
6954     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6955     $self->{line_prev} = $self->{line};
6956     $self->{column_prev} = $self->{column};
6957     $self->{column}++;
6958     $self->{nc}
6959     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6960     } else {
6961     $self->{set_nc}->($self);
6962     }
6963    
6964     return ($self->{ct}); # ATTLIST
6965     redo A;
6966     } elsif ($self->{nc} == -1) {
6967     ## XML5: No parse error.
6968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6970    
6971     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6972     $self->{line_prev} = $self->{line};
6973     $self->{column_prev} = $self->{column};
6974     $self->{column}++;
6975     $self->{nc}
6976     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6977     } else {
6978     $self->{set_nc}->($self);
6979     }
6980    
6981     return ($self->{ct});
6982     redo A;
6983     } else {
6984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6985     line => $self->{line_prev},
6986     column => $self->{column_prev});
6987     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6988     $self->{state} = ALLOWED_TOKEN_STATE;
6989    
6990     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6991     $self->{line_prev} = $self->{line};
6992     $self->{column_prev} = $self->{column};
6993     $self->{column}++;
6994     $self->{nc}
6995     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6996     } else {
6997     $self->{set_nc}->($self);
6998     }
6999    
7000     redo A;
7001     }
7002     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7003     if ($is_space->{$self->{nc}}) {
7004     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7005    
7006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7007     $self->{line_prev} = $self->{line};
7008     $self->{column_prev} = $self->{column};
7009     $self->{column}++;
7010     $self->{nc}
7011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7012     } else {
7013     $self->{set_nc}->($self);
7014     }
7015    
7016     redo A;
7017     } elsif ($self->{nc} == 0x0023) { # #
7018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7019     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7020    
7021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7022     $self->{line_prev} = $self->{line};
7023     $self->{column_prev} = $self->{column};
7024     $self->{column}++;
7025     $self->{nc}
7026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7027     } else {
7028     $self->{set_nc}->($self);
7029     }
7030    
7031     redo A;
7032     } elsif ($self->{nc} == 0x0022) { # "
7033     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7034     $self->{ca}->{value} = '';
7035     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7036    
7037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7038     $self->{line_prev} = $self->{line};
7039     $self->{column_prev} = $self->{column};
7040     $self->{column}++;
7041     $self->{nc}
7042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7043     } else {
7044     $self->{set_nc}->($self);
7045     }
7046    
7047     redo A;
7048     } elsif ($self->{nc} == 0x0027) { # '
7049     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7050     $self->{ca}->{value} = '';
7051     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7052    
7053     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7054     $self->{line_prev} = $self->{line};
7055     $self->{column_prev} = $self->{column};
7056     $self->{column}++;
7057     $self->{nc}
7058     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7059     } else {
7060     $self->{set_nc}->($self);
7061     }
7062    
7063     redo A;
7064     } elsif ($self->{nc} == 0x003E) { # >
7065     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7066     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7067    
7068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069     $self->{line_prev} = $self->{line};
7070     $self->{column_prev} = $self->{column};
7071     $self->{column}++;
7072     $self->{nc}
7073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074     } else {
7075     $self->{set_nc}->($self);
7076     }
7077    
7078     return ($self->{ct}); # ATTLIST
7079     redo A;
7080     } elsif ($self->{nc} == -1) {
7081     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7082     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7083    
7084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085     $self->{line_prev} = $self->{line};
7086     $self->{column_prev} = $self->{column};
7087     $self->{column}++;
7088     $self->{nc}
7089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090     } else {
7091     $self->{set_nc}->($self);
7092     }
7093    
7094     return ($self->{ct});
7095     redo A;
7096     } else {
7097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7098     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7099     ## Reconsume.
7100     redo A;
7101     }
7102     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7103     if ($is_space->{$self->{nc}}) {
7104     ## Stay in the state.
7105    
7106     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7107     $self->{line_prev} = $self->{line};
7108     $self->{column_prev} = $self->{column};
7109     $self->{column}++;
7110     $self->{nc}
7111     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7112     } else {
7113     $self->{set_nc}->($self);
7114     }
7115    
7116     redo A;
7117     } elsif ($self->{nc} == 0x0023) { # #
7118     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7119    
7120     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7121     $self->{line_prev} = $self->{line};
7122     $self->{column_prev} = $self->{column};
7123     $self->{column}++;
7124     $self->{nc}
7125     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7126     } else {
7127     $self->{set_nc}->($self);
7128     }
7129    
7130     redo A;
7131     } elsif ($self->{nc} == 0x0022) { # "
7132     $self->{ca}->{value} = '';
7133     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7134    
7135     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7136     $self->{line_prev} = $self->{line};
7137     $self->{column_prev} = $self->{column};
7138     $self->{column}++;
7139     $self->{nc}
7140     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7141     } else {
7142     $self->{set_nc}->($self);
7143     }
7144    
7145     redo A;
7146     } elsif ($self->{nc} == 0x0027) { # '
7147     $self->{ca}->{value} = '';
7148     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7149    
7150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7151     $self->{line_prev} = $self->{line};
7152     $self->{column_prev} = $self->{column};
7153     $self->{column}++;
7154     $self->{nc}
7155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7156     } else {
7157     $self->{set_nc}->($self);
7158     }
7159    
7160     redo A;
7161     } elsif ($self->{nc} == 0x003E) { # >
7162     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7163     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7164    
7165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7166     $self->{line_prev} = $self->{line};
7167     $self->{column_prev} = $self->{column};
7168     $self->{column}++;
7169     $self->{nc}
7170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7171     } else {
7172     $self->{set_nc}->($self);
7173     }
7174    
7175     return ($self->{ct}); # ATTLIST
7176     redo A;
7177     } elsif ($self->{nc} == -1) {
7178     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7179     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7180    
7181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7182     $self->{line_prev} = $self->{line};
7183     $self->{column_prev} = $self->{column};
7184     $self->{column}++;
7185     $self->{nc}
7186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7187     } else {
7188     $self->{set_nc}->($self);
7189     }
7190    
7191     return ($self->{ct});
7192     redo A;
7193     } else {
7194     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7195     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7196     ## Reconsume.
7197     redo A;
7198     }
7199     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7200     if ($is_space->{$self->{nc}}) {
7201     ## XML5: No parse error.
7202     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7203 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7204 wakaba 1.15 ## Reconsume.
7205     redo A;
7206     } elsif ($self->{nc} == 0x0022) { # "
7207     ## XML5: Same as "anything else".
7208     $self->{ca}->{value} = '';
7209     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7210    
7211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7212     $self->{line_prev} = $self->{line};
7213     $self->{column_prev} = $self->{column};
7214     $self->{column}++;
7215     $self->{nc}
7216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7217     } else {
7218     $self->{set_nc}->($self);
7219     }
7220    
7221     redo A;
7222     } elsif ($self->{nc} == 0x0027) { # '
7223     ## XML5: Same as "anything else".
7224     $self->{ca}->{value} = '';
7225     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7226    
7227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228     $self->{line_prev} = $self->{line};
7229     $self->{column_prev} = $self->{column};
7230     $self->{column}++;
7231     $self->{nc}
7232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233     } else {
7234     $self->{set_nc}->($self);
7235     }
7236    
7237     redo A;
7238     } elsif ($self->{nc} == 0x003E) { # >
7239     ## XML5: Same as "anything else".
7240     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7241     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7242    
7243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244     $self->{line_prev} = $self->{line};
7245     $self->{column_prev} = $self->{column};
7246     $self->{column}++;
7247     $self->{nc}
7248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249     } else {
7250     $self->{set_nc}->($self);
7251     }
7252    
7253     return ($self->{ct}); # ATTLIST
7254     redo A;
7255     } elsif ($self->{nc} == -1) {
7256     ## XML5: No parse error.
7257     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7258     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7259    
7260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7261     $self->{line_prev} = $self->{line};
7262     $self->{column_prev} = $self->{column};
7263     $self->{column}++;
7264     $self->{nc}
7265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7266     } else {
7267     $self->{set_nc}->($self);
7268     }
7269    
7270     return ($self->{ct});
7271     redo A;
7272     } else {
7273     $self->{ca}->{default} = chr $self->{nc};
7274     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7275    
7276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7277     $self->{line_prev} = $self->{line};
7278     $self->{column_prev} = $self->{column};
7279     $self->{column}++;
7280     $self->{nc}
7281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7282     } else {
7283     $self->{set_nc}->($self);
7284     }
7285    
7286     redo A;
7287     }
7288     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7289     if ($is_space->{$self->{nc}}) {
7290     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7291    
7292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293     $self->{line_prev} = $self->{line};
7294     $self->{column_prev} = $self->{column};
7295     $self->{column}++;
7296     $self->{nc}
7297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298     } else {
7299     $self->{set_nc}->($self);
7300     }
7301    
7302     redo A;
7303     } elsif ($self->{nc} == 0x0022) { # "
7304     ## XML5: Same as "anything else".
7305     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7306     $self->{ca}->{value} = '';
7307     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7308    
7309     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7310     $self->{line_prev} = $self->{line};
7311     $self->{column_prev} = $self->{column};
7312     $self->{column}++;
7313     $self->{nc}
7314     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7315     } else {
7316     $self->{set_nc}->($self);
7317     }
7318    
7319     redo A;
7320     } elsif ($self->{nc} == 0x0027) { # '
7321     ## XML5: Same as "anything else".
7322     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7323     $self->{ca}->{value} = '';
7324     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7325    
7326     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7327     $self->{line_prev} = $self->{line};
7328     $self->{column_prev} = $self->{column};
7329     $self->{column}++;
7330     $self->{nc}
7331     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7332     } else {
7333     $self->{set_nc}->($self);
7334     }
7335    
7336     redo A;
7337     } elsif ($self->{nc} == 0x003E) { # >
7338     ## XML5: Same as "anything else".
7339     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7340     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7341    
7342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7343     $self->{line_prev} = $self->{line};
7344     $self->{column_prev} = $self->{column};
7345     $self->{column}++;
7346     $self->{nc}
7347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7348     } else {
7349     $self->{set_nc}->($self);
7350     }
7351    
7352     return ($self->{ct}); # ATTLIST
7353     redo A;
7354     } elsif ($self->{nc} == -1) {
7355     ## XML5: No parse error.
7356     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7357     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7358     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7359    
7360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7361     $self->{line_prev} = $self->{line};
7362     $self->{column_prev} = $self->{column};
7363     $self->{column}++;
7364     $self->{nc}
7365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7366     } else {
7367     $self->{set_nc}->($self);
7368     }
7369    
7370     return ($self->{ct});
7371     redo A;
7372     } else {
7373     $self->{ca}->{default} .= chr $self->{nc};
7374     ## Stay in the state.
7375    
7376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7377     $self->{line_prev} = $self->{line};
7378     $self->{column_prev} = $self->{column};
7379     $self->{column}++;
7380     $self->{nc}
7381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7382     } else {
7383     $self->{set_nc}->($self);
7384     }
7385    
7386     redo A;
7387     }
7388     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7389     if ($is_space->{$self->{nc}}) {
7390     ## Stay in the state.
7391    
7392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7393     $self->{line_prev} = $self->{line};
7394     $self->{column_prev} = $self->{column};
7395     $self->{column}++;
7396     $self->{nc}
7397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7398     } else {
7399     $self->{set_nc}->($self);
7400     }
7401    
7402     redo A;
7403     } elsif ($self->{nc} == 0x0022) { # "
7404     $self->{ca}->{value} = '';
7405     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7406    
7407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7408     $self->{line_prev} = $self->{line};
7409     $self->{column_prev} = $self->{column};
7410     $self->{column}++;
7411     $self->{nc}
7412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7413     } else {
7414     $self->{set_nc}->($self);
7415     }
7416    
7417     redo A;
7418     } elsif ($self->{nc} == 0x0027) { # '
7419     $self->{ca}->{value} = '';
7420     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7421    
7422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423     $self->{line_prev} = $self->{line};
7424     $self->{column_prev} = $self->{column};
7425     $self->{column}++;
7426     $self->{nc}
7427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428     } else {
7429     $self->{set_nc}->($self);
7430     }
7431    
7432     redo A;
7433     } elsif ($self->{nc} == 0x003E) { # >
7434     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7435     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7436    
7437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7438     $self->{line_prev} = $self->{line};
7439     $self->{column_prev} = $self->{column};
7440     $self->{column}++;
7441     $self->{nc}
7442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7443     } else {
7444     $self->{set_nc}->($self);
7445     }
7446    
7447     return ($self->{ct}); # ATTLIST
7448     redo A;
7449     } elsif ($self->{nc} == -1) {
7450     ## XML5: No parse error.
7451     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7452     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7453     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7454    
7455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7456     $self->{line_prev} = $self->{line};
7457     $self->{column_prev} = $self->{column};
7458     $self->{column}++;
7459     $self->{nc}
7460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7461     } else {
7462     $self->{set_nc}->($self);
7463     }
7464    
7465     return ($self->{ct});
7466     redo A;
7467     } else {
7468     ## XML5: Not defined yet.
7469     if ($self->{ca}->{default} eq 'FIXED') {
7470     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7471     } else {
7472     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7473     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7474     }
7475     ## Reconsume.
7476     redo A;
7477     }
7478     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7479     if ($is_space->{$self->{nc}} or
7480     $self->{nc} == -1 or
7481     $self->{nc} == 0x003E) { # >
7482     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7483     ## Reconsume.
7484     redo A;
7485     } else {
7486     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7487     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7488     ## Reconsume.
7489     redo A;
7490 wakaba 1.16 }
7491 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7492     ## ASCII case-insensitive
7493     if ($self->{nc} == [
7494     undef,
7495     0x0044, # D
7496     0x0041, # A
7497     0x0054, # T
7498     ]->[length $self->{kwd}] or
7499     $self->{nc} == [
7500     undef,
7501     0x0064, # d
7502     0x0061, # a
7503     0x0074, # t
7504     ]->[length $self->{kwd}]) {
7505    
7506     ## Stay in the state.
7507     $self->{kwd} .= chr $self->{nc};
7508    
7509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7510     $self->{line_prev} = $self->{line};
7511     $self->{column_prev} = $self->{column};
7512     $self->{column}++;
7513     $self->{nc}
7514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7515     } else {
7516     $self->{set_nc}->($self);
7517     }
7518    
7519     redo A;
7520     } elsif ((length $self->{kwd}) == 4 and
7521     ($self->{nc} == 0x0041 or # A
7522     $self->{nc} == 0x0061)) { # a
7523     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7524    
7525     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7526     text => 'NDATA',
7527     line => $self->{line_prev},
7528     column => $self->{column_prev} - 4);
7529     } else {
7530    
7531     }
7532     $self->{state} = AFTER_NDATA_STATE;
7533    
7534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7535     $self->{line_prev} = $self->{line};
7536     $self->{column_prev} = $self->{column};
7537     $self->{column}++;
7538     $self->{nc}
7539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7540     } else {
7541     $self->{set_nc}->($self);
7542     }
7543    
7544     redo A;
7545     } else {
7546     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7547     line => $self->{line_prev},
7548     column => $self->{column_prev} + 1
7549     - length $self->{kwd});
7550    
7551     $self->{state} = BOGUS_MD_STATE;
7552     ## Reconsume.
7553     redo A;
7554     }
7555     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7556     if ($is_space->{$self->{nc}}) {
7557     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7558    
7559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560     $self->{line_prev} = $self->{line};
7561     $self->{column_prev} = $self->{column};
7562     $self->{column}++;
7563     $self->{nc}
7564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565     } else {
7566     $self->{set_nc}->($self);
7567     }
7568    
7569     redo A;
7570     } elsif ($self->{nc} == 0x003E) { # >
7571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7572     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7573    
7574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7575     $self->{line_prev} = $self->{line};
7576     $self->{column_prev} = $self->{column};
7577     $self->{column}++;
7578     $self->{nc}
7579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7580     } else {
7581     $self->{set_nc}->($self);
7582     }
7583    
7584     return ($self->{ct}); # ENTITY
7585     redo A;
7586     } elsif ($self->{nc} == -1) {
7587     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7588     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7589    
7590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7591     $self->{line_prev} = $self->{line};
7592     $self->{column_prev} = $self->{column};
7593     $self->{column}++;
7594     $self->{nc}
7595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7596     } else {
7597     $self->{set_nc}->($self);
7598     }
7599    
7600     return ($self->{ct}); # ENTITY
7601     redo A;
7602     } else {
7603     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7604     line => $self->{line_prev},
7605     column => $self->{column_prev} + 1
7606     - length $self->{kwd});
7607     $self->{state} = BOGUS_MD_STATE;
7608     ## Reconsume.
7609     redo A;
7610     }
7611     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7612     if ($is_space->{$self->{nc}}) {
7613     ## Stay in the state.
7614    
7615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7616     $self->{line_prev} = $self->{line};
7617     $self->{column_prev} = $self->{column};
7618     $self->{column}++;
7619     $self->{nc}
7620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7621     } else {
7622     $self->{set_nc}->($self);
7623     }
7624    
7625     redo A;
7626     } elsif ($self->{nc} == 0x003E) { # >
7627     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7628     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7629    
7630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7631     $self->{line_prev} = $self->{line};
7632     $self->{column_prev} = $self->{column};
7633     $self->{column}++;
7634     $self->{nc}
7635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7636     } else {
7637     $self->{set_nc}->($self);
7638     }
7639    
7640     return ($self->{ct}); # ENTITY
7641     redo A;
7642     } elsif ($self->{nc} == -1) {
7643     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7644     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7645    
7646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7647     $self->{line_prev} = $self->{line};
7648     $self->{column_prev} = $self->{column};
7649     $self->{column}++;
7650     $self->{nc}
7651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7652     } else {
7653     $self->{set_nc}->($self);
7654     }
7655    
7656     return ($self->{ct}); # ENTITY
7657     redo A;
7658     } else {
7659     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7660     $self->{state} = NOTATION_NAME_STATE;
7661    
7662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7663     $self->{line_prev} = $self->{line};
7664     $self->{column_prev} = $self->{column};
7665     $self->{column}++;
7666     $self->{nc}
7667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7668     } else {
7669     $self->{set_nc}->($self);
7670     }
7671    
7672     redo A;
7673     }
7674     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7675     if ($is_space->{$self->{nc}}) {
7676 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7677 wakaba 1.18
7678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7679     $self->{line_prev} = $self->{line};
7680     $self->{column_prev} = $self->{column};
7681     $self->{column}++;
7682     $self->{nc}
7683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7684     } else {
7685     $self->{set_nc}->($self);
7686     }
7687    
7688     redo A;
7689     } elsif ($self->{nc} == 0x003E) { # >
7690     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7691    
7692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693     $self->{line_prev} = $self->{line};
7694     $self->{column_prev} = $self->{column};
7695     $self->{column}++;
7696     $self->{nc}
7697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698     } else {
7699     $self->{set_nc}->($self);
7700     }
7701    
7702     return ($self->{ct}); # ENTITY
7703     redo A;
7704     } elsif ($self->{nc} == -1) {
7705     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7707    
7708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709     $self->{line_prev} = $self->{line};
7710     $self->{column_prev} = $self->{column};
7711     $self->{column}++;
7712     $self->{nc}
7713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714     } else {
7715     $self->{set_nc}->($self);
7716     }
7717    
7718     return ($self->{ct}); # ENTITY
7719     redo A;
7720     } else {
7721     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7722     ## Stay in the state.
7723    
7724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725     $self->{line_prev} = $self->{line};
7726     $self->{column_prev} = $self->{column};
7727     $self->{column}++;
7728     $self->{nc}
7729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730     } else {
7731     $self->{set_nc}->($self);
7732     }
7733    
7734     redo A;
7735     }
7736 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7737     if ($self->{nc} == 0x0022) { # "
7738 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7739 wakaba 1.19
7740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7741     $self->{line_prev} = $self->{line};
7742     $self->{column_prev} = $self->{column};
7743     $self->{column}++;
7744     $self->{nc}
7745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7746     } else {
7747     $self->{set_nc}->($self);
7748     }
7749    
7750     redo A;
7751     } elsif ($self->{nc} == 0x0026) { # &
7752     $self->{prev_state} = $self->{state};
7753     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7754     $self->{entity_add} = 0x0022; # "
7755    
7756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7757     $self->{line_prev} = $self->{line};
7758     $self->{column_prev} = $self->{column};
7759     $self->{column}++;
7760     $self->{nc}
7761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7762     } else {
7763     $self->{set_nc}->($self);
7764     }
7765    
7766     redo A;
7767     ## TODO: %
7768     } elsif ($self->{nc} == -1) {
7769     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7771     ## Reconsume.
7772     return ($self->{ct}); # ENTITY
7773     redo A;
7774     } else {
7775     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7776    
7777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7778     $self->{line_prev} = $self->{line};
7779     $self->{column_prev} = $self->{column};
7780     $self->{column}++;
7781     $self->{nc}
7782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7783     } else {
7784     $self->{set_nc}->($self);
7785     }
7786    
7787     redo A;
7788     }
7789     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7790     if ($self->{nc} == 0x0027) { # '
7791 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7792 wakaba 1.19
7793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7794     $self->{line_prev} = $self->{line};
7795     $self->{column_prev} = $self->{column};
7796     $self->{column}++;
7797     $self->{nc}
7798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7799     } else {
7800     $self->{set_nc}->($self);
7801     }
7802    
7803     redo A;
7804     } elsif ($self->{nc} == 0x0026) { # &
7805     $self->{prev_state} = $self->{state};
7806     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7807     $self->{entity_add} = 0x0027; # '
7808    
7809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7810     $self->{line_prev} = $self->{line};
7811     $self->{column_prev} = $self->{column};
7812     $self->{column}++;
7813     $self->{nc}
7814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7815     } else {
7816     $self->{set_nc}->($self);
7817     }
7818    
7819     redo A;
7820     ## TODO: %
7821     } elsif ($self->{nc} == -1) {
7822     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7823     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7824     ## Reconsume.
7825     return ($self->{ct}); # ENTITY
7826     redo A;
7827     } else {
7828     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7829    
7830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7831     $self->{line_prev} = $self->{line};
7832     $self->{column_prev} = $self->{column};
7833     $self->{column}++;
7834     $self->{nc}
7835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7836     } else {
7837     $self->{set_nc}->($self);
7838     }
7839    
7840     redo A;
7841     }
7842     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7843     ## TODO: XMLize
7844    
7845     if ($is_space->{$self->{nc}} or
7846     {
7847     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7848     $self->{entity_add} => 1,
7849     }->{$self->{nc}}) {
7850     ## Don't consume
7851     ## No error
7852     ## Return nothing.
7853     #
7854     } elsif ($self->{nc} == 0x0023) { # #
7855     $self->{ca} = $self->{ct};
7856     $self->{state} = ENTITY_HASH_STATE;
7857     $self->{kwd} = '#';
7858    
7859     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7860     $self->{line_prev} = $self->{line};
7861     $self->{column_prev} = $self->{column};
7862     $self->{column}++;
7863     $self->{nc}
7864     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7865     } else {
7866     $self->{set_nc}->($self);
7867     }
7868    
7869     redo A;
7870     } elsif ((0x0041 <= $self->{nc} and
7871     $self->{nc} <= 0x005A) or # A..Z
7872     (0x0061 <= $self->{nc} and
7873     $self->{nc} <= 0x007A)) { # a..z
7874     #
7875     } else {
7876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7877     ## Return nothing.
7878     #
7879     }
7880    
7881     $self->{ct}->{value} .= '&';
7882     $self->{state} = $self->{prev_state};
7883     ## Reconsume.
7884     redo A;
7885 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7886     if ($is_space->{$self->{nc}}) {
7887     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7888    
7889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7890     $self->{line_prev} = $self->{line};
7891     $self->{column_prev} = $self->{column};
7892     $self->{column}++;
7893     $self->{nc}
7894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7895     } else {
7896     $self->{set_nc}->($self);
7897     }
7898    
7899     redo A;
7900     } elsif ($self->{nc} == 0x0028) { # (
7901     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7902     $self->{ct}->{content} = ['('];
7903     $self->{group_depth} = 1;
7904    
7905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7906     $self->{line_prev} = $self->{line};
7907     $self->{column_prev} = $self->{column};
7908     $self->{column}++;
7909     $self->{nc}
7910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7911     } else {
7912     $self->{set_nc}->($self);
7913     }
7914    
7915     redo A;
7916     } elsif ($self->{nc} == 0x003E) { # >
7917     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7918     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7919    
7920     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7921     $self->{line_prev} = $self->{line};
7922     $self->{column_prev} = $self->{column};
7923     $self->{column}++;
7924     $self->{nc}
7925     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7926     } else {
7927     $self->{set_nc}->($self);
7928     }
7929    
7930     return ($self->{ct}); # ELEMENT
7931     redo A;
7932     } elsif ($self->{nc} == -1) {
7933     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7934     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7935    
7936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7937     $self->{line_prev} = $self->{line};
7938     $self->{column_prev} = $self->{column};
7939     $self->{column}++;
7940     $self->{nc}
7941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7942     } else {
7943     $self->{set_nc}->($self);
7944     }
7945    
7946     return ($self->{ct}); # ELEMENT
7947     redo A;
7948     } else {
7949     $self->{ct}->{content} = [chr $self->{nc}];
7950     $self->{state} = CONTENT_KEYWORD_STATE;
7951    
7952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7953     $self->{line_prev} = $self->{line};
7954     $self->{column_prev} = $self->{column};
7955     $self->{column}++;
7956     $self->{nc}
7957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7958     } else {
7959     $self->{set_nc}->($self);
7960     }
7961    
7962     redo A;
7963     }
7964     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
7965     if ($is_space->{$self->{nc}}) {
7966     $self->{state} = AFTER_MD_DEF_STATE;
7967    
7968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7969     $self->{line_prev} = $self->{line};
7970     $self->{column_prev} = $self->{column};
7971     $self->{column}++;
7972     $self->{nc}
7973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7974     } else {
7975     $self->{set_nc}->($self);
7976     }
7977    
7978     redo A;
7979     } elsif ($self->{nc} == 0x003E) { # >
7980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7981    
7982     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7983     $self->{line_prev} = $self->{line};
7984     $self->{column_prev} = $self->{column};
7985     $self->{column}++;
7986     $self->{nc}
7987     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7988     } else {
7989     $self->{set_nc}->($self);
7990     }
7991    
7992     return ($self->{ct}); # ELEMENT
7993     redo A;
7994     } elsif ($self->{nc} == -1) {
7995     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7997    
7998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7999     $self->{line_prev} = $self->{line};
8000     $self->{column_prev} = $self->{column};
8001     $self->{column}++;
8002     $self->{nc}
8003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8004     } else {
8005     $self->{set_nc}->($self);
8006     }
8007    
8008     return ($self->{ct}); # ELEMENT
8009     redo A;
8010     } else {
8011     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8012     ## Stay in the state.
8013    
8014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8015     $self->{line_prev} = $self->{line};
8016     $self->{column_prev} = $self->{column};
8017     $self->{column}++;
8018     $self->{nc}
8019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8020     } else {
8021     $self->{set_nc}->($self);
8022     }
8023    
8024     redo A;
8025     }
8026     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8027     if ($is_space->{$self->{nc}}) {
8028     ## Stay in the state.
8029    
8030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8031     $self->{line_prev} = $self->{line};
8032     $self->{column_prev} = $self->{column};
8033     $self->{column}++;
8034     $self->{nc}
8035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8036     } else {
8037     $self->{set_nc}->($self);
8038     }
8039    
8040     redo A;
8041     } elsif ($self->{nc} == 0x0028) { # (
8042     $self->{group_depth}++;
8043     push @{$self->{ct}->{content}}, chr $self->{nc};
8044     ## Stay in the state.
8045    
8046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8047     $self->{line_prev} = $self->{line};
8048     $self->{column_prev} = $self->{column};
8049     $self->{column}++;
8050     $self->{nc}
8051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8052     } else {
8053     $self->{set_nc}->($self);
8054     }
8055    
8056     redo A;
8057     } elsif ($self->{nc} == 0x007C or # |
8058     $self->{nc} == 0x002C) { # ,
8059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8060     ## Stay in the state.
8061    
8062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8063     $self->{line_prev} = $self->{line};
8064     $self->{column_prev} = $self->{column};
8065     $self->{column}++;
8066     $self->{nc}
8067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8068     } else {
8069     $self->{set_nc}->($self);
8070     }
8071    
8072     redo A;
8073     } elsif ($self->{nc} == 0x0029) { # )
8074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8075     push @{$self->{ct}->{content}}, chr $self->{nc};
8076     $self->{group_depth}--;
8077     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8078    
8079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8080     $self->{line_prev} = $self->{line};
8081     $self->{column_prev} = $self->{column};
8082     $self->{column}++;
8083     $self->{nc}
8084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8085     } else {
8086     $self->{set_nc}->($self);
8087     }
8088    
8089     redo A;
8090     } elsif ($self->{nc} == 0x003E) { # >
8091     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8092     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8093     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8094    
8095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8096     $self->{line_prev} = $self->{line};
8097     $self->{column_prev} = $self->{column};
8098     $self->{column}++;
8099     $self->{nc}
8100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8101     } else {
8102     $self->{set_nc}->($self);
8103     }
8104    
8105     return ($self->{ct}); # ELEMENT
8106     redo A;
8107     } elsif ($self->{nc} == -1) {
8108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8109     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8110     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8111    
8112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8113     $self->{line_prev} = $self->{line};
8114     $self->{column_prev} = $self->{column};
8115     $self->{column}++;
8116     $self->{nc}
8117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8118     } else {
8119     $self->{set_nc}->($self);
8120     }
8121    
8122     return ($self->{ct}); # ELEMENT
8123     redo A;
8124     } else {
8125     push @{$self->{ct}->{content}}, chr $self->{nc};
8126     $self->{state} = CM_ELEMENT_NAME_STATE;
8127    
8128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8129     $self->{line_prev} = $self->{line};
8130     $self->{column_prev} = $self->{column};
8131     $self->{column}++;
8132     $self->{nc}
8133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8134     } else {
8135     $self->{set_nc}->($self);
8136     }
8137    
8138     redo A;
8139     }
8140     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8141     if ($is_space->{$self->{nc}}) {
8142     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8143    
8144     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8145     $self->{line_prev} = $self->{line};
8146     $self->{column_prev} = $self->{column};
8147     $self->{column}++;
8148     $self->{nc}
8149     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8150     } else {
8151     $self->{set_nc}->($self);
8152     }
8153    
8154     redo A;
8155     } elsif ($self->{nc} == 0x002A or # *
8156     $self->{nc} == 0x002B or # +
8157     $self->{nc} == 0x003F) { # ?
8158     push @{$self->{ct}->{content}}, chr $self->{nc};
8159     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8160    
8161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8162     $self->{line_prev} = $self->{line};
8163     $self->{column_prev} = $self->{column};
8164     $self->{column}++;
8165     $self->{nc}
8166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8167     } else {
8168     $self->{set_nc}->($self);
8169     }
8170    
8171     redo A;
8172     } elsif ($self->{nc} == 0x007C or # |
8173     $self->{nc} == 0x002C) { # ,
8174     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8175     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8176    
8177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178     $self->{line_prev} = $self->{line};
8179     $self->{column_prev} = $self->{column};
8180     $self->{column}++;
8181     $self->{nc}
8182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183     } else {
8184     $self->{set_nc}->($self);
8185     }
8186    
8187     redo A;
8188     } elsif ($self->{nc} == 0x0029) { # )
8189     $self->{group_depth}--;
8190     push @{$self->{ct}->{content}}, chr $self->{nc};
8191     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8192    
8193     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8194     $self->{line_prev} = $self->{line};
8195     $self->{column_prev} = $self->{column};
8196     $self->{column}++;
8197     $self->{nc}
8198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8199     } else {
8200     $self->{set_nc}->($self);
8201     }
8202    
8203     redo A;
8204     } elsif ($self->{nc} == 0x003E) { # >
8205     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8206     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8208    
8209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8210     $self->{line_prev} = $self->{line};
8211     $self->{column_prev} = $self->{column};
8212     $self->{column}++;
8213     $self->{nc}
8214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8215     } else {
8216     $self->{set_nc}->($self);
8217     }
8218    
8219     return ($self->{ct}); # ELEMENT
8220     redo A;
8221     } elsif ($self->{nc} == -1) {
8222     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8223     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8224     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8225    
8226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8227     $self->{line_prev} = $self->{line};
8228     $self->{column_prev} = $self->{column};
8229     $self->{column}++;
8230     $self->{nc}
8231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8232     } else {
8233     $self->{set_nc}->($self);
8234     }
8235    
8236     return ($self->{ct}); # ELEMENT
8237     redo A;
8238     } else {
8239     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8240     ## Stay in the state.
8241    
8242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8243     $self->{line_prev} = $self->{line};
8244     $self->{column_prev} = $self->{column};
8245     $self->{column}++;
8246     $self->{nc}
8247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8248     } else {
8249     $self->{set_nc}->($self);
8250     }
8251    
8252     redo A;
8253     }
8254     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8255     if ($is_space->{$self->{nc}}) {
8256     ## Stay in the state.
8257    
8258     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8259     $self->{line_prev} = $self->{line};
8260     $self->{column_prev} = $self->{column};
8261     $self->{column}++;
8262     $self->{nc}
8263     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8264     } else {
8265     $self->{set_nc}->($self);
8266     }
8267    
8268     redo A;
8269     } elsif ($self->{nc} == 0x007C or # |
8270     $self->{nc} == 0x002C) { # ,
8271     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8272     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8273    
8274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8275     $self->{line_prev} = $self->{line};
8276     $self->{column_prev} = $self->{column};
8277     $self->{column}++;
8278     $self->{nc}
8279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8280     } else {
8281     $self->{set_nc}->($self);
8282     }
8283    
8284     redo A;
8285     } elsif ($self->{nc} == 0x0029) { # )
8286     $self->{group_depth}--;
8287     push @{$self->{ct}->{content}}, chr $self->{nc};
8288     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8289    
8290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8291     $self->{line_prev} = $self->{line};
8292     $self->{column_prev} = $self->{column};
8293     $self->{column}++;
8294     $self->{nc}
8295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8296     } else {
8297     $self->{set_nc}->($self);
8298     }
8299    
8300     redo A;
8301     } elsif ($self->{nc} == 0x003E) { # >
8302     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8303     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8304     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8305    
8306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8307     $self->{line_prev} = $self->{line};
8308     $self->{column_prev} = $self->{column};
8309     $self->{column}++;
8310     $self->{nc}
8311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8312     } else {
8313     $self->{set_nc}->($self);
8314     }
8315    
8316     return ($self->{ct}); # ELEMENT
8317     redo A;
8318     } elsif ($self->{nc} == -1) {
8319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8320     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8321     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8322    
8323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8324     $self->{line_prev} = $self->{line};
8325     $self->{column_prev} = $self->{column};
8326     $self->{column}++;
8327     $self->{nc}
8328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8329     } else {
8330     $self->{set_nc}->($self);
8331     }
8332    
8333     return ($self->{ct}); # ELEMENT
8334     redo A;
8335     } else {
8336     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8337     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8338     $self->{state} = BOGUS_MD_STATE;
8339    
8340     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8341     $self->{line_prev} = $self->{line};
8342     $self->{column_prev} = $self->{column};
8343     $self->{column}++;
8344     $self->{nc}
8345     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8346     } else {
8347     $self->{set_nc}->($self);
8348     }
8349    
8350     redo A;
8351     }
8352     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8353     if ($is_space->{$self->{nc}}) {
8354     if ($self->{group_depth}) {
8355     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8356     } else {
8357     $self->{state} = AFTER_MD_DEF_STATE;
8358     }
8359    
8360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361     $self->{line_prev} = $self->{line};
8362     $self->{column_prev} = $self->{column};
8363     $self->{column}++;
8364     $self->{nc}
8365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366     } else {
8367     $self->{set_nc}->($self);
8368     }
8369    
8370     redo A;
8371     } elsif ($self->{nc} == 0x002A or # *
8372     $self->{nc} == 0x002B or # +
8373     $self->{nc} == 0x003F) { # ?
8374     push @{$self->{ct}->{content}}, chr $self->{nc};
8375     if ($self->{group_depth}) {
8376     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8377     } else {
8378     $self->{state} = AFTER_MD_DEF_STATE;
8379     }
8380    
8381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8382     $self->{line_prev} = $self->{line};
8383     $self->{column_prev} = $self->{column};
8384     $self->{column}++;
8385     $self->{nc}
8386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8387     } else {
8388     $self->{set_nc}->($self);
8389     }
8390    
8391     redo A;
8392     } elsif ($self->{nc} == 0x0029) { # )
8393     if ($self->{group_depth}) {
8394     $self->{group_depth}--;
8395     push @{$self->{ct}->{content}}, chr $self->{nc};
8396     ## Stay in the state.
8397    
8398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8399     $self->{line_prev} = $self->{line};
8400     $self->{column_prev} = $self->{column};
8401     $self->{column}++;
8402     $self->{nc}
8403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8404     } else {
8405     $self->{set_nc}->($self);
8406     }
8407    
8408     redo A;
8409     } else {
8410     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8411     $self->{state} = BOGUS_MD_STATE;
8412     ## Reconsume.
8413     redo A;
8414     }
8415     } elsif ($self->{nc} == 0x003E) { # >
8416     if ($self->{group_depth}) {
8417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8418     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8419     }
8420     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8421    
8422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423     $self->{line_prev} = $self->{line};
8424     $self->{column_prev} = $self->{column};
8425     $self->{column}++;
8426     $self->{nc}
8427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428     } else {
8429     $self->{set_nc}->($self);
8430     }
8431    
8432     return ($self->{ct}); # ELEMENT
8433     redo A;
8434     } elsif ($self->{nc} == -1) {
8435     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8436     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8437     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8438    
8439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440     $self->{line_prev} = $self->{line};
8441     $self->{column_prev} = $self->{column};
8442     $self->{column}++;
8443     $self->{nc}
8444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445     } else {
8446     $self->{set_nc}->($self);
8447     }
8448    
8449     return ($self->{ct}); # ELEMENT
8450     redo A;
8451     } else {
8452     if ($self->{group_depth}) {
8453     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8454     } else {
8455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8456     $self->{state} = BOGUS_MD_STATE;
8457     }
8458     ## Reconsume.
8459     redo A;
8460     }
8461     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8462 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8463     ## Stay in the state.
8464    
8465     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8466     $self->{line_prev} = $self->{line};
8467     $self->{column_prev} = $self->{column};
8468     $self->{column}++;
8469     $self->{nc}
8470     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8471     } else {
8472     $self->{set_nc}->($self);
8473     }
8474    
8475     redo A;
8476     } elsif ($self->{nc} == 0x003E) { # >
8477     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8478    
8479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8480     $self->{line_prev} = $self->{line};
8481     $self->{column_prev} = $self->{column};
8482     $self->{column}++;
8483     $self->{nc}
8484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8485     } else {
8486     $self->{set_nc}->($self);
8487     }
8488    
8489 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8490 wakaba 1.18 redo A;
8491     } elsif ($self->{nc} == -1) {
8492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8493     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8494    
8495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8496     $self->{line_prev} = $self->{line};
8497     $self->{column_prev} = $self->{column};
8498     $self->{column}++;
8499     $self->{nc}
8500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8501     } else {
8502     $self->{set_nc}->($self);
8503     }
8504    
8505 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8506 wakaba 1.18 redo A;
8507     } else {
8508 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8509 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8510     ## Reconsume.
8511     redo A;
8512     }
8513 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8514     if ($self->{nc} == 0x003E) { # >
8515     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8516    
8517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8518     $self->{line_prev} = $self->{line};
8519     $self->{column_prev} = $self->{column};
8520     $self->{column}++;
8521     $self->{nc}
8522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8523     } else {
8524     $self->{set_nc}->($self);
8525     }
8526    
8527     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8528     redo A;
8529     } elsif ($self->{nc} == -1) {
8530     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8531     ## Reconsume.
8532     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8533     redo A;
8534     } else {
8535     ## Stay in the state.
8536    
8537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8538     $self->{line_prev} = $self->{line};
8539     $self->{column_prev} = $self->{column};
8540     $self->{column}++;
8541     $self->{nc}
8542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8543     } else {
8544     $self->{set_nc}->($self);
8545     }
8546    
8547     redo A;
8548     }
8549 wakaba 1.1 } else {
8550     die "$0: $self->{state}: Unknown state";
8551     }
8552     } # A
8553    
8554     die "$0: _get_next_token: unexpected case";
8555     } # _get_next_token
8556    
8557     1;
8558 wakaba 1.20 ## $Date: 2008/10/19 07:19:00 $
8559 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24