/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.29 - (hide annotations) (download)
Sun Aug 16 04:06:34 2009 UTC (15 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.28: +40 -5 lines
++ whatpm/t/ChangeLog	16 Aug 2009 04:05:04 -0000
	* tree-test-1.dat, tree-test-3.dat, tree-test-flow.dat,
	tree-test-foreign.dat, tree-test-form.dat, tree-test-phrasing.dat,
	tokenizer-test-1.test, tokenizer-test-2.dat, tokenizer-test-3.dat:
	DOCTYPE names are now normalized to their lowercased form (HTML5
	revision 2502).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	16 Aug 2009 04:06:26 -0000
2009-08-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Lowercase-fold doctype names (HTML5 revision
	2501, cf. HTML5 revision 3571).

package Whatpm::HTML::Tokenizer;
use strict;

## Module version, derived from the CVS |Revision| keyword: the first
## number is used as is and every following number is zero-padded to
## two digits.
our $VERSION = do {
  my @r = (q$Revision: 1.28 $ =~ /\d+/g);
  sprintf "%d." . "%02d" x $#r, @r;
};

BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Token type constants that can be imported one by one.
  our @EXPORT_OK = qw(
    DOCTYPE_TOKEN COMMENT_TOKEN START_TAG_TOKEN END_TAG_TOKEN
    END_OF_FILE_TOKEN CHARACTER_TOKEN PI_TOKEN ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN ATTLIST_TOKEN ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN PARAMETER_ENTITY_TOKEN NOTATION_TOKEN
  );

  ## The |:token| tag imports all of the token type constants at once;
  ## it lists exactly the same names as |@EXPORT_OK|.
  our %EXPORT_TAGS = (token => [@EXPORT_OK]);
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
## Token types
##
## Each constant is the integer stored in the |type| field of a token.

## Token types not marked as XML only or internal.
sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }

## NOTE: XML only.
sub PI_TOKEN () { 7 }

## NOTE: For internal processing.
sub ABORT_TOKEN () { 8 }

## NOTE: The remaining token types are XML only.
sub END_OF_DOCTYPE_TOKEN () { 9 }
sub ATTLIST_TOKEN () { 10 }
sub ELEMENT_TOKEN () { 11 }
sub GENERAL_ENTITY_TOKEN () { 12 }
sub PARAMETER_ENTITY_TOKEN () { 13 }
sub NOTATION_TOKEN () { 14 }
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} flag set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
## Content model flags
##
## Bit flags saying which kinds of markup are recognized in data.

sub CM_ENTITY () { 0b001 } # & markup in data
sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)

## Content models, each defined as a combination of the flags above.

sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL () { CM_FULL_MARKUP | CM_ENTITY }
87    
## Tokenizer states
##
## Each constant is a value of |$self->{state}|.  The numbers 1 and 12
## are not in use; their former states are kept below as comments.

sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }

## Tags and attributes
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }

## DOCTYPE
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }

sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }

## States that are part of the "markup declaration open state" in the
## spec
sub MD_HYPHEN_STATE () { 36 }
sub MD_DOCTYPE_STATE () { 37 }
sub MD_CDATA_STATE () { 38 }

sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec

## States that are part of the "CDATA section state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 }
sub CDATA_SECTION_MSE2_STATE () { 41 }

## States that are part of the "after DOCTYPE name state" in the spec
sub PUBLIC_STATE () { 42 }
sub SYSTEM_STATE () { 43 }

## NOTE: The "entity data state", the "entity in attribute value
## state", and the "consume a character reference" algorithm are
## jointly implemented using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }

sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
## XML-only states
##
## Values of |$self->{state}| numbered 51 and up.

## PI_* states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }

## DOCTYPE internal subset
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }

## MD_* keyword and name states
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }

## ATTLIST states
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }

## NDATA, notation name and entity value states
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }

## ELEMENT content states
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }

sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
197 wakaba 1.8
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: Both constants have the same numeric value (bit 11 set).

sub IN_FOREIGN_CONTENT_IM () { 0b1000_0000_0000 }
sub FOREIGN_EL () { 0b1000_0000_0000 }
203    
## Character reference mappings
##
## |$charref_map| maps a code point to the code point stored as its
## replacement.  Code points that have no entry are not remapped by
## this table.

my $charref_map = {
  0x0D => 0x000A,

  ## 0x80..0x9F (the replacement values presumably follow
  ## Windows-1252 -- TODO confirm; unassigned positions map to U+FFFD).
  0x80 => 0x20AC, 0x81 => 0xFFFD, 0x82 => 0x201A, 0x83 => 0x0192,
  0x84 => 0x201E, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021,
  0x88 => 0x02C6, 0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039,
  0x8C => 0x0152, 0x8D => 0xFFFD, 0x8E => 0x017D, 0x8F => 0xFFFD,
  0x90 => 0xFFFD, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
  0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
  0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
  0x9C => 0x0153, 0x9D => 0xFFFD, 0x9E => 0x017E, 0x9F => 0x0178,
}; # $charref_map

## Code points that are always replaced by U+FFFD.
$charref_map->{$_} = 0xFFFD for (
  0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
  0xD800..0xDFFF,
  0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
  ## U+xxFFFE and U+xxFFFF of every plane from 0x00 to 0x10.
  (map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) }
       0x00..0x10),
);
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Put the tokenizer into its initial condition and fetch the first
## input character.
##
## NOTE: Fields set by the |new| constructor and not touched here:
## |level|, |set_nc|, |parse_error|, and |is_xml| (if XML).
##
## NOTE: Fields that are only initialized when they are first used:
## |kwd| (state-dependent keyword), |entity__value|, |entity__match|,
## |prev_state|, |next_nc|, and |escape|.
##
## The value of the last statement, the new empty token queue, is what
## the subroutine evaluates to.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## |ct|: current token, |ca|: current attribute, |last_stag_name|:
  ## last emitted start tag name.
  $self->{$_} = undef for qw(ct ca last_stag_name);
  delete $self->{self_closing};

  ## Start from an empty character buffer with no next input character.
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character

  ## The buffer has just been emptied, so the first character can only
  ## come from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = [];
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
312     ## while the token is pushed back to the stack.
313    
314     ## An emitted token MUST immediately be handled by the tree construction stage.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## Lookup table of the code points that have an entry in |$is_space|.
## LINE TABULATION (VT, 0x000B) and CARRIAGE RETURN (CR, 0x000D) are
## deliberately left out.
my $is_space = {map { ($_ => 1) } (
  0x0009, # CHARACTER TABULATION (HT)
  0x000A, # LINE FEED (LF)
  0x000C, # FORM FEED (FF) ## XML5: Not a space character.
  0x0020, # SPACE (SP)
)};
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not
926     ## explicitly state that the next input character is to be
927     ## reconsumed, it will be included in the |data| of the comment token
928     ## generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "<{tag-name}"
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
## $before_leave: closure invoked by the branches below just before
## the tokenizer leaves the attribute name state.  It files the
## attribute currently being built ($self->{ca}) on the current tag
## token ($self->{ct}), keyed by the attribute's name:
##   - if an attribute with that name is already present, a
##     'duplicate attribute' parse error is reported at the line and
##     column recorded in $self->{ca}, and the new attribute is not
##     stored;
##   - otherwise the attribute is stored and its |index| is set to
##     the pre-incremented |last_index| counter of the token.
## NOTE(review): the nested |exists| test presumably autovivifies
## $self->{ct}->{attributes}; confirm before relying on that hash
## being absent for tokens that went through this closure.
1281 wakaba 1.1 my $before_leave = sub {
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} # MUST
1287     } else {
1288    
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
1291 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 wakaba 1.1 }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1744 wakaba 1.1
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819 wakaba 1.25 } elsif ($self->{is_xml} and
1820     $is_space->{$self->{nc}}) {
1821    
1822     $self->{ca}->{value} .= ' ';
1823     ## Stay in the state.
1824    
1825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826     $self->{line_prev} = $self->{line};
1827     $self->{column_prev} = $self->{column};
1828     $self->{column}++;
1829     $self->{nc}
1830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831     } else {
1832     $self->{set_nc}->($self);
1833     }
1834    
1835     redo A;
1836 wakaba 1.1 } elsif ($self->{nc} == -1) {
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839    
1840     $self->{last_stag_name} = $self->{ct}->{tag_name};
1841 wakaba 1.15
1842     $self->{state} = DATA_STATE;
1843     $self->{s_kwd} = '';
1844     ## reconsume
1845     return ($self->{ct}); # start tag
1846     redo A;
1847 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849     if ($self->{ct}->{attributes}) {
1850    
1851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1852     } else {
1853     ## NOTE: This state should never be reached.
1854    
1855     }
1856 wakaba 1.15
1857     $self->{state} = DATA_STATE;
1858     $self->{s_kwd} = '';
1859     ## reconsume
1860     return ($self->{ct}); # end tag
1861     redo A;
1862     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863     ## XML5: No parse error above; not defined yet.
1864     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866     ## Reconsume.
1867     return ($self->{ct}); # ATTLIST
1868     redo A;
1869 wakaba 1.1 } else {
1870     die "$0: $self->{ct}->{type}: Unknown token type";
1871     }
1872     } else {
1873 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1874 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875    
1876     ## XML5: Not a parse error.
1877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878     } else {
1879    
1880     }
1881 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1882     $self->{read_until}->($self->{ca}->{value},
1883 wakaba 1.25 qq["&<\x09\x0C\x20],
1884 wakaba 1.1 length $self->{ca}->{value});
1885    
1886     ## Stay in the state
1887    
1888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1889     $self->{line_prev} = $self->{line};
1890     $self->{column_prev} = $self->{column};
1891     $self->{column}++;
1892     $self->{nc}
1893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1894     } else {
1895     $self->{set_nc}->($self);
1896     }
1897    
1898     redo A;
1899     }
1900     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902     ## ATTLIST attribute value single quoted state".
1903 wakaba 1.11
1904 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1905 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906    
1907     ## XML5: "DOCTYPE ATTLIST name after state".
1908     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910     } else {
1911    
1912     ## XML5: "Before attribute name state" (sic).
1913     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914     }
1915 wakaba 1.1
1916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917     $self->{line_prev} = $self->{line};
1918     $self->{column_prev} = $self->{column};
1919     $self->{column}++;
1920     $self->{nc}
1921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1922     } else {
1923     $self->{set_nc}->($self);
1924     }
1925    
1926     redo A;
1927     } elsif ($self->{nc} == 0x0026) { # &
1928    
1929 wakaba 1.11 ## XML5: Not defined yet.
1930    
1931 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1932     ## "entity in attribute value state". In this implementation, the
1933     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1934     ## implementation of the "consume a character reference" algorithm.
1935     $self->{entity_add} = 0x0027; # '
1936     $self->{prev_state} = $self->{state};
1937     $self->{state} = ENTITY_STATE;
1938    
1939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940     $self->{line_prev} = $self->{line};
1941     $self->{column_prev} = $self->{column};
1942     $self->{column}++;
1943     $self->{nc}
1944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945     } else {
1946     $self->{set_nc}->($self);
1947     }
1948    
1949     redo A;
1950 wakaba 1.25 } elsif ($self->{is_xml} and
1951     $is_space->{$self->{nc}}) {
1952    
1953     $self->{ca}->{value} .= ' ';
1954     ## Stay in the state.
1955    
1956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957     $self->{line_prev} = $self->{line};
1958     $self->{column_prev} = $self->{column};
1959     $self->{column}++;
1960     $self->{nc}
1961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962     } else {
1963     $self->{set_nc}->($self);
1964     }
1965    
1966     redo A;
1967 wakaba 1.1 } elsif ($self->{nc} == -1) {
1968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970    
1971     $self->{last_stag_name} = $self->{ct}->{tag_name};
1972 wakaba 1.15
1973     $self->{state} = DATA_STATE;
1974     $self->{s_kwd} = '';
1975     ## reconsume
1976     return ($self->{ct}); # start tag
1977     redo A;
1978 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980     if ($self->{ct}->{attributes}) {
1981    
1982     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1983     } else {
1984     ## NOTE: This state should never be reached.
1985    
1986     }
1987 wakaba 1.15
1988     $self->{state} = DATA_STATE;
1989     $self->{s_kwd} = '';
1990     ## reconsume
1991     return ($self->{ct}); # end tag
1992     redo A;
1993     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994     ## XML5: No parse error above; not defined yet.
1995     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997     ## Reconsume.
1998     return ($self->{ct}); # ATTLIST
1999     redo A;
2000 wakaba 1.1 } else {
2001     die "$0: $self->{ct}->{type}: Unknown token type";
2002     }
2003     } else {
2004 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2005 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006    
2007     ## XML5: Not a parse error.
2008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009     } else {
2010    
2011     }
2012 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2013     $self->{read_until}->($self->{ca}->{value},
2014 wakaba 1.25 qq['&<\x09\x0C\x20],
2015 wakaba 1.1 length $self->{ca}->{value});
2016    
2017     ## Stay in the state
2018    
2019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020     $self->{line_prev} = $self->{line};
2021     $self->{column_prev} = $self->{column};
2022     $self->{column}++;
2023     $self->{nc}
2024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025     } else {
2026     $self->{set_nc}->($self);
2027     }
2028    
2029     redo A;
2030     }
2031     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2033    
2034 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2035 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036    
2037     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039     } else {
2040    
2041     ## XML5: "Tag attribute name before state".
2042     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043     }
2044 wakaba 1.1
2045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046     $self->{line_prev} = $self->{line};
2047     $self->{column_prev} = $self->{column};
2048     $self->{column}++;
2049     $self->{nc}
2050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051     } else {
2052     $self->{set_nc}->($self);
2053     }
2054    
2055     redo A;
2056     } elsif ($self->{nc} == 0x0026) { # &
2057    
2058 wakaba 1.11
2059     ## XML5: Not defined yet.
2060    
2061 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2062     ## "entity in attribute value state". In this implementation, the
2063     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2064     ## implementation of the "consume a character reference" algorithm.
2065     $self->{entity_add} = -1;
2066     $self->{prev_state} = $self->{state};
2067     $self->{state} = ENTITY_STATE;
2068    
2069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070     $self->{line_prev} = $self->{line};
2071     $self->{column_prev} = $self->{column};
2072     $self->{column}++;
2073     $self->{nc}
2074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075     } else {
2076     $self->{set_nc}->($self);
2077     }
2078    
2079     redo A;
2080     } elsif ($self->{nc} == 0x003E) { # >
2081     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082    
2083     $self->{last_stag_name} = $self->{ct}->{tag_name};
2084 wakaba 1.15
2085     $self->{state} = DATA_STATE;
2086     $self->{s_kwd} = '';
2087    
2088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089     $self->{line_prev} = $self->{line};
2090     $self->{column_prev} = $self->{column};
2091     $self->{column}++;
2092     $self->{nc}
2093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094     } else {
2095     $self->{set_nc}->($self);
2096     }
2097    
2098     return ($self->{ct}); # start tag
2099     redo A;
2100 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102     if ($self->{ct}->{attributes}) {
2103    
2104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2105     } else {
2106     ## NOTE: This state should never be reached.
2107    
2108     }
2109 wakaba 1.15
2110     $self->{state} = DATA_STATE;
2111     $self->{s_kwd} = '';
2112    
2113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114     $self->{line_prev} = $self->{line};
2115     $self->{column_prev} = $self->{column};
2116     $self->{column}++;
2117     $self->{nc}
2118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2119     } else {
2120     $self->{set_nc}->($self);
2121     }
2122    
2123     return ($self->{ct}); # end tag
2124     redo A;
2125     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128    
2129 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130     $self->{line_prev} = $self->{line};
2131     $self->{column_prev} = $self->{column};
2132     $self->{column}++;
2133     $self->{nc}
2134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135     } else {
2136     $self->{set_nc}->($self);
2137     }
2138    
2139 wakaba 1.15 return ($self->{ct}); # ATTLIST
2140     redo A;
2141     } else {
2142     die "$0: $self->{ct}->{type}: Unknown token type";
2143     }
2144 wakaba 1.1 } elsif ($self->{nc} == -1) {
2145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146    
2147 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2149 wakaba 1.15
2150     $self->{state} = DATA_STATE;
2151     $self->{s_kwd} = '';
2152     ## reconsume
2153     return ($self->{ct}); # start tag
2154     redo A;
2155 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158     if ($self->{ct}->{attributes}) {
2159    
2160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2161     } else {
2162     ## NOTE: This state should never be reached.
2163    
2164     }
2165 wakaba 1.15
2166     $self->{state} = DATA_STATE;
2167     $self->{s_kwd} = '';
2168     ## reconsume
2169     return ($self->{ct}); # end tag
2170     redo A;
2171     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175     ## Reconsume.
2176     return ($self->{ct}); # ATTLIST
2177     redo A;
2178 wakaba 1.1 } else {
2179     die "$0: $self->{ct}->{type}: Unknown token type";
2180     }
2181     } else {
2182     if ({
2183     0x0022 => 1, # "
2184     0x0027 => 1, # '
2185     0x003D => 1, # =
2186 wakaba 1.26 0x003C => 1, # <
2187 wakaba 1.1 }->{$self->{nc}}) {
2188    
2189 wakaba 1.11 ## XML5: Not a parse error.
2190 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2191     } else {
2192    
2193     }
2194     $self->{ca}->{value} .= chr ($self->{nc});
2195     $self->{read_until}->($self->{ca}->{value},
2196 wakaba 1.25 qq["'=& \x09\x0C>],
2197 wakaba 1.1 length $self->{ca}->{value});
2198    
2199     ## Stay in the state
2200    
2201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2202     $self->{line_prev} = $self->{line};
2203     $self->{column_prev} = $self->{column};
2204     $self->{column}++;
2205     $self->{nc}
2206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2207     } else {
2208     $self->{set_nc}->($self);
2209     }
2210    
2211     redo A;
2212     }
2213     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2214     if ($is_space->{$self->{nc}}) {
2215    
2216     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2217    
2218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2219     $self->{line_prev} = $self->{line};
2220     $self->{column_prev} = $self->{column};
2221     $self->{column}++;
2222     $self->{nc}
2223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2224     } else {
2225     $self->{set_nc}->($self);
2226     }
2227    
2228     redo A;
2229     } elsif ($self->{nc} == 0x003E) { # >
2230     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2231    
2232     $self->{last_stag_name} = $self->{ct}->{tag_name};
2233     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2234     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2235     if ($self->{ct}->{attributes}) {
2236    
2237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2238     } else {
2239     ## NOTE: This state should never be reached.
2240    
2241     }
2242     } else {
2243     die "$0: $self->{ct}->{type}: Unknown token type";
2244     }
2245     $self->{state} = DATA_STATE;
2246 wakaba 1.5 $self->{s_kwd} = '';
2247 wakaba 1.1
2248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249     $self->{line_prev} = $self->{line};
2250     $self->{column_prev} = $self->{column};
2251     $self->{column}++;
2252     $self->{nc}
2253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254     } else {
2255     $self->{set_nc}->($self);
2256     }
2257    
2258    
2259     return ($self->{ct}); # start tag or end tag
2260    
2261     redo A;
2262     } elsif ($self->{nc} == 0x002F) { # /
2263    
2264     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2265    
2266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2267     $self->{line_prev} = $self->{line};
2268     $self->{column_prev} = $self->{column};
2269     $self->{column}++;
2270     $self->{nc}
2271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2272     } else {
2273     $self->{set_nc}->($self);
2274     }
2275    
2276     redo A;
2277     } elsif ($self->{nc} == -1) {
2278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2279     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2280    
2281     $self->{last_stag_name} = $self->{ct}->{tag_name};
2282     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2283     if ($self->{ct}->{attributes}) {
2284    
2285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2286     } else {
2287     ## NOTE: This state should never be reached.
2288    
2289     }
2290     } else {
2291     die "$0: $self->{ct}->{type}: Unknown token type";
2292     }
2293     $self->{state} = DATA_STATE;
2294 wakaba 1.5 $self->{s_kwd} = '';
2295 wakaba 1.1 ## Reconsume.
2296     return ($self->{ct}); # start tag or end tag
2297     redo A;
2298     } else {
2299    
2300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2301     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2302     ## reconsume
2303     redo A;
2304     }
2305     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2306 wakaba 1.11 ## XML5: "Empty tag state".
2307    
2308 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2309     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2310    
2311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2312     ## TODO: Different type than slash in start tag
2313     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318    
2319     }
2320     ## TODO: Test |<title></title/>|
2321     } else {
2322    
2323     $self->{self_closing} = 1;
2324     }
2325    
2326     $self->{state} = DATA_STATE;
2327 wakaba 1.5 $self->{s_kwd} = '';
2328 wakaba 1.1
2329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2330     $self->{line_prev} = $self->{line};
2331     $self->{column_prev} = $self->{column};
2332     $self->{column}++;
2333     $self->{nc}
2334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2335     } else {
2336     $self->{set_nc}->($self);
2337     }
2338    
2339    
2340     return ($self->{ct}); # start tag or end tag
2341    
2342     redo A;
2343     } elsif ($self->{nc} == -1) {
2344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2345     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2346    
2347     $self->{last_stag_name} = $self->{ct}->{tag_name};
2348     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2349     if ($self->{ct}->{attributes}) {
2350    
2351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2352     } else {
2353     ## NOTE: This state should never be reached.
2354    
2355     }
2356     } else {
2357     die "$0: $self->{ct}->{type}: Unknown token type";
2358     }
2359 wakaba 1.11 ## XML5: "Tag attribute name before state".
2360 wakaba 1.1 $self->{state} = DATA_STATE;
2361 wakaba 1.5 $self->{s_kwd} = '';
2362 wakaba 1.1 ## Reconsume.
2363     return ($self->{ct}); # start tag or end tag
2364     redo A;
2365     } else {
2366    
2367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2368     ## TODO: This error type is wrong.
2369     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2370     ## Reconsume.
2371     redo A;
2372     }
2373     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2374 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2375    
2376 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2377     ## consumes characters on a one-by-one basis.
2378    
2379     if ($self->{nc} == 0x003E) { # >
2380 wakaba 1.13 if ($self->{in_subset}) {
2381    
2382     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2383     } else {
2384    
2385     $self->{state} = DATA_STATE;
2386     $self->{s_kwd} = '';
2387     }
2388 wakaba 1.1
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399    
2400     return ($self->{ct}); # comment
2401     redo A;
2402     } elsif ($self->{nc} == -1) {
2403 wakaba 1.13 if ($self->{in_subset}) {
2404    
2405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2406     } else {
2407    
2408     $self->{state} = DATA_STATE;
2409     $self->{s_kwd} = '';
2410     }
2411 wakaba 1.1 ## reconsume
2412    
2413     return ($self->{ct}); # comment
2414     redo A;
2415     } else {
2416    
2417     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2418     $self->{read_until}->($self->{ct}->{data},
2419     q[>],
2420     length $self->{ct}->{data});
2421    
2422     ## Stay in the state.
2423    
2424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2425     $self->{line_prev} = $self->{line};
2426     $self->{column_prev} = $self->{column};
2427     $self->{column}++;
2428     $self->{nc}
2429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2430     } else {
2431     $self->{set_nc}->($self);
2432     }
2433    
2434     redo A;
2435     }
2436     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2437 wakaba 1.14 ## XML5: "Markup declaration state".
2438 wakaba 1.1
2439     if ($self->{nc} == 0x002D) { # -
2440    
2441     $self->{state} = MD_HYPHEN_STATE;
2442    
2443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2444     $self->{line_prev} = $self->{line};
2445     $self->{column_prev} = $self->{column};
2446     $self->{column}++;
2447     $self->{nc}
2448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2449     } else {
2450     $self->{set_nc}->($self);
2451     }
2452    
2453     redo A;
2454     } elsif ($self->{nc} == 0x0044 or # D
2455     $self->{nc} == 0x0064) { # d
2456     ## ASCII case-insensitive.
2457    
2458     $self->{state} = MD_DOCTYPE_STATE;
2459 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2460 wakaba 1.1
2461     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2462     $self->{line_prev} = $self->{line};
2463     $self->{column_prev} = $self->{column};
2464     $self->{column}++;
2465     $self->{nc}
2466     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2467     } else {
2468     $self->{set_nc}->($self);
2469     }
2470    
2471     redo A;
2472 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2473     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2474     $self->{is_xml}) and
2475 wakaba 1.1 $self->{nc} == 0x005B) { # [
2476    
2477     $self->{state} = MD_CDATA_STATE;
2478 wakaba 1.12 $self->{kwd} = '[';
2479 wakaba 1.1
2480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481     $self->{line_prev} = $self->{line};
2482     $self->{column_prev} = $self->{column};
2483     $self->{column}++;
2484     $self->{nc}
2485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2486     } else {
2487     $self->{set_nc}->($self);
2488     }
2489    
2490     redo A;
2491     } else {
2492    
2493     }
2494    
2495     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2496     line => $self->{line_prev},
2497     column => $self->{column_prev} - 1);
2498     ## Reconsume.
2499     $self->{state} = BOGUS_COMMENT_STATE;
2500     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2501     line => $self->{line_prev},
2502     column => $self->{column_prev} - 1,
2503     };
2504     redo A;
2505     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2506     if ($self->{nc} == 0x002D) { # -
2507    
2508     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2509     line => $self->{line_prev},
2510     column => $self->{column_prev} - 2,
2511     };
2512 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2513 wakaba 1.1
2514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515     $self->{line_prev} = $self->{line};
2516     $self->{column_prev} = $self->{column};
2517     $self->{column}++;
2518     $self->{nc}
2519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520     } else {
2521     $self->{set_nc}->($self);
2522     }
2523    
2524     redo A;
2525     } else {
2526    
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2528     line => $self->{line_prev},
2529     column => $self->{column_prev} - 2);
2530     $self->{state} = BOGUS_COMMENT_STATE;
2531     ## Reconsume.
2532     $self->{ct} = {type => COMMENT_TOKEN,
2533     data => '-',
2534     line => $self->{line_prev},
2535     column => $self->{column_prev} - 2,
2536     };
2537     redo A;
2538     }
2539     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2540     ## ASCII case-insensitive.
2541     if ($self->{nc} == [
2542     undef,
2543     0x004F, # O
2544     0x0043, # C
2545     0x0054, # T
2546     0x0059, # Y
2547     0x0050, # P
2548 wakaba 1.12 ]->[length $self->{kwd}] or
2549 wakaba 1.1 $self->{nc} == [
2550     undef,
2551     0x006F, # o
2552     0x0063, # c
2553     0x0074, # t
2554     0x0079, # y
2555     0x0070, # p
2556 wakaba 1.12 ]->[length $self->{kwd}]) {
2557 wakaba 1.1
2558     ## Stay in the state.
2559 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2560 wakaba 1.1
2561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562     $self->{line_prev} = $self->{line};
2563     $self->{column_prev} = $self->{column};
2564     $self->{column}++;
2565     $self->{nc}
2566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2567     } else {
2568     $self->{set_nc}->($self);
2569     }
2570    
2571     redo A;
2572 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2573 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2574     $self->{nc} == 0x0065)) { # e
2575 wakaba 1.12 if ($self->{is_xml} and
2576     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2577 wakaba 1.10
2578     ## XML5: case-sensitive.
2579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2580     text => 'DOCTYPE',
2581     line => $self->{line_prev},
2582     column => $self->{column_prev} - 5);
2583     } else {
2584    
2585     }
2586 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2587     $self->{ct} = {type => DOCTYPE_TOKEN,
2588     quirks => 1,
2589     line => $self->{line_prev},
2590     column => $self->{column_prev} - 7,
2591     };
2592    
2593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2594     $self->{line_prev} = $self->{line};
2595     $self->{column_prev} = $self->{column};
2596     $self->{column}++;
2597     $self->{nc}
2598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2599     } else {
2600     $self->{set_nc}->($self);
2601     }
2602    
2603     redo A;
2604     } else {
2605    
2606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2607     line => $self->{line_prev},
2608 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2609 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2610     ## Reconsume.
2611     $self->{ct} = {type => COMMENT_TOKEN,
2612 wakaba 1.12 data => $self->{kwd},
2613 wakaba 1.1 line => $self->{line_prev},
2614 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2615 wakaba 1.1 };
2616     redo A;
2617     }
2618     } elsif ($self->{state} == MD_CDATA_STATE) {
2619     if ($self->{nc} == {
2620     '[' => 0x0043, # C
2621     '[C' => 0x0044, # D
2622     '[CD' => 0x0041, # A
2623     '[CDA' => 0x0054, # T
2624     '[CDAT' => 0x0041, # A
2625 wakaba 1.12 }->{$self->{kwd}}) {
2626 wakaba 1.1
2627     ## Stay in the state.
2628 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2629 wakaba 1.1
2630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2631     $self->{line_prev} = $self->{line};
2632     $self->{column_prev} = $self->{column};
2633     $self->{column}++;
2634     $self->{nc}
2635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2636     } else {
2637     $self->{set_nc}->($self);
2638     }
2639    
2640     redo A;
2641 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2642 wakaba 1.1 $self->{nc} == 0x005B) { # [
2643 wakaba 1.6 if ($self->{is_xml} and
2644     not $self->{tainted} and
2645     @{$self->{open_elements} or []} == 0) {
2646 wakaba 1.8
2647 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2648     line => $self->{line_prev},
2649     column => $self->{column_prev} - 7);
2650     $self->{tainted} = 1;
2651 wakaba 1.8 } else {
2652    
2653 wakaba 1.6 }
2654    
2655 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2656     data => '',
2657     line => $self->{line_prev},
2658     column => $self->{column_prev} - 7};
2659     $self->{state} = CDATA_SECTION_STATE;
2660    
2661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662     $self->{line_prev} = $self->{line};
2663     $self->{column_prev} = $self->{column};
2664     $self->{column}++;
2665     $self->{nc}
2666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667     } else {
2668     $self->{set_nc}->($self);
2669     }
2670    
2671     redo A;
2672     } else {
2673    
2674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2675     line => $self->{line_prev},
2676 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2677 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2678     ## Reconsume.
2679     $self->{ct} = {type => COMMENT_TOKEN,
2680 wakaba 1.12 data => $self->{kwd},
2681 wakaba 1.1 line => $self->{line_prev},
2682 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2683 wakaba 1.1 };
2684     redo A;
2685     }
2686     } elsif ($self->{state} == COMMENT_START_STATE) {
2687     if ($self->{nc} == 0x002D) { # -
2688    
2689     $self->{state} = COMMENT_START_DASH_STATE;
2690    
2691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2692     $self->{line_prev} = $self->{line};
2693     $self->{column_prev} = $self->{column};
2694     $self->{column}++;
2695     $self->{nc}
2696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2697     } else {
2698     $self->{set_nc}->($self);
2699     }
2700    
2701     redo A;
2702     } elsif ($self->{nc} == 0x003E) { # >
2703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2704 wakaba 1.13 if ($self->{in_subset}) {
2705    
2706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707     } else {
2708    
2709     $self->{state} = DATA_STATE;
2710     $self->{s_kwd} = '';
2711     }
2712 wakaba 1.1
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723    
2724     return ($self->{ct}); # comment
2725    
2726     redo A;
2727     } elsif ($self->{nc} == -1) {
2728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729 wakaba 1.13 if ($self->{in_subset}) {
2730    
2731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732     } else {
2733    
2734     $self->{state} = DATA_STATE;
2735     $self->{s_kwd} = '';
2736     }
2737 wakaba 1.1 ## reconsume
2738    
2739     return ($self->{ct}); # comment
2740    
2741     redo A;
2742     } else {
2743    
2744     $self->{ct}->{data} # comment
2745     .= chr ($self->{nc});
2746     $self->{state} = COMMENT_STATE;
2747    
2748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2749     $self->{line_prev} = $self->{line};
2750     $self->{column_prev} = $self->{column};
2751     $self->{column}++;
2752     $self->{nc}
2753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2754     } else {
2755     $self->{set_nc}->($self);
2756     }
2757    
2758     redo A;
2759     }
2760     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2761     if ($self->{nc} == 0x002D) { # -
2762    
2763     $self->{state} = COMMENT_END_STATE;
2764    
2765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2766     $self->{line_prev} = $self->{line};
2767     $self->{column_prev} = $self->{column};
2768     $self->{column}++;
2769     $self->{nc}
2770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2771     } else {
2772     $self->{set_nc}->($self);
2773     }
2774    
2775     redo A;
2776     } elsif ($self->{nc} == 0x003E) { # >
2777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2778 wakaba 1.13 if ($self->{in_subset}) {
2779    
2780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781     } else {
2782    
2783     $self->{state} = DATA_STATE;
2784     $self->{s_kwd} = '';
2785     }
2786 wakaba 1.1
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797    
2798     return ($self->{ct}); # comment
2799    
2800     redo A;
2801     } elsif ($self->{nc} == -1) {
2802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803 wakaba 1.13 if ($self->{in_subset}) {
2804    
2805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806     } else {
2807    
2808     $self->{state} = DATA_STATE;
2809     $self->{s_kwd} = '';
2810     }
2811 wakaba 1.1 ## reconsume
2812    
2813     return ($self->{ct}); # comment
2814    
2815     redo A;
2816     } else {
2817    
2818     $self->{ct}->{data} # comment
2819     .= '-' . chr ($self->{nc});
2820     $self->{state} = COMMENT_STATE;
2821    
2822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2823     $self->{line_prev} = $self->{line};
2824     $self->{column_prev} = $self->{column};
2825     $self->{column}++;
2826     $self->{nc}
2827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2828     } else {
2829     $self->{set_nc}->($self);
2830     }
2831    
2832     redo A;
2833     }
2834     } elsif ($self->{state} == COMMENT_STATE) {
2835 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2836    
2837 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2838    
2839     $self->{state} = COMMENT_END_DASH_STATE;
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     } elsif ($self->{nc} == -1) {
2853     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2854 wakaba 1.13 if ($self->{in_subset}) {
2855    
2856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2857     } else {
2858    
2859     $self->{state} = DATA_STATE;
2860     $self->{s_kwd} = '';
2861     }
2862 wakaba 1.1 ## reconsume
2863    
2864     return ($self->{ct}); # comment
2865    
2866     redo A;
2867     } else {
2868    
2869     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2870     $self->{read_until}->($self->{ct}->{data},
2871     q[-],
2872     length $self->{ct}->{data});
2873    
2874     ## Stay in the state
2875    
2876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877     $self->{line_prev} = $self->{line};
2878     $self->{column_prev} = $self->{column};
2879     $self->{column}++;
2880     $self->{nc}
2881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882     } else {
2883     $self->{set_nc}->($self);
2884     }
2885    
2886     redo A;
2887     }
2888     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2889 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2890 wakaba 1.10
2891 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2892    
2893     $self->{state} = COMMENT_END_STATE;
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } elsif ($self->{nc} == -1) {
2907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2908 wakaba 1.13 if ($self->{in_subset}) {
2909    
2910     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2911     } else {
2912    
2913     $self->{state} = DATA_STATE;
2914     $self->{s_kwd} = '';
2915     }
2916 wakaba 1.1 ## reconsume
2917    
2918     return ($self->{ct}); # comment
2919    
2920     redo A;
2921     } else {
2922    
2923     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2924     $self->{state} = COMMENT_STATE;
2925    
2926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2927     $self->{line_prev} = $self->{line};
2928     $self->{column_prev} = $self->{column};
2929     $self->{column}++;
2930     $self->{nc}
2931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2932     } else {
2933     $self->{set_nc}->($self);
2934     }
2935    
2936     redo A;
2937     }
2938     } elsif ($self->{state} == COMMENT_END_STATE) {
2939 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2940    
2941 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2942 wakaba 1.13 if ($self->{in_subset}) {
2943    
2944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945     } else {
2946    
2947     $self->{state} = DATA_STATE;
2948     $self->{s_kwd} = '';
2949     }
2950 wakaba 1.1
2951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952     $self->{line_prev} = $self->{line};
2953     $self->{column_prev} = $self->{column};
2954     $self->{column}++;
2955     $self->{nc}
2956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2957     } else {
2958     $self->{set_nc}->($self);
2959     }
2960    
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } elsif ($self->{nc} == 0x002D) { # -
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '-'; # comment
2972     ## Stay in the state
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     } elsif ($self->{nc} == -1) {
2986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2987 wakaba 1.13 if ($self->{in_subset}) {
2988    
2989     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2990     } else {
2991    
2992     $self->{state} = DATA_STATE;
2993     $self->{s_kwd} = '';
2994     }
2995 wakaba 1.1 ## reconsume
2996    
2997     return ($self->{ct}); # comment
2998    
2999     redo A;
3000     } else {
3001    
3002     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3003     $self->{state} = COMMENT_STATE;
3004    
3005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3006     $self->{line_prev} = $self->{line};
3007     $self->{column_prev} = $self->{column};
3008     $self->{column}++;
3009     $self->{nc}
3010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3011     } else {
3012     $self->{set_nc}->($self);
3013     }
3014    
3015     redo A;
3016     }
3017     } elsif ($self->{state} == DOCTYPE_STATE) {
3018     if ($is_space->{$self->{nc}}) {
3019    
3020     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3021    
3022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3023     $self->{line_prev} = $self->{line};
3024     $self->{column_prev} = $self->{column};
3025     $self->{column}++;
3026     $self->{nc}
3027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3028     } else {
3029     $self->{set_nc}->($self);
3030     }
3031    
3032     redo A;
3033 wakaba 1.28 } elsif ($self->{nc} == -1) {
3034    
3035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3036     $self->{ct}->{quirks} = 1;
3037    
3038     $self->{state} = DATA_STATE;
3039     ## Reconsume.
3040     return ($self->{ct}); # DOCTYPE (quirks)
3041    
3042     redo A;
3043 wakaba 1.1 } else {
3044    
3045 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3046 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3047     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3048     ## reconsume
3049     redo A;
3050     }
3051     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3052 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3053    
3054 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3055    
3056     ## Stay in the state
3057    
3058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3059     $self->{line_prev} = $self->{line};
3060     $self->{column_prev} = $self->{column};
3061     $self->{column}++;
3062     $self->{nc}
3063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3064     } else {
3065     $self->{set_nc}->($self);
3066     }
3067    
3068     redo A;
3069     } elsif ($self->{nc} == 0x003E) { # >
3070    
3071 wakaba 1.12 ## XML5: No parse error.
3072 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3073     $self->{state} = DATA_STATE;
3074 wakaba 1.5 $self->{s_kwd} = '';
3075 wakaba 1.1
3076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3077     $self->{line_prev} = $self->{line};
3078     $self->{column_prev} = $self->{column};
3079     $self->{column}++;
3080     $self->{nc}
3081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3082     } else {
3083     $self->{set_nc}->($self);
3084     }
3085    
3086    
3087     return ($self->{ct}); # DOCTYPE (quirks)
3088    
3089     redo A;
3090 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3091    
3092     $self->{ct}->{name} # DOCTYPE
3093     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3094     delete $self->{ct}->{quirks};
3095     $self->{state} = DOCTYPE_NAME_STATE;
3096    
3097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3098     $self->{line_prev} = $self->{line};
3099     $self->{column_prev} = $self->{column};
3100     $self->{column}++;
3101     $self->{nc}
3102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3103     } else {
3104     $self->{set_nc}->($self);
3105     }
3106    
3107     redo A;
3108 wakaba 1.1 } elsif ($self->{nc} == -1) {
3109    
3110     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3111     $self->{state} = DATA_STATE;
3112 wakaba 1.5 $self->{s_kwd} = '';
3113 wakaba 1.1 ## reconsume
3114    
3115     return ($self->{ct}); # DOCTYPE (quirks)
3116    
3117     redo A;
3118 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3119    
3120     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3121     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3122 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3123     $self->{in_subset} = 1;
3124 wakaba 1.12
3125     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3126     $self->{line_prev} = $self->{line};
3127     $self->{column_prev} = $self->{column};
3128     $self->{column}++;
3129     $self->{nc}
3130     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3131     } else {
3132     $self->{set_nc}->($self);
3133     }
3134    
3135 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3136 wakaba 1.12 redo A;
3137 wakaba 1.1 } else {
3138    
3139     $self->{ct}->{name} = chr $self->{nc};
3140     delete $self->{ct}->{quirks};
3141     $self->{state} = DOCTYPE_NAME_STATE;
3142    
3143     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3144     $self->{line_prev} = $self->{line};
3145     $self->{column_prev} = $self->{column};
3146     $self->{column}++;
3147     $self->{nc}
3148     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3149     } else {
3150     $self->{set_nc}->($self);
3151     }
3152    
3153     redo A;
3154     }
3155     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3156 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3157    
3158     ## ISSUE: Redundant "First," in the spec.
3159    
3160 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3161    
3162     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3163    
3164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3165     $self->{line_prev} = $self->{line};
3166     $self->{column_prev} = $self->{column};
3167     $self->{column}++;
3168     $self->{nc}
3169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3170     } else {
3171     $self->{set_nc}->($self);
3172     }
3173    
3174     redo A;
3175     } elsif ($self->{nc} == 0x003E) { # >
3176    
3177     $self->{state} = DATA_STATE;
3178 wakaba 1.5 $self->{s_kwd} = '';
3179 wakaba 1.1
3180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3181     $self->{line_prev} = $self->{line};
3182     $self->{column_prev} = $self->{column};
3183     $self->{column}++;
3184     $self->{nc}
3185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3186     } else {
3187     $self->{set_nc}->($self);
3188     }
3189    
3190    
3191     return ($self->{ct}); # DOCTYPE
3192    
3193     redo A;
3194 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3195    
3196     $self->{ct}->{name} # DOCTYPE
3197     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3198     delete $self->{ct}->{quirks};
3199     ## Stay in the state.
3200    
3201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3202     $self->{line_prev} = $self->{line};
3203     $self->{column_prev} = $self->{column};
3204     $self->{column}++;
3205     $self->{nc}
3206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3207     } else {
3208     $self->{set_nc}->($self);
3209     }
3210    
3211     redo A;
3212 wakaba 1.1 } elsif ($self->{nc} == -1) {
3213    
3214     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3215     $self->{state} = DATA_STATE;
3216 wakaba 1.5 $self->{s_kwd} = '';
3217 wakaba 1.1 ## reconsume
3218    
3219     $self->{ct}->{quirks} = 1;
3220     return ($self->{ct}); # DOCTYPE
3221    
3222     redo A;
3223 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3224    
3225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3226 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3227     $self->{in_subset} = 1;
3228 wakaba 1.12
3229     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3230     $self->{line_prev} = $self->{line};
3231     $self->{column_prev} = $self->{column};
3232     $self->{column}++;
3233     $self->{nc}
3234     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3235     } else {
3236     $self->{set_nc}->($self);
3237     }
3238    
3239 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3240 wakaba 1.12 redo A;
3241 wakaba 1.1 } else {
3242    
3243 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3244     ## Stay in the state.
3245 wakaba 1.1
3246     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3247     $self->{line_prev} = $self->{line};
3248     $self->{column_prev} = $self->{column};
3249     $self->{column}++;
3250     $self->{nc}
3251     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3252     } else {
3253     $self->{set_nc}->($self);
3254     }
3255    
3256     redo A;
3257     }
3258     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3259 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3260     ## state", but implemented differently.
3261    
3262 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3263    
3264     ## Stay in the state
3265    
3266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3267     $self->{line_prev} = $self->{line};
3268     $self->{column_prev} = $self->{column};
3269     $self->{column}++;
3270     $self->{nc}
3271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3272     } else {
3273     $self->{set_nc}->($self);
3274     }
3275    
3276     redo A;
3277     } elsif ($self->{nc} == 0x003E) { # >
3278 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3279    
3280     $self->{state} = DATA_STATE;
3281     $self->{s_kwd} = '';
3282     } else {
3283    
3284     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3285     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3286     }
3287 wakaba 1.1
3288    
3289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3290     $self->{line_prev} = $self->{line};
3291     $self->{column_prev} = $self->{column};
3292     $self->{column}++;
3293     $self->{nc}
3294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3295     } else {
3296     $self->{set_nc}->($self);
3297     }
3298    
3299 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3300 wakaba 1.1 redo A;
3301     } elsif ($self->{nc} == -1) {
3302 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3303    
3304     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3305     $self->{state} = DATA_STATE;
3306     $self->{s_kwd} = '';
3307     $self->{ct}->{quirks} = 1;
3308     } else {
3309    
3310     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3311     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3312     }
3313 wakaba 1.1
3314 wakaba 1.16 ## Reconsume.
3315     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3316 wakaba 1.1 redo A;
3317     } elsif ($self->{nc} == 0x0050 or # P
3318     $self->{nc} == 0x0070) { # p
3319 wakaba 1.12
3320 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3321 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3322 wakaba 1.1
3323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3324     $self->{line_prev} = $self->{line};
3325     $self->{column_prev} = $self->{column};
3326     $self->{column}++;
3327     $self->{nc}
3328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3329     } else {
3330     $self->{set_nc}->($self);
3331     }
3332    
3333     redo A;
3334     } elsif ($self->{nc} == 0x0053 or # S
3335     $self->{nc} == 0x0073) { # s
3336 wakaba 1.12
3337 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3338 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3339    
3340     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3341     $self->{line_prev} = $self->{line};
3342     $self->{column_prev} = $self->{column};
3343     $self->{column}++;
3344     $self->{nc}
3345     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3346     } else {
3347     $self->{set_nc}->($self);
3348     }
3349    
3350     redo A;
3351 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3352     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3353     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3354    
3355     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3356     $self->{ct}->{value} = ''; # ENTITY
3357    
3358     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3359     $self->{line_prev} = $self->{line};
3360     $self->{column_prev} = $self->{column};
3361     $self->{column}++;
3362     $self->{nc}
3363     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3364     } else {
3365     $self->{set_nc}->($self);
3366     }
3367    
3368     redo A;
3369     } elsif ($self->{nc} == 0x0027 and # '
3370     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3371     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3372    
3373     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3374     $self->{ct}->{value} = ''; # ENTITY
3375    
3376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377     $self->{line_prev} = $self->{line};
3378     $self->{column_prev} = $self->{column};
3379     $self->{column}++;
3380     $self->{nc}
3381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382     } else {
3383     $self->{set_nc}->($self);
3384     }
3385    
3386     redo A;
3387 wakaba 1.16 } elsif ($self->{is_xml} and
3388     $self->{ct}->{type} == DOCTYPE_TOKEN and
3389     $self->{nc} == 0x005B) { # [
3390 wakaba 1.12
3391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3392     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3393 wakaba 1.13 $self->{in_subset} = 1;
3394 wakaba 1.1
3395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396     $self->{line_prev} = $self->{line};
3397     $self->{column_prev} = $self->{column};
3398     $self->{column}++;
3399     $self->{nc}
3400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401     } else {
3402     $self->{set_nc}->($self);
3403     }
3404    
3405 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3406 wakaba 1.1 redo A;
3407     } else {
3408 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3409    
3410     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3411    
3412     $self->{ct}->{quirks} = 1;
3413     $self->{state} = BOGUS_DOCTYPE_STATE;
3414     } else {
3415    
3416     $self->{state} = BOGUS_MD_STATE;
3417     }
3418 wakaba 1.1
3419    
3420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3421     $self->{line_prev} = $self->{line};
3422     $self->{column_prev} = $self->{column};
3423     $self->{column}++;
3424     $self->{nc}
3425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3426     } else {
3427     $self->{set_nc}->($self);
3428     }
3429    
3430     redo A;
3431     }
3432     } elsif ($self->{state} == PUBLIC_STATE) {
3433     ## ASCII case-insensitive
3434     if ($self->{nc} == [
3435     undef,
3436     0x0055, # U
3437     0x0042, # B
3438     0x004C, # L
3439     0x0049, # I
3440 wakaba 1.12 ]->[length $self->{kwd}] or
3441 wakaba 1.1 $self->{nc} == [
3442     undef,
3443     0x0075, # u
3444     0x0062, # b
3445     0x006C, # l
3446     0x0069, # i
3447 wakaba 1.12 ]->[length $self->{kwd}]) {
3448 wakaba 1.1
3449     ## Stay in the state.
3450 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3451 wakaba 1.1
3452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3453     $self->{line_prev} = $self->{line};
3454     $self->{column_prev} = $self->{column};
3455     $self->{column}++;
3456     $self->{nc}
3457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3458     } else {
3459     $self->{set_nc}->($self);
3460     }
3461    
3462     redo A;
3463 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3464 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3465     $self->{nc} == 0x0063)) { # c
3466 wakaba 1.12 if ($self->{is_xml} and
3467     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3468    
3469     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3470     text => 'PUBLIC',
3471     line => $self->{line_prev},
3472     column => $self->{column_prev} - 4);
3473     } else {
3474    
3475     }
3476 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3477    
3478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3479     $self->{line_prev} = $self->{line};
3480     $self->{column_prev} = $self->{column};
3481     $self->{column}++;
3482     $self->{nc}
3483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3484     } else {
3485     $self->{set_nc}->($self);
3486     }
3487    
3488     redo A;
3489     } else {
3490 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3491 wakaba 1.1 line => $self->{line_prev},
3492 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3493 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3494    
3495     $self->{ct}->{quirks} = 1;
3496     $self->{state} = BOGUS_DOCTYPE_STATE;
3497     } else {
3498    
3499     $self->{state} = BOGUS_MD_STATE;
3500     }
3501 wakaba 1.1 ## Reconsume.
3502     redo A;
3503     }
3504     } elsif ($self->{state} == SYSTEM_STATE) {
3505     ## ASCII case-insensitive
3506     if ($self->{nc} == [
3507     undef,
3508     0x0059, # Y
3509     0x0053, # S
3510     0x0054, # T
3511     0x0045, # E
3512 wakaba 1.12 ]->[length $self->{kwd}] or
3513 wakaba 1.1 $self->{nc} == [
3514     undef,
3515     0x0079, # y
3516     0x0073, # s
3517     0x0074, # t
3518     0x0065, # e
3519 wakaba 1.12 ]->[length $self->{kwd}]) {
3520 wakaba 1.1
3521     ## Stay in the state.
3522 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3523 wakaba 1.1
3524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3525     $self->{line_prev} = $self->{line};
3526     $self->{column_prev} = $self->{column};
3527     $self->{column}++;
3528     $self->{nc}
3529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3530     } else {
3531     $self->{set_nc}->($self);
3532     }
3533    
3534     redo A;
3535 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3536 wakaba 1.1 ($self->{nc} == 0x004D or # M
3537     $self->{nc} == 0x006D)) { # m
3538 wakaba 1.12 if ($self->{is_xml} and
3539     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3540    
3541     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3542     text => 'SYSTEM',
3543     line => $self->{line_prev},
3544     column => $self->{column_prev} - 4);
3545     } else {
3546    
3547     }
3548 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3549    
3550     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3551     $self->{line_prev} = $self->{line};
3552     $self->{column_prev} = $self->{column};
3553     $self->{column}++;
3554     $self->{nc}
3555     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3556     } else {
3557     $self->{set_nc}->($self);
3558     }
3559    
3560     redo A;
3561     } else {
3562 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3563 wakaba 1.1 line => $self->{line_prev},
3564 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3565 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3566    
3567     $self->{ct}->{quirks} = 1;
3568     $self->{state} = BOGUS_DOCTYPE_STATE;
3569     } else {
3570    
3571     $self->{state} = BOGUS_MD_STATE;
3572     }
3573 wakaba 1.1 ## Reconsume.
3574     redo A;
3575     }
3576     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3577     if ($is_space->{$self->{nc}}) {
3578    
3579     ## Stay in the state
3580    
3581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3582     $self->{line_prev} = $self->{line};
3583     $self->{column_prev} = $self->{column};
3584     $self->{column}++;
3585     $self->{nc}
3586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3587     } else {
3588     $self->{set_nc}->($self);
3589     }
3590    
3591     redo A;
3592     } elsif ($self->{nc} eq 0x0022) { # "
3593    
3594     $self->{ct}->{pubid} = ''; # DOCTYPE
3595     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3596    
3597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3598     $self->{line_prev} = $self->{line};
3599     $self->{column_prev} = $self->{column};
3600     $self->{column}++;
3601     $self->{nc}
3602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3603     } else {
3604     $self->{set_nc}->($self);
3605     }
3606    
3607     redo A;
3608     } elsif ($self->{nc} eq 0x0027) { # '
3609    
3610     $self->{ct}->{pubid} = ''; # DOCTYPE
3611     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3612    
3613     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3614     $self->{line_prev} = $self->{line};
3615     $self->{column_prev} = $self->{column};
3616     $self->{column}++;
3617     $self->{nc}
3618     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3619     } else {
3620     $self->{set_nc}->($self);
3621     }
3622    
3623     redo A;
3624     } elsif ($self->{nc} eq 0x003E) { # >
3625 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3626    
3627     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3628    
3629     $self->{state} = DATA_STATE;
3630     $self->{s_kwd} = '';
3631     $self->{ct}->{quirks} = 1;
3632     } else {
3633    
3634     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3635     }
3636 wakaba 1.1
3637    
3638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639     $self->{line_prev} = $self->{line};
3640     $self->{column_prev} = $self->{column};
3641     $self->{column}++;
3642     $self->{nc}
3643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644     } else {
3645     $self->{set_nc}->($self);
3646     }
3647    
3648 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3649 wakaba 1.1 redo A;
3650     } elsif ($self->{nc} == -1) {
3651 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3652    
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3654     $self->{state} = DATA_STATE;
3655     $self->{s_kwd} = '';
3656     $self->{ct}->{quirks} = 1;
3657     } else {
3658    
3659     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3660     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3661     }
3662 wakaba 1.1
3663     ## reconsume
3664     return ($self->{ct}); # DOCTYPE
3665     redo A;
3666 wakaba 1.16 } elsif ($self->{is_xml} and
3667     $self->{ct}->{type} == DOCTYPE_TOKEN and
3668     $self->{nc} == 0x005B) { # [
3669 wakaba 1.12
3670     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3671     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3672     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3673 wakaba 1.13 $self->{in_subset} = 1;
3674 wakaba 1.12
3675     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3676     $self->{line_prev} = $self->{line};
3677     $self->{column_prev} = $self->{column};
3678     $self->{column}++;
3679     $self->{nc}
3680     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3681     } else {
3682     $self->{set_nc}->($self);
3683     }
3684    
3685 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3686 wakaba 1.12 redo A;
3687 wakaba 1.1 } else {
3688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3689    
3690 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3691    
3692     $self->{ct}->{quirks} = 1;
3693     $self->{state} = BOGUS_DOCTYPE_STATE;
3694     } else {
3695    
3696     $self->{state} = BOGUS_MD_STATE;
3697     }
3698    
3699 wakaba 1.1
3700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3701     $self->{line_prev} = $self->{line};
3702     $self->{column_prev} = $self->{column};
3703     $self->{column}++;
3704     $self->{nc}
3705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3706     } else {
3707     $self->{set_nc}->($self);
3708     }
3709    
3710     redo A;
3711     }
3712     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3713     if ($self->{nc} == 0x0022) { # "
3714    
3715     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3716    
3717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3718     $self->{line_prev} = $self->{line};
3719     $self->{column_prev} = $self->{column};
3720     $self->{column}++;
3721     $self->{nc}
3722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3723     } else {
3724     $self->{set_nc}->($self);
3725     }
3726    
3727     redo A;
3728     } elsif ($self->{nc} == 0x003E) { # >
3729     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3730    
3731 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3732    
3733     $self->{state} = DATA_STATE;
3734     $self->{s_kwd} = '';
3735     $self->{ct}->{quirks} = 1;
3736     } else {
3737    
3738     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3739     }
3740    
3741 wakaba 1.1
3742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3743     $self->{line_prev} = $self->{line};
3744     $self->{column_prev} = $self->{column};
3745     $self->{column}++;
3746     $self->{nc}
3747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3748     } else {
3749     $self->{set_nc}->($self);
3750     }
3751    
3752 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3753 wakaba 1.1 redo A;
3754     } elsif ($self->{nc} == -1) {
3755     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3756    
3757 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3758    
3759     $self->{state} = DATA_STATE;
3760     $self->{s_kwd} = '';
3761     $self->{ct}->{quirks} = 1;
3762     } else {
3763    
3764     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3765     }
3766    
3767     ## Reconsume.
3768 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3769     redo A;
3770     } else {
3771    
3772 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3773 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3774     length $self->{ct}->{pubid});
3775    
3776     ## Stay in the state
3777    
3778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779     $self->{line_prev} = $self->{line};
3780     $self->{column_prev} = $self->{column};
3781     $self->{column}++;
3782     $self->{nc}
3783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784     } else {
3785     $self->{set_nc}->($self);
3786     }
3787    
3788     redo A;
3789     }
3790     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3791     if ($self->{nc} == 0x0027) { # '
3792    
3793     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3794    
3795     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3796     $self->{line_prev} = $self->{line};
3797     $self->{column_prev} = $self->{column};
3798     $self->{column}++;
3799     $self->{nc}
3800     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3801     } else {
3802     $self->{set_nc}->($self);
3803     }
3804    
3805     redo A;
3806     } elsif ($self->{nc} == 0x003E) { # >
3807     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3808    
3809 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3810    
3811     $self->{state} = DATA_STATE;
3812     $self->{s_kwd} = '';
3813     $self->{ct}->{quirks} = 1;
3814     } else {
3815    
3816     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3817     }
3818    
3819 wakaba 1.1
3820     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3821     $self->{line_prev} = $self->{line};
3822     $self->{column_prev} = $self->{column};
3823     $self->{column}++;
3824     $self->{nc}
3825     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3826     } else {
3827     $self->{set_nc}->($self);
3828     }
3829    
3830 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3831 wakaba 1.1 redo A;
3832     } elsif ($self->{nc} == -1) {
3833     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3834    
3835 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3836    
3837     $self->{state} = DATA_STATE;
3838     $self->{s_kwd} = '';
3839     $self->{ct}->{quirks} = 1;
3840     } else {
3841    
3842     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3843     }
3844    
3845 wakaba 1.1 ## reconsume
3846 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3847 wakaba 1.1 redo A;
3848     } else {
3849    
3850 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3851 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3852     length $self->{ct}->{pubid});
3853    
3854     ## Stay in the state
3855    
3856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3857     $self->{line_prev} = $self->{line};
3858     $self->{column_prev} = $self->{column};
3859     $self->{column}++;
3860     $self->{nc}
3861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3862     } else {
3863     $self->{set_nc}->($self);
3864     }
3865    
3866     redo A;
3867     }
3868     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3869     if ($is_space->{$self->{nc}}) {
3870    
3871     ## Stay in the state
3872    
3873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3874     $self->{line_prev} = $self->{line};
3875     $self->{column_prev} = $self->{column};
3876     $self->{column}++;
3877     $self->{nc}
3878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3879     } else {
3880     $self->{set_nc}->($self);
3881     }
3882    
3883     redo A;
3884     } elsif ($self->{nc} == 0x0022) { # "
3885    
3886 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3887 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3888    
3889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3890     $self->{line_prev} = $self->{line};
3891     $self->{column_prev} = $self->{column};
3892     $self->{column}++;
3893     $self->{nc}
3894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3895     } else {
3896     $self->{set_nc}->($self);
3897     }
3898    
3899     redo A;
3900     } elsif ($self->{nc} == 0x0027) { # '
3901    
3902 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3903 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3904    
3905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3906     $self->{line_prev} = $self->{line};
3907     $self->{column_prev} = $self->{column};
3908     $self->{column}++;
3909     $self->{nc}
3910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3911     } else {
3912     $self->{set_nc}->($self);
3913     }
3914    
3915     redo A;
3916     } elsif ($self->{nc} == 0x003E) { # >
3917 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3918     if ($self->{is_xml}) {
3919    
3920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3921     } else {
3922    
3923     }
3924     $self->{state} = DATA_STATE;
3925     $self->{s_kwd} = '';
3926 wakaba 1.12 } else {
3927 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3928    
3929     } else {
3930    
3931     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3932     }
3933     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3934 wakaba 1.12 }
3935 wakaba 1.16
3936 wakaba 1.1
3937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3938     $self->{line_prev} = $self->{line};
3939     $self->{column_prev} = $self->{column};
3940     $self->{column}++;
3941     $self->{nc}
3942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3943     } else {
3944     $self->{set_nc}->($self);
3945     }
3946    
3947 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3948 wakaba 1.1 redo A;
3949     } elsif ($self->{nc} == -1) {
3950 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3951    
3952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3953    
3954     $self->{state} = DATA_STATE;
3955     $self->{s_kwd} = '';
3956     $self->{ct}->{quirks} = 1;
3957     } else {
3958     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3959     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3960     }
3961 wakaba 1.1
3962     ## reconsume
3963 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3964 wakaba 1.1 redo A;
3965 wakaba 1.16 } elsif ($self->{is_xml} and
3966     $self->{ct}->{type} == DOCTYPE_TOKEN and
3967     $self->{nc} == 0x005B) { # [
3968 wakaba 1.12
3969     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3970     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3971     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3972 wakaba 1.13 $self->{in_subset} = 1;
3973 wakaba 1.12
3974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3975     $self->{line_prev} = $self->{line};
3976     $self->{column_prev} = $self->{column};
3977     $self->{column}++;
3978     $self->{nc}
3979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3980     } else {
3981     $self->{set_nc}->($self);
3982     }
3983    
3984 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3985 wakaba 1.12 redo A;
3986 wakaba 1.1 } else {
3987     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3988    
3989 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3990    
3991     $self->{ct}->{quirks} = 1;
3992     $self->{state} = BOGUS_DOCTYPE_STATE;
3993     } else {
3994    
3995     $self->{state} = BOGUS_MD_STATE;
3996     }
3997    
3998 wakaba 1.1
3999     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4000     $self->{line_prev} = $self->{line};
4001     $self->{column_prev} = $self->{column};
4002     $self->{column}++;
4003     $self->{nc}
4004     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4005     } else {
4006     $self->{set_nc}->($self);
4007     }
4008    
4009     redo A;
4010     }
4011     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4012     if ($is_space->{$self->{nc}}) {
4013    
4014     ## Stay in the state
4015    
4016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4017     $self->{line_prev} = $self->{line};
4018     $self->{column_prev} = $self->{column};
4019     $self->{column}++;
4020     $self->{nc}
4021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4022     } else {
4023     $self->{set_nc}->($self);
4024     }
4025    
4026     redo A;
4027     } elsif ($self->{nc} == 0x0022) { # "
4028    
4029     $self->{ct}->{sysid} = ''; # DOCTYPE
4030     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4031    
4032     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4033     $self->{line_prev} = $self->{line};
4034     $self->{column_prev} = $self->{column};
4035     $self->{column}++;
4036     $self->{nc}
4037     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4038     } else {
4039     $self->{set_nc}->($self);
4040     }
4041    
4042     redo A;
4043     } elsif ($self->{nc} == 0x0027) { # '
4044    
4045     $self->{ct}->{sysid} = ''; # DOCTYPE
4046     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4047    
4048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4049     $self->{line_prev} = $self->{line};
4050     $self->{column_prev} = $self->{column};
4051     $self->{column}++;
4052     $self->{nc}
4053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4054     } else {
4055     $self->{set_nc}->($self);
4056     }
4057    
4058     redo A;
4059     } elsif ($self->{nc} == 0x003E) { # >
4060     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4061    
4062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4063     $self->{line_prev} = $self->{line};
4064     $self->{column_prev} = $self->{column};
4065     $self->{column}++;
4066     $self->{nc}
4067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4068     } else {
4069     $self->{set_nc}->($self);
4070     }
4071    
4072    
4073 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4074    
4075     $self->{state} = DATA_STATE;
4076     $self->{s_kwd} = '';
4077     $self->{ct}->{quirks} = 1;
4078     } else {
4079    
4080     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4081     }
4082 wakaba 1.1
4083 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4084 wakaba 1.1 redo A;
4085     } elsif ($self->{nc} == -1) {
4086 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4087    
4088     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4089     $self->{state} = DATA_STATE;
4090     $self->{s_kwd} = '';
4091     $self->{ct}->{quirks} = 1;
4092     } else {
4093    
4094     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4095     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4096     }
4097 wakaba 1.1
4098     ## reconsume
4099 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4100 wakaba 1.1 redo A;
4101 wakaba 1.16 } elsif ($self->{is_xml} and
4102     $self->{ct}->{type} == DOCTYPE_TOKEN and
4103     $self->{nc} == 0x005B) { # [
4104 wakaba 1.12
4105     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4106    
4107     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4108     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4109 wakaba 1.13 $self->{in_subset} = 1;
4110 wakaba 1.12
4111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4112     $self->{line_prev} = $self->{line};
4113     $self->{column_prev} = $self->{column};
4114     $self->{column}++;
4115     $self->{nc}
4116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4117     } else {
4118     $self->{set_nc}->($self);
4119     }
4120    
4121 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4122 wakaba 1.12 redo A;
4123 wakaba 1.1 } else {
4124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4125    
4126 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4127    
4128     $self->{ct}->{quirks} = 1;
4129     $self->{state} = BOGUS_DOCTYPE_STATE;
4130     } else {
4131    
4132     $self->{state} = BOGUS_MD_STATE;
4133     }
4134    
4135 wakaba 1.1
4136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4137     $self->{line_prev} = $self->{line};
4138     $self->{column_prev} = $self->{column};
4139     $self->{column}++;
4140     $self->{nc}
4141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4142     } else {
4143     $self->{set_nc}->($self);
4144     }
4145    
4146     redo A;
4147     }
4148     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4149     if ($self->{nc} == 0x0022) { # "
4150    
4151     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4152    
4153     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4154     $self->{line_prev} = $self->{line};
4155     $self->{column_prev} = $self->{column};
4156     $self->{column}++;
4157     $self->{nc}
4158     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4159     } else {
4160     $self->{set_nc}->($self);
4161     }
4162    
4163     redo A;
4164 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4165 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4166    
4167 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4168    
4169     $self->{state} = DATA_STATE;
4170     $self->{s_kwd} = '';
4171     $self->{ct}->{quirks} = 1;
4172     } else {
4173    
4174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4175     }
4176    
4177 wakaba 1.1
4178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4179     $self->{line_prev} = $self->{line};
4180     $self->{column_prev} = $self->{column};
4181     $self->{column}++;
4182     $self->{nc}
4183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4184     } else {
4185     $self->{set_nc}->($self);
4186     }
4187    
4188 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4189 wakaba 1.1 redo A;
4190     } elsif ($self->{nc} == -1) {
4191     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4192    
4193 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4194    
4195     $self->{state} = DATA_STATE;
4196     $self->{s_kwd} = '';
4197     $self->{ct}->{quirks} = 1;
4198     } else {
4199    
4200     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4201     }
4202    
4203 wakaba 1.1 ## reconsume
4204 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4205 wakaba 1.1 redo A;
4206     } else {
4207    
4208 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4209 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4210     length $self->{ct}->{sysid});
4211    
4212     ## Stay in the state
4213    
4214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4215     $self->{line_prev} = $self->{line};
4216     $self->{column_prev} = $self->{column};
4217     $self->{column}++;
4218     $self->{nc}
4219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4220     } else {
4221     $self->{set_nc}->($self);
4222     }
4223    
4224     redo A;
4225     }
4226     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4227     if ($self->{nc} == 0x0027) { # '
4228    
4229     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4230    
4231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232     $self->{line_prev} = $self->{line};
4233     $self->{column_prev} = $self->{column};
4234     $self->{column}++;
4235     $self->{nc}
4236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4237     } else {
4238     $self->{set_nc}->($self);
4239     }
4240    
4241     redo A;
4242 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4243 wakaba 1.1
4244     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4245    
4246     $self->{state} = DATA_STATE;
4247 wakaba 1.5 $self->{s_kwd} = '';
4248 wakaba 1.1
4249     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4250     $self->{line_prev} = $self->{line};
4251     $self->{column_prev} = $self->{column};
4252     $self->{column}++;
4253     $self->{nc}
4254     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4255     } else {
4256     $self->{set_nc}->($self);
4257     }
4258    
4259    
4260     $self->{ct}->{quirks} = 1;
4261     return ($self->{ct}); # DOCTYPE
4262    
4263     redo A;
4264     } elsif ($self->{nc} == -1) {
4265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4266    
4267 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4268    
4269     $self->{state} = DATA_STATE;
4270     $self->{s_kwd} = '';
4271     $self->{ct}->{quirks} = 1;
4272     } else {
4273    
4274     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275     }
4276    
4277 wakaba 1.1 ## reconsume
4278 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4279 wakaba 1.1 redo A;
4280     } else {
4281    
4282 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4283 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4284     length $self->{ct}->{sysid});
4285    
4286     ## Stay in the state
4287    
4288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4289     $self->{line_prev} = $self->{line};
4290     $self->{column_prev} = $self->{column};
4291     $self->{column}++;
4292     $self->{nc}
4293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4294     } else {
4295     $self->{set_nc}->($self);
4296     }
4297    
4298     redo A;
4299     }
4300     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4301     if ($is_space->{$self->{nc}}) {
4302 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4303    
4304     $self->{state} = BEFORE_NDATA_STATE;
4305     } else {
4306    
4307     ## Stay in the state
4308     }
4309 wakaba 1.1
4310     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4311     $self->{line_prev} = $self->{line};
4312     $self->{column_prev} = $self->{column};
4313     $self->{column}++;
4314     $self->{nc}
4315     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4316     } else {
4317     $self->{set_nc}->($self);
4318     }
4319    
4320     redo A;
4321     } elsif ($self->{nc} == 0x003E) { # >
4322 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4323    
4324     $self->{state} = DATA_STATE;
4325     $self->{s_kwd} = '';
4326     } else {
4327    
4328     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4329     }
4330    
4331 wakaba 1.1
4332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4333     $self->{line_prev} = $self->{line};
4334     $self->{column_prev} = $self->{column};
4335     $self->{column}++;
4336     $self->{nc}
4337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4338     } else {
4339     $self->{set_nc}->($self);
4340     }
4341    
4342 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4343 wakaba 1.1 redo A;
4344 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4345     ($self->{nc} == 0x004E or # N
4346     $self->{nc} == 0x006E)) { # n
4347    
4348     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4349     $self->{state} = NDATA_STATE;
4350     $self->{kwd} = chr $self->{nc};
4351    
4352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4353     $self->{line_prev} = $self->{line};
4354     $self->{column_prev} = $self->{column};
4355     $self->{column}++;
4356     $self->{nc}
4357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4358     } else {
4359     $self->{set_nc}->($self);
4360     }
4361    
4362     redo A;
4363 wakaba 1.1 } elsif ($self->{nc} == -1) {
4364 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4365    
4366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4367     $self->{state} = DATA_STATE;
4368     $self->{s_kwd} = '';
4369     $self->{ct}->{quirks} = 1;
4370     } else {
4371    
4372     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4373     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4374     }
4375    
4376 wakaba 1.1 ## reconsume
4377 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4378 wakaba 1.1 redo A;
4379 wakaba 1.16 } elsif ($self->{is_xml} and
4380     $self->{ct}->{type} == DOCTYPE_TOKEN and
4381     $self->{nc} == 0x005B) { # [
4382 wakaba 1.12
4383     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4384     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4385 wakaba 1.13 $self->{in_subset} = 1;
4386 wakaba 1.12
4387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4388     $self->{line_prev} = $self->{line};
4389     $self->{column_prev} = $self->{column};
4390     $self->{column}++;
4391     $self->{nc}
4392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4393     } else {
4394     $self->{set_nc}->($self);
4395     }
4396    
4397 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4398 wakaba 1.12 redo A;
4399 wakaba 1.1 } else {
4400     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4401    
4402 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4403    
4404     #$self->{ct}->{quirks} = 1;
4405     $self->{state} = BOGUS_DOCTYPE_STATE;
4406     } else {
4407    
4408     $self->{state} = BOGUS_MD_STATE;
4409     }
4410    
4411 wakaba 1.1
4412     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4413     $self->{line_prev} = $self->{line};
4414     $self->{column_prev} = $self->{column};
4415     $self->{column}++;
4416     $self->{nc}
4417     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4418     } else {
4419     $self->{set_nc}->($self);
4420     }
4421    
4422     redo A;
4423     }
4424 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4425     if ($is_space->{$self->{nc}}) {
4426    
4427     ## Stay in the state.
4428    
4429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4430     $self->{line_prev} = $self->{line};
4431     $self->{column_prev} = $self->{column};
4432     $self->{column}++;
4433     $self->{nc}
4434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4435     } else {
4436     $self->{set_nc}->($self);
4437     }
4438    
4439     redo A;
4440     } elsif ($self->{nc} == 0x003E) { # >
4441    
4442     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4443    
4444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4445     $self->{line_prev} = $self->{line};
4446     $self->{column_prev} = $self->{column};
4447     $self->{column}++;
4448     $self->{nc}
4449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4450     } else {
4451     $self->{set_nc}->($self);
4452     }
4453    
4454     return ($self->{ct}); # ENTITY
4455     redo A;
4456     } elsif ($self->{nc} == 0x004E or # N
4457     $self->{nc} == 0x006E) { # n
4458    
4459     $self->{state} = NDATA_STATE;
4460     $self->{kwd} = chr $self->{nc};
4461    
4462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4463     $self->{line_prev} = $self->{line};
4464     $self->{column_prev} = $self->{column};
4465     $self->{column}++;
4466     $self->{nc}
4467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4468     } else {
4469     $self->{set_nc}->($self);
4470     }
4471    
4472     redo A;
4473     } elsif ($self->{nc} == -1) {
4474    
4475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4476     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4477     ## reconsume
4478     return ($self->{ct}); # ENTITY
4479     redo A;
4480     } else {
4481    
4482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4483     $self->{state} = BOGUS_MD_STATE;
4484    
4485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4486     $self->{line_prev} = $self->{line};
4487     $self->{column_prev} = $self->{column};
4488     $self->{column}++;
4489     $self->{nc}
4490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4491     } else {
4492     $self->{set_nc}->($self);
4493     }
4494    
4495     redo A;
4496     }
4497 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4498     if ($self->{nc} == 0x003E) { # >
4499    
4500     $self->{state} = DATA_STATE;
4501 wakaba 1.5 $self->{s_kwd} = '';
4502 wakaba 1.1
4503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4504     $self->{line_prev} = $self->{line};
4505     $self->{column_prev} = $self->{column};
4506     $self->{column}++;
4507     $self->{nc}
4508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4509     } else {
4510     $self->{set_nc}->($self);
4511     }
4512    
4513    
4514     return ($self->{ct}); # DOCTYPE
4515    
4516     redo A;
4517 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4518 wakaba 1.13
4519     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4520     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4521     $self->{in_subset} = 1;
4522    
4523 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4524     $self->{line_prev} = $self->{line};
4525     $self->{column_prev} = $self->{column};
4526     $self->{column}++;
4527     $self->{nc}
4528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4529     } else {
4530     $self->{set_nc}->($self);
4531     }
4532    
4533 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4534     redo A;
4535 wakaba 1.1 } elsif ($self->{nc} == -1) {
4536    
4537     $self->{state} = DATA_STATE;
4538 wakaba 1.5 $self->{s_kwd} = '';
4539 wakaba 1.1 ## reconsume
4540    
4541     return ($self->{ct}); # DOCTYPE
4542    
4543     redo A;
4544     } else {
4545    
4546     my $s = '';
4547 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4548 wakaba 1.1
4549     ## Stay in the state
4550    
4551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4552     $self->{line_prev} = $self->{line};
4553     $self->{column_prev} = $self->{column};
4554     $self->{column}++;
4555     $self->{nc}
4556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4557     } else {
4558     $self->{set_nc}->($self);
4559     }
4560    
4561     redo A;
4562     }
4563     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4564     ## NOTE: "CDATA section state" in the spec is jointly implemented
4565     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4566     ## and |CDATA_SECTION_MSE2_STATE|.
4567 wakaba 1.10
4568     ## XML5: "CDATA state".
4569 wakaba 1.1
4570     if ($self->{nc} == 0x005D) { # ]
4571    
4572     $self->{state} = CDATA_SECTION_MSE1_STATE;
4573    
4574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4575     $self->{line_prev} = $self->{line};
4576     $self->{column_prev} = $self->{column};
4577     $self->{column}++;
4578     $self->{nc}
4579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4580     } else {
4581     $self->{set_nc}->($self);
4582     }
4583    
4584     redo A;
4585     } elsif ($self->{nc} == -1) {
4586 wakaba 1.6 if ($self->{is_xml}) {
4587 wakaba 1.8
4588 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4589 wakaba 1.8 } else {
4590    
4591 wakaba 1.6 }
4592    
4593 wakaba 1.1 $self->{state} = DATA_STATE;
4594 wakaba 1.5 $self->{s_kwd} = '';
4595 wakaba 1.10 ## Reconsume.
4596 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4597    
4598     return ($self->{ct}); # character
4599     } else {
4600    
4601     ## No token to emit. $self->{ct} is discarded.
4602     }
4603     redo A;
4604     } else {
4605    
4606     $self->{ct}->{data} .= chr $self->{nc};
4607     $self->{read_until}->($self->{ct}->{data},
4608     q<]>,
4609     length $self->{ct}->{data});
4610    
4611     ## Stay in the state.
4612    
4613     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4614     $self->{line_prev} = $self->{line};
4615     $self->{column_prev} = $self->{column};
4616     $self->{column}++;
4617     $self->{nc}
4618     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4619     } else {
4620     $self->{set_nc}->($self);
4621     }
4622    
4623     redo A;
4624     }
4625    
4626     ## ISSUE: "text tokens" in spec.
4627     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4628 wakaba 1.10 ## XML5: "CDATA bracket state".
4629    
4630 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4631    
4632     $self->{state} = CDATA_SECTION_MSE2_STATE;
4633    
4634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4635     $self->{line_prev} = $self->{line};
4636     $self->{column_prev} = $self->{column};
4637     $self->{column}++;
4638     $self->{nc}
4639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4640     } else {
4641     $self->{set_nc}->($self);
4642     }
4643    
4644     redo A;
4645     } else {
4646    
4647 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4648 wakaba 1.1 $self->{ct}->{data} .= ']';
4649 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4650 wakaba 1.1 ## Reconsume.
4651     redo A;
4652     }
4653     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4654 wakaba 1.10 ## XML5: "CDATA end state".
4655    
4656 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4657     $self->{state} = DATA_STATE;
4658 wakaba 1.5 $self->{s_kwd} = '';
4659 wakaba 1.1
4660     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4661     $self->{line_prev} = $self->{line};
4662     $self->{column_prev} = $self->{column};
4663     $self->{column}++;
4664     $self->{nc}
4665     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4666     } else {
4667     $self->{set_nc}->($self);
4668     }
4669    
4670     if (length $self->{ct}->{data}) { # character
4671    
4672     return ($self->{ct}); # character
4673     } else {
4674    
4675     ## No token to emit. $self->{ct} is discarded.
4676     }
4677     redo A;
4678     } elsif ($self->{nc} == 0x005D) { # ]
4679     # character
4680     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4681     ## Stay in the state.
4682    
4683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4684     $self->{line_prev} = $self->{line};
4685     $self->{column_prev} = $self->{column};
4686     $self->{column}++;
4687     $self->{nc}
4688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4689     } else {
4690     $self->{set_nc}->($self);
4691     }
4692    
4693     redo A;
4694     } else {
4695    
4696     $self->{ct}->{data} .= ']]'; # character
4697     $self->{state} = CDATA_SECTION_STATE;
4698 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4699 wakaba 1.1 redo A;
4700     }
4701     } elsif ($self->{state} == ENTITY_STATE) {
4702     if ($is_space->{$self->{nc}} or
4703     {
4704     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4705     $self->{entity_add} => 1,
4706     }->{$self->{nc}}) {
4707 wakaba 1.22 if ($self->{is_xml}) {
4708    
4709     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4710     line => $self->{line_prev},
4711     column => $self->{column_prev}
4712     + ($self->{nc} == -1 ? 1 : 0));
4713     } else {
4714    
4715     ## No error
4716     }
4717 wakaba 1.1 ## Don't consume
4718     ## Return nothing.
4719     #
4720     } elsif ($self->{nc} == 0x0023) { # #
4721    
4722     $self->{state} = ENTITY_HASH_STATE;
4723 wakaba 1.12 $self->{kwd} = '#';
4724 wakaba 1.1
4725     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4726     $self->{line_prev} = $self->{line};
4727     $self->{column_prev} = $self->{column};
4728     $self->{column}++;
4729     $self->{nc}
4730     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4731     } else {
4732     $self->{set_nc}->($self);
4733     }
4734    
4735     redo A;
4736 wakaba 1.22 } elsif ($self->{is_xml} or
4737     (0x0041 <= $self->{nc} and
4738 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4739     (0x0061 <= $self->{nc} and
4740     $self->{nc} <= 0x007A)) { # a..z
4741    
4742     require Whatpm::_NamedEntityList;
4743     $self->{state} = ENTITY_NAME_STATE;
4744 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4745     $self->{entity__value} = $self->{kwd};
4746 wakaba 1.1 $self->{entity__match} = 0;
4747    
4748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4749     $self->{line_prev} = $self->{line};
4750     $self->{column_prev} = $self->{column};
4751     $self->{column}++;
4752     $self->{nc}
4753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4754     } else {
4755     $self->{set_nc}->($self);
4756     }
4757    
4758     redo A;
4759     } else {
4760    
4761     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4762     ## Return nothing.
4763     #
4764     }
4765    
4766     ## NOTE: No character is consumed by the "consume a character
4767     ## reference" algorithm. In other words, there is an "&" character
4768     ## that does not introduce a character reference, which would be
4769     ## appended to the parent element or the attribute value in later
4770     ## process of the tokenizer.
4771    
4772     if ($self->{prev_state} == DATA_STATE) {
4773    
4774     $self->{state} = $self->{prev_state};
4775 wakaba 1.5 $self->{s_kwd} = '';
4776 wakaba 1.1 ## Reconsume.
4777     return ({type => CHARACTER_TOKEN, data => '&',
4778     line => $self->{line_prev},
4779     column => $self->{column_prev},
4780     });
4781     redo A;
4782     } else {
4783    
4784     $self->{ca}->{value} .= '&';
4785     $self->{state} = $self->{prev_state};
4786 wakaba 1.5 $self->{s_kwd} = '';
4787 wakaba 1.1 ## Reconsume.
4788     redo A;
4789     }
4790     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4791 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4792 wakaba 1.1
4793     $self->{state} = HEXREF_X_STATE;
4794 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4795 wakaba 1.1
4796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4797     $self->{line_prev} = $self->{line};
4798     $self->{column_prev} = $self->{column};
4799     $self->{column}++;
4800     $self->{nc}
4801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4802     } else {
4803     $self->{set_nc}->($self);
4804     }
4805    
4806     redo A;
4807 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4808    
4809     if ($self->{is_xml}) {
4810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4811     }
4812     $self->{state} = HEXREF_X_STATE;
4813     $self->{kwd} .= chr $self->{nc};
4814    
4815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4816     $self->{line_prev} = $self->{line};
4817     $self->{column_prev} = $self->{column};
4818     $self->{column}++;
4819     $self->{nc}
4820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4821     } else {
4822     $self->{set_nc}->($self);
4823     }
4824    
4825     redo A;
4826 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4827     $self->{nc} <= 0x0039) { # 0..9
4828    
4829     $self->{state} = NCR_NUM_STATE;
4830 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4831 wakaba 1.1
4832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4833     $self->{line_prev} = $self->{line};
4834     $self->{column_prev} = $self->{column};
4835     $self->{column}++;
4836     $self->{nc}
4837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4838     } else {
4839     $self->{set_nc}->($self);
4840     }
4841    
4842     redo A;
4843     } else {
4844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4845     line => $self->{line_prev},
4846     column => $self->{column_prev} - 1);
4847    
4848     ## NOTE: According to the spec algorithm, nothing is returned,
4849     ## and then "&#" is appended to the parent element or the attribute
4850     ## value in the later processing.
4851    
4852     if ($self->{prev_state} == DATA_STATE) {
4853    
4854     $self->{state} = $self->{prev_state};
4855 wakaba 1.5 $self->{s_kwd} = '';
4856 wakaba 1.1 ## Reconsume.
4857     return ({type => CHARACTER_TOKEN,
4858     data => '&#',
4859     line => $self->{line_prev},
4860     column => $self->{column_prev} - 1,
4861     });
4862     redo A;
4863     } else {
4864    
4865     $self->{ca}->{value} .= '&#';
4866     $self->{state} = $self->{prev_state};
4867 wakaba 1.5 $self->{s_kwd} = '';
4868 wakaba 1.1 ## Reconsume.
4869     redo A;
4870     }
4871     }
4872     } elsif ($self->{state} == NCR_NUM_STATE) {
4873     if (0x0030 <= $self->{nc} and
4874     $self->{nc} <= 0x0039) { # 0..9
4875    
4876 wakaba 1.12 $self->{kwd} *= 10;
4877     $self->{kwd} += $self->{nc} - 0x0030;
4878 wakaba 1.1
4879     ## Stay in the state.
4880    
4881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4882     $self->{line_prev} = $self->{line};
4883     $self->{column_prev} = $self->{column};
4884     $self->{column}++;
4885     $self->{nc}
4886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4887     } else {
4888     $self->{set_nc}->($self);
4889     }
4890    
4891     redo A;
4892     } elsif ($self->{nc} == 0x003B) { # ;
4893    
4894    
4895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4896     $self->{line_prev} = $self->{line};
4897     $self->{column_prev} = $self->{column};
4898     $self->{column}++;
4899     $self->{nc}
4900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4901     } else {
4902     $self->{set_nc}->($self);
4903     }
4904    
4905     #
4906     } else {
4907    
4908     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4909     ## Reconsume.
4910     #
4911     }
4912    
4913 wakaba 1.12 my $code = $self->{kwd};
4914 wakaba 1.1 my $l = $self->{line_prev};
4915     my $c = $self->{column_prev};
4916 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4917     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4918     ($self->{is_xml} and $code == 0x0000)) {
4919 wakaba 1.1
4920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4921     text => (sprintf 'U+%04X', $code),
4922     line => $l, column => $c);
4923     $code = $charref_map->{$code};
4924     } elsif ($code > 0x10FFFF) {
4925    
4926     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4927     text => (sprintf 'U-%08X', $code),
4928     line => $l, column => $c);
4929     $code = 0xFFFD;
4930     }
4931    
4932     if ($self->{prev_state} == DATA_STATE) {
4933    
4934     $self->{state} = $self->{prev_state};
4935 wakaba 1.5 $self->{s_kwd} = '';
4936 wakaba 1.1 ## Reconsume.
4937     return ({type => CHARACTER_TOKEN, data => chr $code,
4938 wakaba 1.7 has_reference => 1,
4939 wakaba 1.1 line => $l, column => $c,
4940     });
4941     redo A;
4942     } else {
4943    
4944     $self->{ca}->{value} .= chr $code;
4945     $self->{ca}->{has_reference} = 1;
4946     $self->{state} = $self->{prev_state};
4947 wakaba 1.5 $self->{s_kwd} = '';
4948 wakaba 1.1 ## Reconsume.
4949     redo A;
4950     }
4951     } elsif ($self->{state} == HEXREF_X_STATE) {
4952     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4953     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4954     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4955     # 0..9, A..F, a..f
4956    
4957     $self->{state} = HEXREF_HEX_STATE;
4958 wakaba 1.12 $self->{kwd} = 0;
4959 wakaba 1.1 ## Reconsume.
4960     redo A;
4961     } else {
4962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4963     line => $self->{line_prev},
4964     column => $self->{column_prev} - 2);
4965    
4966     ## NOTE: According to the spec algorithm, nothing is returned,
4967     ## and then "&#" followed by "X" or "x" is appended to the parent
4968     ## element or the attribute value in the later processing.
4969    
4970     if ($self->{prev_state} == DATA_STATE) {
4971    
4972     $self->{state} = $self->{prev_state};
4973 wakaba 1.5 $self->{s_kwd} = '';
4974 wakaba 1.1 ## Reconsume.
4975     return ({type => CHARACTER_TOKEN,
4976 wakaba 1.12 data => '&' . $self->{kwd},
4977 wakaba 1.1 line => $self->{line_prev},
4978 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4979 wakaba 1.1 });
4980     redo A;
4981     } else {
4982    
4983 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4984 wakaba 1.1 $self->{state} = $self->{prev_state};
4985 wakaba 1.5 $self->{s_kwd} = '';
4986 wakaba 1.1 ## Reconsume.
4987     redo A;
4988     }
4989     }
4990     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4991     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4992     # 0..9
4993    
4994 wakaba 1.12 $self->{kwd} *= 0x10;
4995     $self->{kwd} += $self->{nc} - 0x0030;
4996 wakaba 1.1 ## Stay in the state.
4997    
4998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4999     $self->{line_prev} = $self->{line};
5000     $self->{column_prev} = $self->{column};
5001     $self->{column}++;
5002     $self->{nc}
5003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5004     } else {
5005     $self->{set_nc}->($self);
5006     }
5007    
5008     redo A;
5009     } elsif (0x0061 <= $self->{nc} and
5010     $self->{nc} <= 0x0066) { # a..f
5011    
5012 wakaba 1.12 $self->{kwd} *= 0x10;
5013     $self->{kwd} += $self->{nc} - 0x0060 + 9;
5014 wakaba 1.1 ## Stay in the state.
5015    
5016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5017     $self->{line_prev} = $self->{line};
5018     $self->{column_prev} = $self->{column};
5019     $self->{column}++;
5020     $self->{nc}
5021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5022     } else {
5023     $self->{set_nc}->($self);
5024     }
5025    
5026     redo A;
5027     } elsif (0x0041 <= $self->{nc} and
5028     $self->{nc} <= 0x0046) { # A..F
5029    
5030 wakaba 1.12 $self->{kwd} *= 0x10;
5031     $self->{kwd} += $self->{nc} - 0x0040 + 9;
5032 wakaba 1.1 ## Stay in the state.
5033    
5034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5035     $self->{line_prev} = $self->{line};
5036     $self->{column_prev} = $self->{column};
5037     $self->{column}++;
5038     $self->{nc}
5039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5040     } else {
5041     $self->{set_nc}->($self);
5042     }
5043    
5044     redo A;
5045     } elsif ($self->{nc} == 0x003B) { # ;
5046    
5047    
5048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5049     $self->{line_prev} = $self->{line};
5050     $self->{column_prev} = $self->{column};
5051     $self->{column}++;
5052     $self->{nc}
5053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5054     } else {
5055     $self->{set_nc}->($self);
5056     }
5057    
5058     #
5059     } else {
5060    
5061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5062     line => $self->{line},
5063     column => $self->{column});
5064     ## Reconsume.
5065     #
5066     }
5067    
5068 wakaba 1.12 my $code = $self->{kwd};
5069 wakaba 1.1 my $l = $self->{line_prev};
5070     my $c = $self->{column_prev};
5071 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5072     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5073     ($self->{is_xml} and $code == 0x0000)) {
5074 wakaba 1.1
5075     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5076     text => (sprintf 'U+%04X', $code),
5077     line => $l, column => $c);
5078     $code = $charref_map->{$code};
5079     } elsif ($code > 0x10FFFF) {
5080    
5081     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5082     text => (sprintf 'U-%08X', $code),
5083     line => $l, column => $c);
5084     $code = 0xFFFD;
5085     }
5086    
5087     if ($self->{prev_state} == DATA_STATE) {
5088    
5089     $self->{state} = $self->{prev_state};
5090 wakaba 1.5 $self->{s_kwd} = '';
5091 wakaba 1.1 ## Reconsume.
5092     return ({type => CHARACTER_TOKEN, data => chr $code,
5093 wakaba 1.7 has_reference => 1,
5094 wakaba 1.1 line => $l, column => $c,
5095     });
5096     redo A;
5097     } else {
5098    
5099     $self->{ca}->{value} .= chr $code;
5100     $self->{ca}->{has_reference} = 1;
5101     $self->{state} = $self->{prev_state};
5102 wakaba 1.5 $self->{s_kwd} = '';
5103 wakaba 1.1 ## Reconsume.
5104     redo A;
5105     }
5106     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5107 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
5108     $self->{nc} <= 0x005A) or # x
5109     (0x0061 <= $self->{nc} and # a
5110     $self->{nc} <= 0x007A) or # z
5111     (0x0030 <= $self->{nc} and # 0
5112     $self->{nc} <= 0x0039) or # 9
5113 wakaba 1.22 $self->{nc} == 0x003B or # ;
5114     ($self->{is_xml} and
5115     not ($is_space->{$self->{nc}} or
5116     {
5117     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5118     $self->{entity_add} => 1,
5119     }->{$self->{nc}}))) {
5120 wakaba 1.1 our $EntityChar;
5121 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5122 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5123     $self->{ge}->{$self->{kwd}}) {
5124 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5125 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5126     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5127    
5128     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5129     } else {
5130     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5131    
5132     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5133     value => $self->{kwd});
5134     } else {
5135    
5136     }
5137     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5138     }
5139     } else {
5140     if ($self->{is_xml}) {
5141    
5142     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5143     value => $self->{kwd},
5144     level => {
5145     'amp;' => $self->{level}->{warn},
5146     'quot;' => $self->{level}->{warn},
5147     'lt;' => $self->{level}->{warn},
5148     'gt;' => $self->{level}->{warn},
5149     'apos;' => $self->{level}->{warn},
5150     }->{$self->{kwd}} ||
5151     $self->{level}->{must});
5152     } else {
5153    
5154     }
5155     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5156     }
5157 wakaba 1.1 $self->{entity__match} = 1;
5158    
5159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5160     $self->{line_prev} = $self->{line};
5161     $self->{column_prev} = $self->{column};
5162     $self->{column}++;
5163     $self->{nc}
5164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5165     } else {
5166     $self->{set_nc}->($self);
5167     }
5168    
5169     #
5170     } else {
5171    
5172 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5173 wakaba 1.1 $self->{entity__match} = -1;
5174     ## Stay in the state.
5175    
5176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5177     $self->{line_prev} = $self->{line};
5178     $self->{column_prev} = $self->{column};
5179     $self->{column}++;
5180     $self->{nc}
5181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5182     } else {
5183     $self->{set_nc}->($self);
5184     }
5185    
5186     redo A;
5187     }
5188     } else {
5189    
5190     $self->{entity__value} .= chr $self->{nc};
5191     $self->{entity__match} *= 2;
5192     ## Stay in the state.
5193    
5194     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5195     $self->{line_prev} = $self->{line};
5196     $self->{column_prev} = $self->{column};
5197     $self->{column}++;
5198     $self->{nc}
5199     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5200     } else {
5201     $self->{set_nc}->($self);
5202     }
5203    
5204     redo A;
5205     }
5206     }
5207    
5208     my $data;
5209     my $has_ref;
5210     if ($self->{entity__match} > 0) {
5211    
5212     $data = $self->{entity__value};
5213     $has_ref = 1;
5214     #
5215     } elsif ($self->{entity__match} < 0) {
5216     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5217     if ($self->{prev_state} != DATA_STATE and # in attribute
5218     $self->{entity__match} < -1) {
5219    
5220 wakaba 1.12 $data = '&' . $self->{kwd};
5221 wakaba 1.1 #
5222     } else {
5223    
5224     $data = $self->{entity__value};
5225     $has_ref = 1;
5226     #
5227     }
5228     } else {
5229    
5230     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5231     line => $self->{line_prev},
5232 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5233     $data = '&' . $self->{kwd};
5234 wakaba 1.1 #
5235     }
5236    
5237     ## NOTE: In these cases, when a character reference is found,
5238     ## it is consumed and a character token is returned, or, otherwise,
5239     ## nothing is consumed and returned, according to the spec algorithm.
5240     ## In this implementation, anything that has been examined by the
5241     ## tokenizer is appended to the parent element or the attribute value
5242     ## as a string (the literal string when there is no character reference,
5243     ## or the entity-replaced string otherwise) at this stage, since any characters
5244     ## that would not be consumed are appended in the data state or in an
5245     ## appropriate attribute value state anyway.
5246    
5247     if ($self->{prev_state} == DATA_STATE) {
5248    
5249     $self->{state} = $self->{prev_state};
5250 wakaba 1.5 $self->{s_kwd} = '';
5251 wakaba 1.1 ## Reconsume.
5252     return ({type => CHARACTER_TOKEN,
5253     data => $data,
5254 wakaba 1.7 has_reference => $has_ref,
5255 wakaba 1.1 line => $self->{line_prev},
5256 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5257 wakaba 1.1 });
5258     redo A;
5259     } else {
5260    
5261     $self->{ca}->{value} .= $data;
5262     $self->{ca}->{has_reference} = 1 if $has_ref;
5263     $self->{state} = $self->{prev_state};
5264 wakaba 1.5 $self->{s_kwd} = '';
5265 wakaba 1.1 ## Reconsume.
5266     redo A;
5267     }
5268 wakaba 1.8
5269     ## XML-only states
5270    
5271     } elsif ($self->{state} == PI_STATE) {
5272 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5273    
5274 wakaba 1.8 if ($is_space->{$self->{nc}} or
5275 wakaba 1.14 $self->{nc} == 0x003F or # ?
5276 wakaba 1.8 $self->{nc} == -1) {
5277 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5278     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5279     ## "DOCTYPE pi state": Parse error, switch to the "data
5280     ## state".
5281 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5282     line => $self->{line_prev},
5283     column => $self->{column_prev}
5284     - 1 * ($self->{nc} != -1));
5285     $self->{state} = BOGUS_COMMENT_STATE;
5286     ## Reconsume.
5287     $self->{ct} = {type => COMMENT_TOKEN,
5288     data => '?',
5289     line => $self->{line_prev},
5290     column => $self->{column_prev}
5291     - 1 * ($self->{nc} != -1),
5292     };
5293     redo A;
5294     } else {
5295 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5296 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5297     target => chr $self->{nc},
5298     data => '',
5299     line => $self->{line_prev},
5300     column => $self->{column_prev} - 1,
5301     };
5302     $self->{state} = PI_TARGET_STATE;
5303    
5304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5305     $self->{line_prev} = $self->{line};
5306     $self->{column_prev} = $self->{column};
5307     $self->{column}++;
5308     $self->{nc}
5309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5310     } else {
5311     $self->{set_nc}->($self);
5312     }
5313    
5314     redo A;
5315     }
5316     } elsif ($self->{state} == PI_TARGET_STATE) {
5317     if ($is_space->{$self->{nc}}) {
5318     $self->{state} = PI_TARGET_AFTER_STATE;
5319    
5320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5321     $self->{line_prev} = $self->{line};
5322     $self->{column_prev} = $self->{column};
5323     $self->{column}++;
5324     $self->{nc}
5325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5326     } else {
5327     $self->{set_nc}->($self);
5328     }
5329    
5330     redo A;
5331     } elsif ($self->{nc} == -1) {
5332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5333 wakaba 1.13 if ($self->{in_subset}) {
5334     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5335     } else {
5336     $self->{state} = DATA_STATE;
5337     $self->{s_kwd} = '';
5338     }
5339 wakaba 1.8 ## Reconsume.
5340     return ($self->{ct}); # pi
5341     redo A;
5342     } elsif ($self->{nc} == 0x003F) { # ?
5343     $self->{state} = PI_AFTER_STATE;
5344    
5345     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5346     $self->{line_prev} = $self->{line};
5347     $self->{column_prev} = $self->{column};
5348     $self->{column}++;
5349     $self->{nc}
5350     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5351     } else {
5352     $self->{set_nc}->($self);
5353     }
5354    
5355     redo A;
5356     } else {
5357     ## XML5: typo ("tag name" -> "target")
5358     $self->{ct}->{target} .= chr $self->{nc}; # pi
5359    
5360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5361     $self->{line_prev} = $self->{line};
5362     $self->{column_prev} = $self->{column};
5363     $self->{column}++;
5364     $self->{nc}
5365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5366     } else {
5367     $self->{set_nc}->($self);
5368     }
5369    
5370     redo A;
5371     }
5372     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5373     if ($is_space->{$self->{nc}}) {
5374     ## Stay in the state.
5375    
5376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5377     $self->{line_prev} = $self->{line};
5378     $self->{column_prev} = $self->{column};
5379     $self->{column}++;
5380     $self->{nc}
5381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5382     } else {
5383     $self->{set_nc}->($self);
5384     }
5385    
5386     redo A;
5387     } else {
5388     $self->{state} = PI_DATA_STATE;
5389     ## Reprocess.
5390     redo A;
5391     }
5392     } elsif ($self->{state} == PI_DATA_STATE) {
5393     if ($self->{nc} == 0x003F) { # ?
5394     $self->{state} = PI_DATA_AFTER_STATE;
5395    
5396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5397     $self->{line_prev} = $self->{line};
5398     $self->{column_prev} = $self->{column};
5399     $self->{column}++;
5400     $self->{nc}
5401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5402     } else {
5403     $self->{set_nc}->($self);
5404     }
5405    
5406     redo A;
5407     } elsif ($self->{nc} == -1) {
5408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5409 wakaba 1.13 if ($self->{in_subset}) {
5410 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5411 wakaba 1.13 } else {
5412     $self->{state} = DATA_STATE;
5413     $self->{s_kwd} = '';
5414     }
5415 wakaba 1.8 ## Reprocess.
5416     return ($self->{ct}); # pi
5417     redo A;
5418     } else {
5419     $self->{ct}->{data} .= chr $self->{nc}; # pi
5420     $self->{read_until}->($self->{ct}->{data}, q[?],
5421     length $self->{ct}->{data});
5422     ## Stay in the state.
5423    
5424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5425     $self->{line_prev} = $self->{line};
5426     $self->{column_prev} = $self->{column};
5427     $self->{column}++;
5428     $self->{nc}
5429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5430     } else {
5431     $self->{set_nc}->($self);
5432     }
5433    
5434     ## Reprocess.
5435     redo A;
5436     }
5437     } elsif ($self->{state} == PI_AFTER_STATE) {
5438 wakaba 1.14 ## XML5: Part of "Pi after state".
5439    
5440 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5441 wakaba 1.13 if ($self->{in_subset}) {
5442     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5443     } else {
5444     $self->{state} = DATA_STATE;
5445     $self->{s_kwd} = '';
5446     }
5447 wakaba 1.8
5448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5449     $self->{line_prev} = $self->{line};
5450     $self->{column_prev} = $self->{column};
5451     $self->{column}++;
5452     $self->{nc}
5453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5454     } else {
5455     $self->{set_nc}->($self);
5456     }
5457    
5458     return ($self->{ct}); # pi
5459     redo A;
5460     } elsif ($self->{nc} == 0x003F) { # ?
5461     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5462     line => $self->{line_prev},
5463     column => $self->{column_prev}); ## XML5: no error
5464     $self->{ct}->{data} .= '?';
5465     $self->{state} = PI_DATA_AFTER_STATE;
5466    
5467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5468     $self->{line_prev} = $self->{line};
5469     $self->{column_prev} = $self->{column};
5470     $self->{column}++;
5471     $self->{nc}
5472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5473     } else {
5474     $self->{set_nc}->($self);
5475     }
5476    
5477     redo A;
5478     } else {
5479     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5480     line => $self->{line_prev},
5481     column => $self->{column_prev}
5482     + 1 * ($self->{nc} == -1)); ## XML5: no error
5483     $self->{ct}->{data} .= '?'; ## XML5: not appended
5484     $self->{state} = PI_DATA_STATE;
5485     ## Reprocess.
5486     redo A;
5487     }
5488     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5489 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5490    
5491 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5492 wakaba 1.13 if ($self->{in_subset}) {
5493     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5494     } else {
5495     $self->{state} = DATA_STATE;
5496     $self->{s_kwd} = '';
5497     }
5498 wakaba 1.8
5499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5500     $self->{line_prev} = $self->{line};
5501     $self->{column_prev} = $self->{column};
5502     $self->{column}++;
5503     $self->{nc}
5504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5505     } else {
5506     $self->{set_nc}->($self);
5507     }
5508    
5509     return ($self->{ct}); # pi
5510     redo A;
5511     } elsif ($self->{nc} == 0x003F) { # ?
5512     $self->{ct}->{data} .= '?';
5513     ## Stay in the state.
5514    
5515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5516     $self->{line_prev} = $self->{line};
5517     $self->{column_prev} = $self->{column};
5518     $self->{column}++;
5519     $self->{nc}
5520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5521     } else {
5522     $self->{set_nc}->($self);
5523     }
5524    
5525     redo A;
5526     } else {
5527     $self->{ct}->{data} .= '?'; ## XML5: not appended
5528     $self->{state} = PI_DATA_STATE;
5529     ## Reprocess.
5530     redo A;
5531     }
5532 wakaba 1.12
5533     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5534     if ($self->{nc} == 0x003C) { # <
5535 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5536 wakaba 1.12
5537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5538     $self->{line_prev} = $self->{line};
5539     $self->{column_prev} = $self->{column};
5540     $self->{column}++;
5541     $self->{nc}
5542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5543     } else {
5544     $self->{set_nc}->($self);
5545     }
5546    
5547     redo A;
5548     } elsif ($self->{nc} == 0x0025) { # %
5549     ## XML5: Not defined yet.
5550    
5551     ## TODO:
5552 wakaba 1.24
5553     if (not $self->{stop_processing} and
5554     not $self->{document}->xml_standalone) {
5555     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5556     level => $self->{level}->{info});
5557     $self->{stop_processing} = 1;
5558     }
5559    
5560 wakaba 1.12
5561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5562     $self->{line_prev} = $self->{line};
5563     $self->{column_prev} = $self->{column};
5564     $self->{column}++;
5565     $self->{nc}
5566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5567     } else {
5568     $self->{set_nc}->($self);
5569     }
5570    
5571     redo A;
5572     } elsif ($self->{nc} == 0x005D) { # ]
5573 wakaba 1.13 delete $self->{in_subset};
5574 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5575    
5576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5577     $self->{line_prev} = $self->{line};
5578     $self->{column_prev} = $self->{column};
5579     $self->{column}++;
5580     $self->{nc}
5581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5582     } else {
5583     $self->{set_nc}->($self);
5584     }
5585    
5586     redo A;
5587     } elsif ($is_space->{$self->{nc}}) {
5588     ## Stay in the state.
5589    
5590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5591     $self->{line_prev} = $self->{line};
5592     $self->{column_prev} = $self->{column};
5593     $self->{column}++;
5594     $self->{nc}
5595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5596     } else {
5597     $self->{set_nc}->($self);
5598     }
5599    
5600     redo A;
5601     } elsif ($self->{nc} == -1) {
5602     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5603 wakaba 1.13 delete $self->{in_subset};
5604 wakaba 1.12 $self->{state} = DATA_STATE;
5605     $self->{s_kwd} = '';
5606     ## Reconsume.
5607 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5608 wakaba 1.12 redo A;
5609     } else {
5610     unless ($self->{internal_subset_tainted}) {
5611     ## XML5: No parse error.
5612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5613     $self->{internal_subset_tainted} = 1;
5614     }
5615     ## Stay in the state.
5616    
5617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5618     $self->{line_prev} = $self->{line};
5619     $self->{column_prev} = $self->{column};
5620     $self->{column}++;
5621     $self->{nc}
5622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5623     } else {
5624     $self->{set_nc}->($self);
5625     }
5626    
5627     redo A;
5628     }
5629     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5630     if ($self->{nc} == 0x003E) { # >
5631     $self->{state} = DATA_STATE;
5632     $self->{s_kwd} = '';
5633    
5634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5635     $self->{line_prev} = $self->{line};
5636     $self->{column_prev} = $self->{column};
5637     $self->{column}++;
5638     $self->{nc}
5639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5640     } else {
5641     $self->{set_nc}->($self);
5642     }
5643    
5644 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5645 wakaba 1.12 redo A;
5646     } elsif ($self->{nc} == -1) {
5647     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5648     $self->{state} = DATA_STATE;
5649     $self->{s_kwd} = '';
5650     ## Reconsume.
5651 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5652 wakaba 1.12 redo A;
5653     } else {
5654     ## XML5: No parse error and stay in the state.
5655     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5656    
5657 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5658    
5659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5660     $self->{line_prev} = $self->{line};
5661     $self->{column_prev} = $self->{column};
5662     $self->{column}++;
5663     $self->{nc}
5664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5665     } else {
5666     $self->{set_nc}->($self);
5667     }
5668    
5669     redo A;
5670     }
5671     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5672     if ($self->{nc} == 0x003E) { # >
5673     $self->{state} = DATA_STATE;
5674     $self->{s_kwd} = '';
5675    
5676     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5677     $self->{line_prev} = $self->{line};
5678     $self->{column_prev} = $self->{column};
5679     $self->{column}++;
5680     $self->{nc}
5681     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5682     } else {
5683     $self->{set_nc}->($self);
5684     }
5685    
5686     return ({type => END_OF_DOCTYPE_TOKEN});
5687     redo A;
5688     } elsif ($self->{nc} == -1) {
5689     $self->{state} = DATA_STATE;
5690     $self->{s_kwd} = '';
5691     ## Reconsume.
5692     return ({type => END_OF_DOCTYPE_TOKEN});
5693     redo A;
5694     } else {
5695     ## Stay in the state.
5696    
5697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5698     $self->{line_prev} = $self->{line};
5699     $self->{column_prev} = $self->{column};
5700     $self->{column}++;
5701     $self->{nc}
5702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5703     } else {
5704     $self->{set_nc}->($self);
5705     }
5706    
5707     redo A;
5708     }
5709     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5710     if ($self->{nc} == 0x0021) { # !
5711 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5712 wakaba 1.13
5713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5714     $self->{line_prev} = $self->{line};
5715     $self->{column_prev} = $self->{column};
5716     $self->{column}++;
5717     $self->{nc}
5718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5719     } else {
5720     $self->{set_nc}->($self);
5721     }
5722    
5723     redo A;
5724     } elsif ($self->{nc} == 0x003F) { # ?
5725     $self->{state} = PI_STATE;
5726    
5727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5728     $self->{line_prev} = $self->{line};
5729     $self->{column_prev} = $self->{column};
5730     $self->{column}++;
5731     $self->{nc}
5732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5733     } else {
5734     $self->{set_nc}->($self);
5735     }
5736    
5737     redo A;
5738     } elsif ($self->{nc} == -1) {
5739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5740     $self->{state} = DATA_STATE;
5741     $self->{s_kwd} = '';
5742     ## Reconsume.
5743     redo A;
5744     } else {
5745     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5746     line => $self->{line_prev},
5747     column => $self->{column_prev});
5748     $self->{state} = BOGUS_COMMENT_STATE;
5749     $self->{ct} = {type => COMMENT_TOKEN,
5750     data => '',
5751     }; ## NOTE: Will be discarded.
5752 wakaba 1.12
5753     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5754     $self->{line_prev} = $self->{line};
5755     $self->{column_prev} = $self->{column};
5756     $self->{column}++;
5757     $self->{nc}
5758     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5759     } else {
5760     $self->{set_nc}->($self);
5761     }
5762    
5763     redo A;
5764     }
5765 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5766     ## XML5: "DOCTYPE markup declaration state".
5767    
5768     if ($self->{nc} == 0x002D) { # -
5769     $self->{state} = MD_HYPHEN_STATE;
5770    
5771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5772     $self->{line_prev} = $self->{line};
5773     $self->{column_prev} = $self->{column};
5774     $self->{column}++;
5775     $self->{nc}
5776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5777     } else {
5778     $self->{set_nc}->($self);
5779     }
5780    
5781     redo A;
5782 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5783     $self->{nc} == 0x0065) { # e
5784 wakaba 1.14 $self->{state} = MD_E_STATE;
5785     $self->{kwd} = chr $self->{nc};
5786    
5787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5788     $self->{line_prev} = $self->{line};
5789     $self->{column_prev} = $self->{column};
5790     $self->{column}++;
5791     $self->{nc}
5792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5793     } else {
5794     $self->{set_nc}->($self);
5795     }
5796    
5797     redo A;
5798 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5799     $self->{nc} == 0x0061) { # a
5800 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5801     $self->{kwd} = chr $self->{nc};
5802    
5803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5804     $self->{line_prev} = $self->{line};
5805     $self->{column_prev} = $self->{column};
5806     $self->{column}++;
5807     $self->{nc}
5808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5809     } else {
5810     $self->{set_nc}->($self);
5811     }
5812    
5813     redo A;
5814 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5815     $self->{nc} == 0x006E) { # n
5816 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5817     $self->{kwd} = chr $self->{nc};
5818    
5819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5820     $self->{line_prev} = $self->{line};
5821     $self->{column_prev} = $self->{column};
5822     $self->{column}++;
5823     $self->{nc}
5824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5825     } else {
5826     $self->{set_nc}->($self);
5827     }
5828    
5829     redo A;
5830     } else {
5831     #
5832     }
5833    
5834     ## XML5: No parse error.
5835     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5836     line => $self->{line_prev},
5837     column => $self->{column_prev} - 1);
5838     ## Reconsume.
5839     $self->{state} = BOGUS_COMMENT_STATE;
5840     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5841     redo A;
5842     } elsif ($self->{state} == MD_E_STATE) {
5843 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5844     $self->{nc} == 0x006E) { # n
5845 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5846     $self->{kwd} .= chr $self->{nc};
5847    
5848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5849     $self->{line_prev} = $self->{line};
5850     $self->{column_prev} = $self->{column};
5851     $self->{column}++;
5852     $self->{nc}
5853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5854     } else {
5855     $self->{set_nc}->($self);
5856     }
5857    
5858     redo A;
5859 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5860     $self->{nc} == 0x006C) { # l
5861 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5862     $self->{state} = MD_ELEMENT_STATE;
5863     $self->{kwd} .= chr $self->{nc};
5864    
5865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5866     $self->{line_prev} = $self->{line};
5867     $self->{column_prev} = $self->{column};
5868     $self->{column}++;
5869     $self->{nc}
5870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5871     } else {
5872     $self->{set_nc}->($self);
5873     }
5874    
5875     redo A;
5876     } else {
5877     ## XML5: No parse error.
5878     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5879     line => $self->{line_prev},
5880     column => $self->{column_prev} - 2
5881     + 1 * ($self->{nc} == -1));
5882     ## Reconsume.
5883     $self->{state} = BOGUS_COMMENT_STATE;
5884     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5885     redo A;
5886     }
5887     } elsif ($self->{state} == MD_ENTITY_STATE) {
5888 wakaba 1.17 if ($self->{nc} == [
5889     undef,
5890     undef,
5891     0x0054, # T
5892     0x0049, # I
5893     0x0054, # T
5894     ]->[length $self->{kwd}] or
5895     $self->{nc} == [
5896     undef,
5897     undef,
5898     0x0074, # t
5899     0x0069, # i
5900     0x0074, # t
5901     ]->[length $self->{kwd}]) {
5902 wakaba 1.14 ## Stay in the state.
5903     $self->{kwd} .= chr $self->{nc};
5904    
5905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5906     $self->{line_prev} = $self->{line};
5907     $self->{column_prev} = $self->{column};
5908     $self->{column}++;
5909     $self->{nc}
5910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5911     } else {
5912     $self->{set_nc}->($self);
5913     }
5914    
5915     redo A;
5916 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5917     ($self->{nc} == 0x0059 or # Y
5918     $self->{nc} == 0x0079)) { # y
5919     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5921     text => 'ENTITY',
5922     line => $self->{line_prev},
5923     column => $self->{column_prev} - 4);
5924     }
5925     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5926 wakaba 1.14 line => $self->{line_prev},
5927     column => $self->{column_prev} - 6};
5928     $self->{state} = DOCTYPE_MD_STATE;
5929    
5930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5931     $self->{line_prev} = $self->{line};
5932     $self->{column_prev} = $self->{column};
5933     $self->{column}++;
5934     $self->{nc}
5935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5936     } else {
5937     $self->{set_nc}->($self);
5938     }
5939    
5940     redo A;
5941     } else {
5942     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5943     line => $self->{line_prev},
5944     column => $self->{column_prev} - 1
5945     - (length $self->{kwd})
5946     + 1 * ($self->{nc} == -1));
5947     $self->{state} = BOGUS_COMMENT_STATE;
5948     ## Reconsume.
5949     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5950     redo A;
5951     }
5952     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5953 wakaba 1.17 if ($self->{nc} == [
5954     undef,
5955     undef,
5956     0x0045, # E
5957     0x004D, # M
5958     0x0045, # E
5959     0x004E, # N
5960     ]->[length $self->{kwd}] or
5961     $self->{nc} == [
5962     undef,
5963     undef,
5964     0x0065, # e
5965     0x006D, # m
5966     0x0065, # e
5967     0x006E, # n
5968     ]->[length $self->{kwd}]) {
5969 wakaba 1.14 ## Stay in the state.
5970     $self->{kwd} .= chr $self->{nc};
5971    
5972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5973     $self->{line_prev} = $self->{line};
5974     $self->{column_prev} = $self->{column};
5975     $self->{column}++;
5976     $self->{nc}
5977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5978     } else {
5979     $self->{set_nc}->($self);
5980     }
5981    
5982     redo A;
5983 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5984     ($self->{nc} == 0x0054 or # T
5985     $self->{nc} == 0x0074)) { # t
5986     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5987     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5988     text => 'ELEMENT',
5989     line => $self->{line_prev},
5990     column => $self->{column_prev} - 5);
5991     }
5992 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5993     line => $self->{line_prev},
5994 wakaba 1.23 column => $self->{column_prev} - 7};
5995 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5996    
5997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5998     $self->{line_prev} = $self->{line};
5999     $self->{column_prev} = $self->{column};
6000     $self->{column}++;
6001     $self->{nc}
6002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6003     } else {
6004     $self->{set_nc}->($self);
6005     }
6006    
6007     redo A;
6008     } else {
6009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6010     line => $self->{line_prev},
6011     column => $self->{column_prev} - 1
6012     - (length $self->{kwd})
6013     + 1 * ($self->{nc} == -1));
6014     $self->{state} = BOGUS_COMMENT_STATE;
6015     ## Reconsume.
6016     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6017     redo A;
6018     }
6019     } elsif ($self->{state} == MD_ATTLIST_STATE) {
6020 wakaba 1.17 if ($self->{nc} == [
6021     undef,
6022     0x0054, # T
6023     0x0054, # T
6024     0x004C, # L
6025     0x0049, # I
6026     0x0053, # S
6027     ]->[length $self->{kwd}] or
6028     $self->{nc} == [
6029     undef,
6030     0x0074, # t
6031     0x0074, # t
6032     0x006C, # l
6033     0x0069, # i
6034     0x0073, # s
6035     ]->[length $self->{kwd}]) {
6036 wakaba 1.14 ## Stay in the state.
6037     $self->{kwd} .= chr $self->{nc};
6038    
6039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6040     $self->{line_prev} = $self->{line};
6041     $self->{column_prev} = $self->{column};
6042     $self->{column}++;
6043     $self->{nc}
6044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6045     } else {
6046     $self->{set_nc}->($self);
6047     }
6048    
6049     redo A;
6050 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6051     ($self->{nc} == 0x0054 or # T
6052     $self->{nc} == 0x0074)) { # t
6053     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6055     text => 'ATTLIST',
6056     line => $self->{line_prev},
6057     column => $self->{column_prev} - 5);
6058     }
6059 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6060 wakaba 1.15 attrdefs => [],
6061 wakaba 1.14 line => $self->{line_prev},
6062 wakaba 1.23 column => $self->{column_prev} - 7};
6063 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6064    
6065     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6066     $self->{line_prev} = $self->{line};
6067     $self->{column_prev} = $self->{column};
6068     $self->{column}++;
6069     $self->{nc}
6070     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6071     } else {
6072     $self->{set_nc}->($self);
6073     }
6074    
6075     redo A;
6076     } else {
6077     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6078     line => $self->{line_prev},
6079     column => $self->{column_prev} - 1
6080     - (length $self->{kwd})
6081     + 1 * ($self->{nc} == -1));
6082     $self->{state} = BOGUS_COMMENT_STATE;
6083     ## Reconsume.
6084     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6085     redo A;
6086     }
6087     } elsif ($self->{state} == MD_NOTATION_STATE) {
6088 wakaba 1.17 if ($self->{nc} == [
6089     undef,
6090     0x004F, # O
6091     0x0054, # T
6092     0x0041, # A
6093     0x0054, # T
6094     0x0049, # I
6095     0x004F, # O
6096     ]->[length $self->{kwd}] or
6097     $self->{nc} == [
6098     undef,
6099     0x006F, # o
6100     0x0074, # t
6101     0x0061, # a
6102     0x0074, # t
6103     0x0069, # i
6104     0x006F, # o
6105     ]->[length $self->{kwd}]) {
6106 wakaba 1.14 ## Stay in the state.
6107     $self->{kwd} .= chr $self->{nc};
6108    
6109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6110     $self->{line_prev} = $self->{line};
6111     $self->{column_prev} = $self->{column};
6112     $self->{column}++;
6113     $self->{nc}
6114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6115     } else {
6116     $self->{set_nc}->($self);
6117     }
6118    
6119     redo A;
6120 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6121     ($self->{nc} == 0x004E or # N
6122     $self->{nc} == 0x006E)) { # n
6123     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6125     text => 'NOTATION',
6126     line => $self->{line_prev},
6127     column => $self->{column_prev} - 6);
6128     }
6129 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6130     line => $self->{line_prev},
6131 wakaba 1.23 column => $self->{column_prev} - 8};
6132 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6133    
6134     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6135     $self->{line_prev} = $self->{line};
6136     $self->{column_prev} = $self->{column};
6137     $self->{column}++;
6138     $self->{nc}
6139     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6140     } else {
6141     $self->{set_nc}->($self);
6142     }
6143    
6144     redo A;
6145     } else {
6146     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6147     line => $self->{line_prev},
6148     column => $self->{column_prev} - 1
6149     - (length $self->{kwd})
6150     + 1 * ($self->{nc} == -1));
6151     $self->{state} = BOGUS_COMMENT_STATE;
6152     ## Reconsume.
6153     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6154     redo A;
6155     }
6156     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6157     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6158     ## "DOCTYPE NOTATION state".
6159    
6160     if ($is_space->{$self->{nc}}) {
6161     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6162     $self->{state} = BEFORE_MD_NAME_STATE;
6163    
6164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6165     $self->{line_prev} = $self->{line};
6166     $self->{column_prev} = $self->{column};
6167     $self->{column}++;
6168     $self->{nc}
6169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6170     } else {
6171     $self->{set_nc}->($self);
6172     }
6173    
6174     redo A;
6175     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6176     $self->{nc} == 0x0025) { # %
6177     ## XML5: Switch to the "DOCTYPE bogus comment state".
6178     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6179     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6180    
6181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6182     $self->{line_prev} = $self->{line};
6183     $self->{column_prev} = $self->{column};
6184     $self->{column}++;
6185     $self->{nc}
6186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6187     } else {
6188     $self->{set_nc}->($self);
6189     }
6190    
6191     redo A;
6192     } elsif ($self->{nc} == -1) {
6193     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6194     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6195     ## Reconsume.
6196     redo A;
6197     } elsif ($self->{nc} == 0x003E) { # >
6198     ## XML5: Switch to the "DOCTYPE bogus comment state".
6199     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6200     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6201    
6202     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6203     $self->{line_prev} = $self->{line};
6204     $self->{column_prev} = $self->{column};
6205     $self->{column}++;
6206     $self->{nc}
6207     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6208     } else {
6209     $self->{set_nc}->($self);
6210     }
6211    
6212     redo A;
6213     } else {
6214     ## XML5: Switch to the "DOCTYPE bogus comment state".
6215     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6216     $self->{state} = BEFORE_MD_NAME_STATE;
6217     redo A;
6218     }
6219     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6220     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6221     ## before state", "DOCTYPE ATTLIST name before state".
6222    
6223     if ($is_space->{$self->{nc}}) {
6224     ## Stay in the state.
6225    
6226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6227     $self->{line_prev} = $self->{line};
6228     $self->{column_prev} = $self->{column};
6229     $self->{column}++;
6230     $self->{nc}
6231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6232     } else {
6233     $self->{set_nc}->($self);
6234     }
6235    
6236     redo A;
6237     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6238     $self->{nc} == 0x0025) { # %
6239     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6240    
6241     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6242     $self->{line_prev} = $self->{line};
6243     $self->{column_prev} = $self->{column};
6244     $self->{column}++;
6245     $self->{nc}
6246     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6247     } else {
6248     $self->{set_nc}->($self);
6249     }
6250    
6251     redo A;
6252     } elsif ($self->{nc} == 0x003E) { # >
6253     ## XML5: Same as "Anything else".
6254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6256    
6257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6258     $self->{line_prev} = $self->{line};
6259     $self->{column_prev} = $self->{column};
6260     $self->{column}++;
6261     $self->{nc}
6262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6263     } else {
6264     $self->{set_nc}->($self);
6265     }
6266    
6267     redo A;
6268     } elsif ($self->{nc} == -1) {
6269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6270     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6271     ## Reconsume.
6272     redo A;
6273     } else {
6274     ## XML5: [ATTLIST] Not defined yet.
6275     $self->{ct}->{name} .= chr $self->{nc};
6276     $self->{state} = MD_NAME_STATE;
6277    
6278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6279     $self->{line_prev} = $self->{line};
6280     $self->{column_prev} = $self->{column};
6281     $self->{column}++;
6282     $self->{nc}
6283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6284     } else {
6285     $self->{set_nc}->($self);
6286     }
6287    
6288     redo A;
6289     }
6290     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6291     if ($is_space->{$self->{nc}}) {
6292     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6293     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6294     $self->{state} = BEFORE_MD_NAME_STATE;
6295 wakaba 1.8
6296 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6297     $self->{line_prev} = $self->{line};
6298     $self->{column_prev} = $self->{column};
6299     $self->{column}++;
6300     $self->{nc}
6301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6302     } else {
6303     $self->{set_nc}->($self);
6304     }
6305    
6306     redo A;
6307     } elsif ($self->{nc} == 0x003E) { # >
6308     ## XML5: Same as "Anything else".
6309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6310     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6311    
6312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6313     $self->{line_prev} = $self->{line};
6314     $self->{column_prev} = $self->{column};
6315     $self->{column}++;
6316     $self->{nc}
6317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6318     } else {
6319     $self->{set_nc}->($self);
6320     }
6321    
6322     redo A;
6323     } elsif ($self->{nc} == -1) {
6324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6325     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6326     ## Reconsume.
6327     redo A;
6328     } else {
6329     ## XML5: No parse error.
6330     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6331     $self->{state} = BOGUS_COMMENT_STATE;
6332     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6333     ## Reconsume.
6334     redo A;
6335     }
6336     } elsif ($self->{state} == MD_NAME_STATE) {
6337     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6338    
6339     if ($is_space->{$self->{nc}}) {
6340 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6341     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6342     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6343 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6344 wakaba 1.16 } else { # ENTITY/NOTATION
6345     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6346     }
6347 wakaba 1.14
6348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6349     $self->{line_prev} = $self->{line};
6350     $self->{column_prev} = $self->{column};
6351     $self->{column}++;
6352     $self->{nc}
6353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6354     } else {
6355     $self->{set_nc}->($self);
6356     }
6357    
6358     redo A;
6359     } elsif ($self->{nc} == 0x003E) { # >
6360     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6361     #
6362     } else {
6363 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6364 wakaba 1.14 }
6365     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6366    
6367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6368     $self->{line_prev} = $self->{line};
6369     $self->{column_prev} = $self->{column};
6370     $self->{column}++;
6371     $self->{nc}
6372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6373     } else {
6374     $self->{set_nc}->($self);
6375     }
6376    
6377     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6378     redo A;
6379     } elsif ($self->{nc} == -1) {
6380     ## XML5: [ATTLIST] No parse error.
6381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6382     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6383     ## Reconsume.
6384     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6385     redo A;
6386     } else {
6387     ## XML5: [ATTLIST] Not defined yet.
6388     $self->{ct}->{name} .= chr $self->{nc};
6389     ## Stay in the state.
6390    
6391     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6392     $self->{line_prev} = $self->{line};
6393     $self->{column_prev} = $self->{column};
6394     $self->{column}++;
6395     $self->{nc}
6396     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6397     } else {
6398     $self->{set_nc}->($self);
6399     }
6400    
6401     redo A;
6402     }
6403     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6404     if ($is_space->{$self->{nc}}) {
6405     ## Stay in the state.
6406    
6407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6408     $self->{line_prev} = $self->{line};
6409     $self->{column_prev} = $self->{column};
6410     $self->{column}++;
6411     $self->{nc}
6412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6413     } else {
6414     $self->{set_nc}->($self);
6415     }
6416    
6417     redo A;
6418     } elsif ($self->{nc} == 0x003E) { # >
6419     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6420    
6421     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6422     $self->{line_prev} = $self->{line};
6423     $self->{column_prev} = $self->{column};
6424     $self->{column}++;
6425     $self->{nc}
6426     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6427     } else {
6428     $self->{set_nc}->($self);
6429     }
6430    
6431     return ($self->{ct}); # ATTLIST
6432     redo A;
6433     } elsif ($self->{nc} == -1) {
6434     ## XML5: No parse error.
6435     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6436     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6437 wakaba 1.15 return ($self->{ct});
6438 wakaba 1.14 redo A;
6439     } else {
6440     ## XML5: Not defined yet.
6441 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6442     tokens => [],
6443     line => $self->{line}, column => $self->{column}};
6444     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6445    
6446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6447     $self->{line_prev} = $self->{line};
6448     $self->{column_prev} = $self->{column};
6449     $self->{column}++;
6450     $self->{nc}
6451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6452     } else {
6453     $self->{set_nc}->($self);
6454     }
6455    
6456     redo A;
6457     }
6458     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6459     if ($is_space->{$self->{nc}}) {
6460     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6461    
6462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6463     $self->{line_prev} = $self->{line};
6464     $self->{column_prev} = $self->{column};
6465     $self->{column}++;
6466     $self->{nc}
6467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6468     } else {
6469     $self->{set_nc}->($self);
6470     }
6471    
6472     redo A;
6473     } elsif ($self->{nc} == 0x003E) { # >
6474     ## XML5: Same as "anything else".
6475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6476     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6477    
6478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6479     $self->{line_prev} = $self->{line};
6480     $self->{column_prev} = $self->{column};
6481     $self->{column}++;
6482     $self->{nc}
6483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6484     } else {
6485     $self->{set_nc}->($self);
6486     }
6487    
6488     return ($self->{ct}); # ATTLIST
6489     redo A;
6490     } elsif ($self->{nc} == 0x0028) { # (
6491     ## XML5: Same as "anything else".
6492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6493     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6494    
6495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6496     $self->{line_prev} = $self->{line};
6497     $self->{column_prev} = $self->{column};
6498     $self->{column}++;
6499     $self->{nc}
6500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6501     } else {
6502     $self->{set_nc}->($self);
6503     }
6504    
6505     redo A;
6506     } elsif ($self->{nc} == -1) {
6507     ## XML5: No parse error.
6508     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6509     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6510    
6511     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6512     $self->{line_prev} = $self->{line};
6513     $self->{column_prev} = $self->{column};
6514     $self->{column}++;
6515     $self->{nc}
6516     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6517     } else {
6518     $self->{set_nc}->($self);
6519     }
6520    
6521     return ($self->{ct}); # ATTLIST
6522     redo A;
6523     } else {
6524     ## XML5: Not defined yet.
6525     $self->{ca}->{name} .= chr $self->{nc};
6526     ## Stay in the state.
6527    
6528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6529     $self->{line_prev} = $self->{line};
6530     $self->{column_prev} = $self->{column};
6531     $self->{column}++;
6532     $self->{nc}
6533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6534     } else {
6535     $self->{set_nc}->($self);
6536     }
6537    
6538 wakaba 1.14 redo A;
6539     }
6540 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6541     if ($is_space->{$self->{nc}}) {
6542     ## Stay in the state.
6543    
6544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6545     $self->{line_prev} = $self->{line};
6546     $self->{column_prev} = $self->{column};
6547     $self->{column}++;
6548     $self->{nc}
6549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6550     } else {
6551     $self->{set_nc}->($self);
6552     }
6553    
6554     redo A;
6555     } elsif ($self->{nc} == 0x003E) { # >
6556     ## XML5: Same as "anything else".
6557     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6559    
6560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6561     $self->{line_prev} = $self->{line};
6562     $self->{column_prev} = $self->{column};
6563     $self->{column}++;
6564     $self->{nc}
6565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6566     } else {
6567     $self->{set_nc}->($self);
6568     }
6569    
6570     return ($self->{ct}); # ATTLIST
6571     redo A;
6572     } elsif ($self->{nc} == 0x0028) { # (
6573     ## XML5: Same as "anything else".
6574     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6575    
6576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6577     $self->{line_prev} = $self->{line};
6578     $self->{column_prev} = $self->{column};
6579     $self->{column}++;
6580     $self->{nc}
6581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6582     } else {
6583     $self->{set_nc}->($self);
6584     }
6585    
6586     redo A;
6587     } elsif ($self->{nc} == -1) {
6588     ## XML5: No parse error.
6589     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6590     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6591    
6592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6593     $self->{line_prev} = $self->{line};
6594     $self->{column_prev} = $self->{column};
6595     $self->{column}++;
6596     $self->{nc}
6597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6598     } else {
6599     $self->{set_nc}->($self);
6600     }
6601    
6602     return ($self->{ct});
6603     redo A;
6604     } else {
6605     ## XML5: Not defined yet.
6606     $self->{ca}->{type} = chr $self->{nc};
6607     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6608    
6609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6610     $self->{line_prev} = $self->{line};
6611     $self->{column_prev} = $self->{column};
6612     $self->{column}++;
6613     $self->{nc}
6614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6615     } else {
6616     $self->{set_nc}->($self);
6617     }
6618    
6619     redo A;
6620     }
6621     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6622     if ($is_space->{$self->{nc}}) {
6623     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6624    
6625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6626     $self->{line_prev} = $self->{line};
6627     $self->{column_prev} = $self->{column};
6628     $self->{column}++;
6629     $self->{nc}
6630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6631     } else {
6632     $self->{set_nc}->($self);
6633     }
6634    
6635     redo A;
6636     } elsif ($self->{nc} == 0x0023) { # #
6637     ## XML5: Same as "anything else".
6638     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6639     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6640    
6641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6642     $self->{line_prev} = $self->{line};
6643     $self->{column_prev} = $self->{column};
6644     $self->{column}++;
6645     $self->{nc}
6646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6647     } else {
6648     $self->{set_nc}->($self);
6649     }
6650    
6651     redo A;
6652     } elsif ($self->{nc} == 0x0022) { # "
6653     ## XML5: Same as "anything else".
6654     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6655     $self->{ca}->{value} = '';
6656     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6657    
6658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6659     $self->{line_prev} = $self->{line};
6660     $self->{column_prev} = $self->{column};
6661     $self->{column}++;
6662     $self->{nc}
6663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6664     } else {
6665     $self->{set_nc}->($self);
6666     }
6667    
6668     redo A;
6669     } elsif ($self->{nc} == 0x0027) { # '
6670     ## XML5: Same as "anything else".
6671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6672     $self->{ca}->{value} = '';
6673     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6674    
6675     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6676     $self->{line_prev} = $self->{line};
6677     $self->{column_prev} = $self->{column};
6678     $self->{column}++;
6679     $self->{nc}
6680     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6681     } else {
6682     $self->{set_nc}->($self);
6683     }
6684    
6685     redo A;
6686     } elsif ($self->{nc} == 0x003E) { # >
6687     ## XML5: Same as "anything else".
6688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6689     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6690    
6691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6692     $self->{line_prev} = $self->{line};
6693     $self->{column_prev} = $self->{column};
6694     $self->{column}++;
6695     $self->{nc}
6696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6697     } else {
6698     $self->{set_nc}->($self);
6699     }
6700    
6701     return ($self->{ct}); # ATTLIST
6702     redo A;
6703     } elsif ($self->{nc} == 0x0028) { # (
6704     ## XML5: Same as "anything else".
6705     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6706     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6707    
6708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6709     $self->{line_prev} = $self->{line};
6710     $self->{column_prev} = $self->{column};
6711     $self->{column}++;
6712     $self->{nc}
6713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6714     } else {
6715     $self->{set_nc}->($self);
6716     }
6717    
6718     redo A;
6719     } elsif ($self->{nc} == -1) {
6720     ## XML5: No parse error.
6721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6722     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6723    
6724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6725     $self->{line_prev} = $self->{line};
6726     $self->{column_prev} = $self->{column};
6727     $self->{column}++;
6728     $self->{nc}
6729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6730     } else {
6731     $self->{set_nc}->($self);
6732     }
6733    
6734     return ($self->{ct});
6735     redo A;
6736     } else {
6737     ## XML5: Not defined yet.
6738     $self->{ca}->{type} .= chr $self->{nc};
6739     ## Stay in the state.
6740    
6741     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6742     $self->{line_prev} = $self->{line};
6743     $self->{column_prev} = $self->{column};
6744     $self->{column}++;
6745     $self->{nc}
6746     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6747     } else {
6748     $self->{set_nc}->($self);
6749     }
6750    
6751     redo A;
6752     }
6753     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6754     if ($is_space->{$self->{nc}}) {
6755     ## Stay in the state.
6756    
6757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6758     $self->{line_prev} = $self->{line};
6759     $self->{column_prev} = $self->{column};
6760     $self->{column}++;
6761     $self->{nc}
6762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6763     } else {
6764     $self->{set_nc}->($self);
6765     }
6766    
6767     redo A;
6768     } elsif ($self->{nc} == 0x0028) { # (
6769     ## XML5: Same as "anything else".
6770     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6771    
6772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6773     $self->{line_prev} = $self->{line};
6774     $self->{column_prev} = $self->{column};
6775     $self->{column}++;
6776     $self->{nc}
6777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6778     } else {
6779     $self->{set_nc}->($self);
6780     }
6781    
6782     redo A;
6783     } elsif ($self->{nc} == 0x0023) { # #
6784     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6785    
6786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6787     $self->{line_prev} = $self->{line};
6788     $self->{column_prev} = $self->{column};
6789     $self->{column}++;
6790     $self->{nc}
6791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6792     } else {
6793     $self->{set_nc}->($self);
6794     }
6795    
6796     redo A;
6797     } elsif ($self->{nc} == 0x0022) { # "
6798     ## XML5: Same as "anything else".
6799     $self->{ca}->{value} = '';
6800     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6801    
6802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6803     $self->{line_prev} = $self->{line};
6804     $self->{column_prev} = $self->{column};
6805     $self->{column}++;
6806     $self->{nc}
6807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6808     } else {
6809     $self->{set_nc}->($self);
6810     }
6811    
6812     redo A;
6813     } elsif ($self->{nc} == 0x0027) { # '
6814     ## XML5: Same as "anything else".
6815     $self->{ca}->{value} = '';
6816     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6817    
6818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6819     $self->{line_prev} = $self->{line};
6820     $self->{column_prev} = $self->{column};
6821     $self->{column}++;
6822     $self->{nc}
6823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6824     } else {
6825     $self->{set_nc}->($self);
6826     }
6827    
6828     redo A;
6829     } elsif ($self->{nc} == 0x003E) { # >
6830     ## XML5: Same as "anything else".
6831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6833    
6834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6835     $self->{line_prev} = $self->{line};
6836     $self->{column_prev} = $self->{column};
6837     $self->{column}++;
6838     $self->{nc}
6839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6840     } else {
6841     $self->{set_nc}->($self);
6842     }
6843    
6844     return ($self->{ct}); # ATTLIST
6845     redo A;
6846     } elsif ($self->{nc} == -1) {
6847     ## XML5: No parse error.
6848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6850    
6851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6852     $self->{line_prev} = $self->{line};
6853     $self->{column_prev} = $self->{column};
6854     $self->{column}++;
6855     $self->{nc}
6856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6857     } else {
6858     $self->{set_nc}->($self);
6859     }
6860    
6861     return ($self->{ct});
6862     redo A;
6863     } else {
6864     ## XML5: Switch to the "DOCTYPE bogus comment state".
6865     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6866     $self->{ca}->{value} = '';
6867     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6868     ## Reconsume.
6869     redo A;
6870     }
6871     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6872     if ($is_space->{$self->{nc}}) {
6873     ## Stay in the state.
6874    
6875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876     $self->{line_prev} = $self->{line};
6877     $self->{column_prev} = $self->{column};
6878     $self->{column}++;
6879     $self->{nc}
6880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881     } else {
6882     $self->{set_nc}->($self);
6883     }
6884    
6885     redo A;
6886     } elsif ($self->{nc} == 0x007C) { # |
6887     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6888     ## Stay in the state.
6889    
6890     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6891     $self->{line_prev} = $self->{line};
6892     $self->{column_prev} = $self->{column};
6893     $self->{column}++;
6894     $self->{nc}
6895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6896     } else {
6897     $self->{set_nc}->($self);
6898     }
6899    
6900     redo A;
6901     } elsif ($self->{nc} == 0x0029) { # )
6902     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6903     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6904    
6905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6906     $self->{line_prev} = $self->{line};
6907     $self->{column_prev} = $self->{column};
6908     $self->{column}++;
6909     $self->{nc}
6910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6911     } else {
6912     $self->{set_nc}->($self);
6913     }
6914    
6915     redo A;
6916     } elsif ($self->{nc} == 0x003E) { # >
6917     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6918     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6919    
6920     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6921     $self->{line_prev} = $self->{line};
6922     $self->{column_prev} = $self->{column};
6923     $self->{column}++;
6924     $self->{nc}
6925     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6926     } else {
6927     $self->{set_nc}->($self);
6928     }
6929    
6930     return ($self->{ct}); # ATTLIST
6931     redo A;
6932     } elsif ($self->{nc} == -1) {
6933     ## XML5: No parse error.
6934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6935     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6936    
6937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6938     $self->{line_prev} = $self->{line};
6939     $self->{column_prev} = $self->{column};
6940     $self->{column}++;
6941     $self->{nc}
6942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6943     } else {
6944     $self->{set_nc}->($self);
6945     }
6946    
6947     return ($self->{ct});
6948     redo A;
6949     } else {
6950     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6951     $self->{state} = ALLOWED_TOKEN_STATE;
6952    
6953     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6954     $self->{line_prev} = $self->{line};
6955     $self->{column_prev} = $self->{column};
6956     $self->{column}++;
6957     $self->{nc}
6958     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6959     } else {
6960     $self->{set_nc}->($self);
6961     }
6962    
6963     redo A;
6964     }
6965     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6966     if ($is_space->{$self->{nc}}) {
6967     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6968    
6969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6970     $self->{line_prev} = $self->{line};
6971     $self->{column_prev} = $self->{column};
6972     $self->{column}++;
6973     $self->{nc}
6974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6975     } else {
6976     $self->{set_nc}->($self);
6977     }
6978    
6979     redo A;
6980     } elsif ($self->{nc} == 0x007C) { # |
6981     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6982    
6983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6984     $self->{line_prev} = $self->{line};
6985     $self->{column_prev} = $self->{column};
6986     $self->{column}++;
6987     $self->{nc}
6988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6989     } else {
6990     $self->{set_nc}->($self);
6991     }
6992    
6993     redo A;
6994     } elsif ($self->{nc} == 0x0029) { # )
6995     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6996    
6997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6998     $self->{line_prev} = $self->{line};
6999     $self->{column_prev} = $self->{column};
7000     $self->{column}++;
7001     $self->{nc}
7002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7003     } else {
7004     $self->{set_nc}->($self);
7005     }
7006    
7007     redo A;
7008     } elsif ($self->{nc} == 0x003E) { # >
7009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7010     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7011    
7012     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7013     $self->{line_prev} = $self->{line};
7014     $self->{column_prev} = $self->{column};
7015     $self->{column}++;
7016     $self->{nc}
7017     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7018     } else {
7019     $self->{set_nc}->($self);
7020     }
7021    
7022     return ($self->{ct}); # ATTLIST
7023     redo A;
7024     } elsif ($self->{nc} == -1) {
7025     ## XML5: No parse error.
7026     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7027     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7028    
7029     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7030     $self->{line_prev} = $self->{line};
7031     $self->{column_prev} = $self->{column};
7032     $self->{column}++;
7033     $self->{nc}
7034     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7035     } else {
7036     $self->{set_nc}->($self);
7037     }
7038    
7039     return ($self->{ct});
7040     redo A;
7041     } else {
7042     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7043     ## Stay in the state.
7044    
7045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7046     $self->{line_prev} = $self->{line};
7047     $self->{column_prev} = $self->{column};
7048     $self->{column}++;
7049     $self->{nc}
7050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7051     } else {
7052     $self->{set_nc}->($self);
7053     }
7054    
7055     redo A;
7056     }
7057     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7058     if ($is_space->{$self->{nc}}) {
7059     ## Stay in the state.
7060    
7061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7062     $self->{line_prev} = $self->{line};
7063     $self->{column_prev} = $self->{column};
7064     $self->{column}++;
7065     $self->{nc}
7066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7067     } else {
7068     $self->{set_nc}->($self);
7069     }
7070    
7071     redo A;
7072     } elsif ($self->{nc} == 0x007C) { # |
7073     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7074    
7075     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7076     $self->{line_prev} = $self->{line};
7077     $self->{column_prev} = $self->{column};
7078     $self->{column}++;
7079     $self->{nc}
7080     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7081     } else {
7082     $self->{set_nc}->($self);
7083     }
7084    
7085     redo A;
7086     } elsif ($self->{nc} == 0x0029) { # )
7087     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7088    
7089     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7090     $self->{line_prev} = $self->{line};
7091     $self->{column_prev} = $self->{column};
7092     $self->{column}++;
7093     $self->{nc}
7094     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7095     } else {
7096     $self->{set_nc}->($self);
7097     }
7098    
7099     redo A;
7100     } elsif ($self->{nc} == 0x003E) { # >
7101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7102     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7103    
7104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7105     $self->{line_prev} = $self->{line};
7106     $self->{column_prev} = $self->{column};
7107     $self->{column}++;
7108     $self->{nc}
7109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7110     } else {
7111     $self->{set_nc}->($self);
7112     }
7113    
7114     return ($self->{ct}); # ATTLIST
7115     redo A;
7116     } elsif ($self->{nc} == -1) {
7117     ## XML5: No parse error.
7118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7119     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7120    
7121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122     $self->{line_prev} = $self->{line};
7123     $self->{column_prev} = $self->{column};
7124     $self->{column}++;
7125     $self->{nc}
7126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127     } else {
7128     $self->{set_nc}->($self);
7129     }
7130    
7131     return ($self->{ct});
7132     redo A;
7133     } else {
7134     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7135     line => $self->{line_prev},
7136     column => $self->{column_prev});
7137     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7138     $self->{state} = ALLOWED_TOKEN_STATE;
7139    
7140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7141     $self->{line_prev} = $self->{line};
7142     $self->{column_prev} = $self->{column};
7143     $self->{column}++;
7144     $self->{nc}
7145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7146     } else {
7147     $self->{set_nc}->($self);
7148     }
7149    
7150     redo A;
7151     }
7152     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7153     if ($is_space->{$self->{nc}}) {
7154     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7155    
7156     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7157     $self->{line_prev} = $self->{line};
7158     $self->{column_prev} = $self->{column};
7159     $self->{column}++;
7160     $self->{nc}
7161     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7162     } else {
7163     $self->{set_nc}->($self);
7164     }
7165    
7166     redo A;
7167     } elsif ($self->{nc} == 0x0023) { # #
7168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7169     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7170    
7171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7172     $self->{line_prev} = $self->{line};
7173     $self->{column_prev} = $self->{column};
7174     $self->{column}++;
7175     $self->{nc}
7176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7177     } else {
7178     $self->{set_nc}->($self);
7179     }
7180    
7181     redo A;
7182     } elsif ($self->{nc} == 0x0022) { # "
7183     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7184     $self->{ca}->{value} = '';
7185     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7186    
7187     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7188     $self->{line_prev} = $self->{line};
7189     $self->{column_prev} = $self->{column};
7190     $self->{column}++;
7191     $self->{nc}
7192     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7193     } else {
7194     $self->{set_nc}->($self);
7195     }
7196    
7197     redo A;
7198     } elsif ($self->{nc} == 0x0027) { # '
7199     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7200     $self->{ca}->{value} = '';
7201     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7202    
7203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7204     $self->{line_prev} = $self->{line};
7205     $self->{column_prev} = $self->{column};
7206     $self->{column}++;
7207     $self->{nc}
7208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7209     } else {
7210     $self->{set_nc}->($self);
7211     }
7212    
7213     redo A;
7214     } elsif ($self->{nc} == 0x003E) { # >
7215     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7216     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7217    
7218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7219     $self->{line_prev} = $self->{line};
7220     $self->{column_prev} = $self->{column};
7221     $self->{column}++;
7222     $self->{nc}
7223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7224     } else {
7225     $self->{set_nc}->($self);
7226     }
7227    
7228     return ($self->{ct}); # ATTLIST
7229     redo A;
7230     } elsif ($self->{nc} == -1) {
7231     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7232     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7233    
7234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7235     $self->{line_prev} = $self->{line};
7236     $self->{column_prev} = $self->{column};
7237     $self->{column}++;
7238     $self->{nc}
7239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7240     } else {
7241     $self->{set_nc}->($self);
7242     }
7243    
7244     return ($self->{ct});
7245     redo A;
7246     } else {
7247     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7248     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7249     ## Reconsume.
7250     redo A;
7251     }
7252     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7253     if ($is_space->{$self->{nc}}) {
7254     ## Stay in the state.
7255    
7256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7257     $self->{line_prev} = $self->{line};
7258     $self->{column_prev} = $self->{column};
7259     $self->{column}++;
7260     $self->{nc}
7261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7262     } else {
7263     $self->{set_nc}->($self);
7264     }
7265    
7266     redo A;
7267     } elsif ($self->{nc} == 0x0023) { # #
7268     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7269    
7270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271     $self->{line_prev} = $self->{line};
7272     $self->{column_prev} = $self->{column};
7273     $self->{column}++;
7274     $self->{nc}
7275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276     } else {
7277     $self->{set_nc}->($self);
7278     }
7279    
7280     redo A;
7281     } elsif ($self->{nc} == 0x0022) { # "
7282     $self->{ca}->{value} = '';
7283     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7284    
7285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7286     $self->{line_prev} = $self->{line};
7287     $self->{column_prev} = $self->{column};
7288     $self->{column}++;
7289     $self->{nc}
7290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7291     } else {
7292     $self->{set_nc}->($self);
7293     }
7294    
7295     redo A;
7296     } elsif ($self->{nc} == 0x0027) { # '
7297     $self->{ca}->{value} = '';
7298     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7299    
7300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7301     $self->{line_prev} = $self->{line};
7302     $self->{column_prev} = $self->{column};
7303     $self->{column}++;
7304     $self->{nc}
7305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7306     } else {
7307     $self->{set_nc}->($self);
7308     }
7309    
7310     redo A;
7311     } elsif ($self->{nc} == 0x003E) { # >
7312     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7313     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7314    
7315     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7316     $self->{line_prev} = $self->{line};
7317     $self->{column_prev} = $self->{column};
7318     $self->{column}++;
7319     $self->{nc}
7320     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7321     } else {
7322     $self->{set_nc}->($self);
7323     }
7324    
7325     return ($self->{ct}); # ATTLIST
7326     redo A;
7327     } elsif ($self->{nc} == -1) {
7328     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7329     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7330    
7331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7332     $self->{line_prev} = $self->{line};
7333     $self->{column_prev} = $self->{column};
7334     $self->{column}++;
7335     $self->{nc}
7336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7337     } else {
7338     $self->{set_nc}->($self);
7339     }
7340    
7341     return ($self->{ct});
7342     redo A;
7343     } else {
7344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7345     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7346     ## Reconsume.
7347     redo A;
7348     }
7349     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7350     if ($is_space->{$self->{nc}}) {
7351     ## XML5: No parse error.
7352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7353 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7354 wakaba 1.15 ## Reconsume.
7355     redo A;
7356     } elsif ($self->{nc} == 0x0022) { # "
7357     ## XML5: Same as "anything else".
7358     $self->{ca}->{value} = '';
7359     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7360    
7361     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7362     $self->{line_prev} = $self->{line};
7363     $self->{column_prev} = $self->{column};
7364     $self->{column}++;
7365     $self->{nc}
7366     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7367     } else {
7368     $self->{set_nc}->($self);
7369     }
7370    
7371     redo A;
7372     } elsif ($self->{nc} == 0x0027) { # '
7373     ## XML5: Same as "anything else".
7374     $self->{ca}->{value} = '';
7375     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7376    
7377     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7378     $self->{line_prev} = $self->{line};
7379     $self->{column_prev} = $self->{column};
7380     $self->{column}++;
7381     $self->{nc}
7382     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7383     } else {
7384     $self->{set_nc}->($self);
7385     }
7386    
7387     redo A;
7388     } elsif ($self->{nc} == 0x003E) { # >
7389     ## XML5: Same as "anything else".
7390     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7392    
7393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7394     $self->{line_prev} = $self->{line};
7395     $self->{column_prev} = $self->{column};
7396     $self->{column}++;
7397     $self->{nc}
7398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7399     } else {
7400     $self->{set_nc}->($self);
7401     }
7402    
7403     return ($self->{ct}); # ATTLIST
7404     redo A;
7405     } elsif ($self->{nc} == -1) {
7406     ## XML5: No parse error.
7407     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7408     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7409    
7410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7411     $self->{line_prev} = $self->{line};
7412     $self->{column_prev} = $self->{column};
7413     $self->{column}++;
7414     $self->{nc}
7415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7416     } else {
7417     $self->{set_nc}->($self);
7418     }
7419    
7420     return ($self->{ct});
7421     redo A;
7422     } else {
7423     $self->{ca}->{default} = chr $self->{nc};
7424     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7425    
7426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7427     $self->{line_prev} = $self->{line};
7428     $self->{column_prev} = $self->{column};
7429     $self->{column}++;
7430     $self->{nc}
7431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7432     } else {
7433     $self->{set_nc}->($self);
7434     }
7435    
7436     redo A;
7437     }
7438     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7439     if ($is_space->{$self->{nc}}) {
7440     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7441    
7442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7443     $self->{line_prev} = $self->{line};
7444     $self->{column_prev} = $self->{column};
7445     $self->{column}++;
7446     $self->{nc}
7447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7448     } else {
7449     $self->{set_nc}->($self);
7450     }
7451    
7452     redo A;
7453     } elsif ($self->{nc} == 0x0022) { # "
7454     ## XML5: Same as "anything else".
7455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7456     $self->{ca}->{value} = '';
7457     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7458    
7459     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7460     $self->{line_prev} = $self->{line};
7461     $self->{column_prev} = $self->{column};
7462     $self->{column}++;
7463     $self->{nc}
7464     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7465     } else {
7466     $self->{set_nc}->($self);
7467     }
7468    
7469     redo A;
7470     } elsif ($self->{nc} == 0x0027) { # '
7471     ## XML5: Same as "anything else".
7472     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7473     $self->{ca}->{value} = '';
7474     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7475    
7476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7477     $self->{line_prev} = $self->{line};
7478     $self->{column_prev} = $self->{column};
7479     $self->{column}++;
7480     $self->{nc}
7481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7482     } else {
7483     $self->{set_nc}->($self);
7484     }
7485    
7486     redo A;
7487     } elsif ($self->{nc} == 0x003E) { # >
7488     ## XML5: Same as "anything else".
7489     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7490     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7491    
7492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493     $self->{line_prev} = $self->{line};
7494     $self->{column_prev} = $self->{column};
7495     $self->{column}++;
7496     $self->{nc}
7497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498     } else {
7499     $self->{set_nc}->($self);
7500     }
7501    
7502     return ($self->{ct}); # ATTLIST
7503     redo A;
7504     } elsif ($self->{nc} == -1) {
7505     ## XML5: No parse error.
7506     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7507     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7508     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7509    
7510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7511     $self->{line_prev} = $self->{line};
7512     $self->{column_prev} = $self->{column};
7513     $self->{column}++;
7514     $self->{nc}
7515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7516     } else {
7517     $self->{set_nc}->($self);
7518     }
7519    
7520     return ($self->{ct});
7521     redo A;
7522     } else {
7523     $self->{ca}->{default} .= chr $self->{nc};
7524     ## Stay in the state.
7525    
7526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7527     $self->{line_prev} = $self->{line};
7528     $self->{column_prev} = $self->{column};
7529     $self->{column}++;
7530     $self->{nc}
7531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7532     } else {
7533     $self->{set_nc}->($self);
7534     }
7535    
7536     redo A;
7537     }
7538     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7539     if ($is_space->{$self->{nc}}) {
7540     ## Stay in the state.
7541    
7542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7543     $self->{line_prev} = $self->{line};
7544     $self->{column_prev} = $self->{column};
7545     $self->{column}++;
7546     $self->{nc}
7547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7548     } else {
7549     $self->{set_nc}->($self);
7550     }
7551    
7552     redo A;
7553     } elsif ($self->{nc} == 0x0022) { # "
7554     $self->{ca}->{value} = '';
7555     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7556    
7557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7558     $self->{line_prev} = $self->{line};
7559     $self->{column_prev} = $self->{column};
7560     $self->{column}++;
7561     $self->{nc}
7562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7563     } else {
7564     $self->{set_nc}->($self);
7565     }
7566    
7567     redo A;
7568     } elsif ($self->{nc} == 0x0027) { # '
7569     $self->{ca}->{value} = '';
7570     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7571    
7572     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7573     $self->{line_prev} = $self->{line};
7574     $self->{column_prev} = $self->{column};
7575     $self->{column}++;
7576     $self->{nc}
7577     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7578     } else {
7579     $self->{set_nc}->($self);
7580     }
7581    
7582     redo A;
7583     } elsif ($self->{nc} == 0x003E) { # >
7584     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7585     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7586    
7587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7588     $self->{line_prev} = $self->{line};
7589     $self->{column_prev} = $self->{column};
7590     $self->{column}++;
7591     $self->{nc}
7592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7593     } else {
7594     $self->{set_nc}->($self);
7595     }
7596    
7597     return ($self->{ct}); # ATTLIST
7598     redo A;
7599     } elsif ($self->{nc} == -1) {
7600     ## XML5: No parse error.
7601     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7602     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7603     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7604    
7605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7606     $self->{line_prev} = $self->{line};
7607     $self->{column_prev} = $self->{column};
7608     $self->{column}++;
7609     $self->{nc}
7610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7611     } else {
7612     $self->{set_nc}->($self);
7613     }
7614    
7615     return ($self->{ct});
7616     redo A;
7617     } else {
7618     ## XML5: Not defined yet.
7619     if ($self->{ca}->{default} eq 'FIXED') {
7620     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7621     } else {
7622     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7623     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7624     }
7625     ## Reconsume.
7626     redo A;
7627     }
7628     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7629     if ($is_space->{$self->{nc}} or
7630     $self->{nc} == -1 or
7631     $self->{nc} == 0x003E) { # >
7632     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7633     ## Reconsume.
7634     redo A;
7635     } else {
7636     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7637     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7638     ## Reconsume.
7639     redo A;
7640 wakaba 1.16 }
7641 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7642     ## ASCII case-insensitive
7643     if ($self->{nc} == [
7644     undef,
7645     0x0044, # D
7646     0x0041, # A
7647     0x0054, # T
7648     ]->[length $self->{kwd}] or
7649     $self->{nc} == [
7650     undef,
7651     0x0064, # d
7652     0x0061, # a
7653     0x0074, # t
7654     ]->[length $self->{kwd}]) {
7655    
7656     ## Stay in the state.
7657     $self->{kwd} .= chr $self->{nc};
7658    
7659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7660     $self->{line_prev} = $self->{line};
7661     $self->{column_prev} = $self->{column};
7662     $self->{column}++;
7663     $self->{nc}
7664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7665     } else {
7666     $self->{set_nc}->($self);
7667     }
7668    
7669     redo A;
7670     } elsif ((length $self->{kwd}) == 4 and
7671     ($self->{nc} == 0x0041 or # A
7672     $self->{nc} == 0x0061)) { # a
7673     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7674    
7675     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7676     text => 'NDATA',
7677     line => $self->{line_prev},
7678     column => $self->{column_prev} - 4);
7679     } else {
7680    
7681     }
7682     $self->{state} = AFTER_NDATA_STATE;
7683    
7684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7685     $self->{line_prev} = $self->{line};
7686     $self->{column_prev} = $self->{column};
7687     $self->{column}++;
7688     $self->{nc}
7689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7690     } else {
7691     $self->{set_nc}->($self);
7692     }
7693    
7694     redo A;
7695     } else {
7696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7697     line => $self->{line_prev},
7698     column => $self->{column_prev} + 1
7699     - length $self->{kwd});
7700    
7701     $self->{state} = BOGUS_MD_STATE;
7702     ## Reconsume.
7703     redo A;
7704     }
7705     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7706     if ($is_space->{$self->{nc}}) {
7707     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7708    
7709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7710     $self->{line_prev} = $self->{line};
7711     $self->{column_prev} = $self->{column};
7712     $self->{column}++;
7713     $self->{nc}
7714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7715     } else {
7716     $self->{set_nc}->($self);
7717     }
7718    
7719     redo A;
7720     } elsif ($self->{nc} == 0x003E) { # >
7721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7722     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7723    
7724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725     $self->{line_prev} = $self->{line};
7726     $self->{column_prev} = $self->{column};
7727     $self->{column}++;
7728     $self->{nc}
7729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730     } else {
7731     $self->{set_nc}->($self);
7732     }
7733    
7734     return ($self->{ct}); # ENTITY
7735     redo A;
7736     } elsif ($self->{nc} == -1) {
7737     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7738     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7739    
7740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7741     $self->{line_prev} = $self->{line};
7742     $self->{column_prev} = $self->{column};
7743     $self->{column}++;
7744     $self->{nc}
7745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7746     } else {
7747     $self->{set_nc}->($self);
7748     }
7749    
7750     return ($self->{ct}); # ENTITY
7751     redo A;
7752     } else {
7753     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7754     line => $self->{line_prev},
7755     column => $self->{column_prev} + 1
7756     - length $self->{kwd});
7757     $self->{state} = BOGUS_MD_STATE;
7758     ## Reconsume.
7759     redo A;
7760     }
7761     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7762     if ($is_space->{$self->{nc}}) {
7763     ## Stay in the state.
7764    
7765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7766     $self->{line_prev} = $self->{line};
7767     $self->{column_prev} = $self->{column};
7768     $self->{column}++;
7769     $self->{nc}
7770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7771     } else {
7772     $self->{set_nc}->($self);
7773     }
7774    
7775     redo A;
7776     } elsif ($self->{nc} == 0x003E) { # >
7777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7778     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7779    
7780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7781     $self->{line_prev} = $self->{line};
7782     $self->{column_prev} = $self->{column};
7783     $self->{column}++;
7784     $self->{nc}
7785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7786     } else {
7787     $self->{set_nc}->($self);
7788     }
7789    
7790     return ($self->{ct}); # ENTITY
7791     redo A;
7792     } elsif ($self->{nc} == -1) {
7793     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7794     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7795    
7796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7797     $self->{line_prev} = $self->{line};
7798     $self->{column_prev} = $self->{column};
7799     $self->{column}++;
7800     $self->{nc}
7801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7802     } else {
7803     $self->{set_nc}->($self);
7804     }
7805    
7806     return ($self->{ct}); # ENTITY
7807     redo A;
7808     } else {
7809     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7810     $self->{state} = NOTATION_NAME_STATE;
7811    
7812     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7813     $self->{line_prev} = $self->{line};
7814     $self->{column_prev} = $self->{column};
7815     $self->{column}++;
7816     $self->{nc}
7817     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7818     } else {
7819     $self->{set_nc}->($self);
7820     }
7821    
7822     redo A;
7823     }
7824     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7825     if ($is_space->{$self->{nc}}) {
7826 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7827 wakaba 1.18
7828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7829     $self->{line_prev} = $self->{line};
7830     $self->{column_prev} = $self->{column};
7831     $self->{column}++;
7832     $self->{nc}
7833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7834     } else {
7835     $self->{set_nc}->($self);
7836     }
7837    
7838     redo A;
7839     } elsif ($self->{nc} == 0x003E) { # >
7840     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7841    
7842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7843     $self->{line_prev} = $self->{line};
7844     $self->{column_prev} = $self->{column};
7845     $self->{column}++;
7846     $self->{nc}
7847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7848     } else {
7849     $self->{set_nc}->($self);
7850     }
7851    
7852     return ($self->{ct}); # ENTITY
7853     redo A;
7854     } elsif ($self->{nc} == -1) {
7855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7857    
7858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7859     $self->{line_prev} = $self->{line};
7860     $self->{column_prev} = $self->{column};
7861     $self->{column}++;
7862     $self->{nc}
7863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7864     } else {
7865     $self->{set_nc}->($self);
7866     }
7867    
7868     return ($self->{ct}); # ENTITY
7869     redo A;
7870     } else {
7871     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7872     ## Stay in the state.
7873    
7874     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7875     $self->{line_prev} = $self->{line};
7876     $self->{column_prev} = $self->{column};
7877     $self->{column}++;
7878     $self->{nc}
7879     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7880     } else {
7881     $self->{set_nc}->($self);
7882     }
7883    
7884     redo A;
7885     }
7886 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7887     if ($self->{nc} == 0x0022) { # "
7888 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7889 wakaba 1.19
7890     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7891     $self->{line_prev} = $self->{line};
7892     $self->{column_prev} = $self->{column};
7893     $self->{column}++;
7894     $self->{nc}
7895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7896     } else {
7897     $self->{set_nc}->($self);
7898     }
7899    
7900     redo A;
7901     } elsif ($self->{nc} == 0x0026) { # &
7902     $self->{prev_state} = $self->{state};
7903     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7904     $self->{entity_add} = 0x0022; # "
7905    
7906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907     $self->{line_prev} = $self->{line};
7908     $self->{column_prev} = $self->{column};
7909     $self->{column}++;
7910     $self->{nc}
7911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912     } else {
7913     $self->{set_nc}->($self);
7914     }
7915    
7916     redo A;
7917     ## TODO: %
7918     } elsif ($self->{nc} == -1) {
7919     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7920     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7921     ## Reconsume.
7922     return ($self->{ct}); # ENTITY
7923     redo A;
7924     } else {
7925     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7926    
7927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7928     $self->{line_prev} = $self->{line};
7929     $self->{column_prev} = $self->{column};
7930     $self->{column}++;
7931     $self->{nc}
7932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7933     } else {
7934     $self->{set_nc}->($self);
7935     }
7936    
7937     redo A;
7938     }
7939     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7940     if ($self->{nc} == 0x0027) { # '
7941 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7942 wakaba 1.19
7943     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7944     $self->{line_prev} = $self->{line};
7945     $self->{column_prev} = $self->{column};
7946     $self->{column}++;
7947     $self->{nc}
7948     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7949     } else {
7950     $self->{set_nc}->($self);
7951     }
7952    
7953     redo A;
7954     } elsif ($self->{nc} == 0x0026) { # &
7955     $self->{prev_state} = $self->{state};
7956     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7957     $self->{entity_add} = 0x0027; # '
7958    
7959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7960     $self->{line_prev} = $self->{line};
7961     $self->{column_prev} = $self->{column};
7962     $self->{column}++;
7963     $self->{nc}
7964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7965     } else {
7966     $self->{set_nc}->($self);
7967     }
7968    
7969     redo A;
7970     ## TODO: %
7971     } elsif ($self->{nc} == -1) {
7972     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7973     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7974     ## Reconsume.
7975     return ($self->{ct}); # ENTITY
7976     redo A;
7977     } else {
7978     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7979    
7980     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7981     $self->{line_prev} = $self->{line};
7982     $self->{column_prev} = $self->{column};
7983     $self->{column}++;
7984     $self->{nc}
7985     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7986     } else {
7987     $self->{set_nc}->($self);
7988     }
7989    
7990     redo A;
7991     }
7992     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7993     if ($is_space->{$self->{nc}} or
7994     {
7995     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7996     $self->{entity_add} => 1,
7997     }->{$self->{nc}}) {
7998 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7999     line => $self->{line_prev},
8000     column => $self->{column_prev}
8001     + ($self->{nc} == -1 ? 1 : 0));
8002 wakaba 1.19 ## Don't consume
8003     ## Return nothing.
8004     #
8005     } elsif ($self->{nc} == 0x0023) { # #
8006     $self->{ca} = $self->{ct};
8007     $self->{state} = ENTITY_HASH_STATE;
8008     $self->{kwd} = '#';
8009    
8010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8011     $self->{line_prev} = $self->{line};
8012     $self->{column_prev} = $self->{column};
8013     $self->{column}++;
8014     $self->{nc}
8015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8016     } else {
8017     $self->{set_nc}->($self);
8018     }
8019    
8020     redo A;
8021     } else {
8022     #
8023     }
8024    
8025     $self->{ct}->{value} .= '&';
8026     $self->{state} = $self->{prev_state};
8027     ## Reconsume.
8028     redo A;
8029 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8030     if ($is_space->{$self->{nc}}) {
8031     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8032    
8033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8034     $self->{line_prev} = $self->{line};
8035     $self->{column_prev} = $self->{column};
8036     $self->{column}++;
8037     $self->{nc}
8038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8039     } else {
8040     $self->{set_nc}->($self);
8041     }
8042    
8043     redo A;
8044     } elsif ($self->{nc} == 0x0028) { # (
8045     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8046     $self->{ct}->{content} = ['('];
8047     $self->{group_depth} = 1;
8048    
8049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8050     $self->{line_prev} = $self->{line};
8051     $self->{column_prev} = $self->{column};
8052     $self->{column}++;
8053     $self->{nc}
8054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8055     } else {
8056     $self->{set_nc}->($self);
8057     }
8058    
8059     redo A;
8060     } elsif ($self->{nc} == 0x003E) { # >
8061     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8063    
8064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8065     $self->{line_prev} = $self->{line};
8066     $self->{column_prev} = $self->{column};
8067     $self->{column}++;
8068     $self->{nc}
8069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8070     } else {
8071     $self->{set_nc}->($self);
8072     }
8073    
8074     return ($self->{ct}); # ELEMENT
8075     redo A;
8076     } elsif ($self->{nc} == -1) {
8077     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8078     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8079    
8080     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8081     $self->{line_prev} = $self->{line};
8082     $self->{column_prev} = $self->{column};
8083     $self->{column}++;
8084     $self->{nc}
8085     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8086     } else {
8087     $self->{set_nc}->($self);
8088     }
8089    
8090     return ($self->{ct}); # ELEMENT
8091     redo A;
8092     } else {
8093     $self->{ct}->{content} = [chr $self->{nc}];
8094     $self->{state} = CONTENT_KEYWORD_STATE;
8095    
8096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8097     $self->{line_prev} = $self->{line};
8098     $self->{column_prev} = $self->{column};
8099     $self->{column}++;
8100     $self->{nc}
8101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8102     } else {
8103     $self->{set_nc}->($self);
8104     }
8105    
8106     redo A;
8107     }
8108     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8109     if ($is_space->{$self->{nc}}) {
8110     $self->{state} = AFTER_MD_DEF_STATE;
8111    
8112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8113     $self->{line_prev} = $self->{line};
8114     $self->{column_prev} = $self->{column};
8115     $self->{column}++;
8116     $self->{nc}
8117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8118     } else {
8119     $self->{set_nc}->($self);
8120     }
8121    
8122     redo A;
8123     } elsif ($self->{nc} == 0x003E) { # >
8124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8125    
8126     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127     $self->{line_prev} = $self->{line};
8128     $self->{column_prev} = $self->{column};
8129     $self->{column}++;
8130     $self->{nc}
8131     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132     } else {
8133     $self->{set_nc}->($self);
8134     }
8135    
8136     return ($self->{ct}); # ELEMENT
8137     redo A;
8138     } elsif ($self->{nc} == -1) {
8139     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141    
8142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8143     $self->{line_prev} = $self->{line};
8144     $self->{column_prev} = $self->{column};
8145     $self->{column}++;
8146     $self->{nc}
8147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8148     } else {
8149     $self->{set_nc}->($self);
8150     }
8151    
8152     return ($self->{ct}); # ELEMENT
8153     redo A;
8154     } else {
8155     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8156     ## Stay in the state.
8157    
8158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8159     $self->{line_prev} = $self->{line};
8160     $self->{column_prev} = $self->{column};
8161     $self->{column}++;
8162     $self->{nc}
8163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8164     } else {
8165     $self->{set_nc}->($self);
8166     }
8167    
8168     redo A;
8169     }
8170     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8171     if ($is_space->{$self->{nc}}) {
8172     ## Stay in the state.
8173    
8174     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8175     $self->{line_prev} = $self->{line};
8176     $self->{column_prev} = $self->{column};
8177     $self->{column}++;
8178     $self->{nc}
8179     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8180     } else {
8181     $self->{set_nc}->($self);
8182     }
8183    
8184     redo A;
8185     } elsif ($self->{nc} == 0x0028) { # (
8186     $self->{group_depth}++;
8187     push @{$self->{ct}->{content}}, chr $self->{nc};
8188     ## Stay in the state.
8189    
8190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8191     $self->{line_prev} = $self->{line};
8192     $self->{column_prev} = $self->{column};
8193     $self->{column}++;
8194     $self->{nc}
8195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8196     } else {
8197     $self->{set_nc}->($self);
8198     }
8199    
8200     redo A;
8201     } elsif ($self->{nc} == 0x007C or # |
8202     $self->{nc} == 0x002C) { # ,
8203     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8204     ## Stay in the state.
8205    
8206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8207     $self->{line_prev} = $self->{line};
8208     $self->{column_prev} = $self->{column};
8209     $self->{column}++;
8210     $self->{nc}
8211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8212     } else {
8213     $self->{set_nc}->($self);
8214     }
8215    
8216     redo A;
8217     } elsif ($self->{nc} == 0x0029) { # )
8218     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8219     push @{$self->{ct}->{content}}, chr $self->{nc};
8220     $self->{group_depth}--;
8221     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8222    
8223     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8224     $self->{line_prev} = $self->{line};
8225     $self->{column_prev} = $self->{column};
8226     $self->{column}++;
8227     $self->{nc}
8228     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8229     } else {
8230     $self->{set_nc}->($self);
8231     }
8232    
8233     redo A;
8234     } elsif ($self->{nc} == 0x003E) { # >
8235     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8236     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8237     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8238    
8239     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8240     $self->{line_prev} = $self->{line};
8241     $self->{column_prev} = $self->{column};
8242     $self->{column}++;
8243     $self->{nc}
8244     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8245     } else {
8246     $self->{set_nc}->($self);
8247     }
8248    
8249     return ($self->{ct}); # ELEMENT
8250     redo A;
8251     } elsif ($self->{nc} == -1) {
8252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8253     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8254     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8255    
8256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8257     $self->{line_prev} = $self->{line};
8258     $self->{column_prev} = $self->{column};
8259     $self->{column}++;
8260     $self->{nc}
8261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8262     } else {
8263     $self->{set_nc}->($self);
8264     }
8265    
8266     return ($self->{ct}); # ELEMENT
8267     redo A;
8268     } else {
8269     push @{$self->{ct}->{content}}, chr $self->{nc};
8270     $self->{state} = CM_ELEMENT_NAME_STATE;
8271    
8272     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8273     $self->{line_prev} = $self->{line};
8274     $self->{column_prev} = $self->{column};
8275     $self->{column}++;
8276     $self->{nc}
8277     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8278     } else {
8279     $self->{set_nc}->($self);
8280     }
8281    
8282     redo A;
8283     }
8284     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8285     if ($is_space->{$self->{nc}}) {
8286     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8287    
8288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8289     $self->{line_prev} = $self->{line};
8290     $self->{column_prev} = $self->{column};
8291     $self->{column}++;
8292     $self->{nc}
8293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8294     } else {
8295     $self->{set_nc}->($self);
8296     }
8297    
8298     redo A;
8299     } elsif ($self->{nc} == 0x002A or # *
8300     $self->{nc} == 0x002B or # +
8301     $self->{nc} == 0x003F) { # ?
8302     push @{$self->{ct}->{content}}, chr $self->{nc};
8303     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8304    
8305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8306     $self->{line_prev} = $self->{line};
8307     $self->{column_prev} = $self->{column};
8308     $self->{column}++;
8309     $self->{nc}
8310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8311     } else {
8312     $self->{set_nc}->($self);
8313     }
8314    
8315     redo A;
8316     } elsif ($self->{nc} == 0x007C or # |
8317     $self->{nc} == 0x002C) { # ,
8318     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8319     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8320    
8321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322     $self->{line_prev} = $self->{line};
8323     $self->{column_prev} = $self->{column};
8324     $self->{column}++;
8325     $self->{nc}
8326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327     } else {
8328     $self->{set_nc}->($self);
8329     }
8330    
8331     redo A;
8332     } elsif ($self->{nc} == 0x0029) { # )
8333     $self->{group_depth}--;
8334     push @{$self->{ct}->{content}}, chr $self->{nc};
8335     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8336    
8337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338     $self->{line_prev} = $self->{line};
8339     $self->{column_prev} = $self->{column};
8340     $self->{column}++;
8341     $self->{nc}
8342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343     } else {
8344     $self->{set_nc}->($self);
8345     }
8346    
8347     redo A;
8348     } elsif ($self->{nc} == 0x003E) { # >
8349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8350     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8351     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8352    
8353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8354     $self->{line_prev} = $self->{line};
8355     $self->{column_prev} = $self->{column};
8356     $self->{column}++;
8357     $self->{nc}
8358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8359     } else {
8360     $self->{set_nc}->($self);
8361     }
8362    
8363     return ($self->{ct}); # ELEMENT
8364     redo A;
8365     } elsif ($self->{nc} == -1) {
8366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8367     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369    
8370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371     $self->{line_prev} = $self->{line};
8372     $self->{column_prev} = $self->{column};
8373     $self->{column}++;
8374     $self->{nc}
8375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376     } else {
8377     $self->{set_nc}->($self);
8378     }
8379    
8380     return ($self->{ct}); # ELEMENT
8381     redo A;
8382     } else {
8383     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8384     ## Stay in the state.
8385    
8386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8387     $self->{line_prev} = $self->{line};
8388     $self->{column_prev} = $self->{column};
8389     $self->{column}++;
8390     $self->{nc}
8391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8392     } else {
8393     $self->{set_nc}->($self);
8394     }
8395    
8396     redo A;
8397     }
8398     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8399     if ($is_space->{$self->{nc}}) {
8400     ## Stay in the state.
8401    
8402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8403     $self->{line_prev} = $self->{line};
8404     $self->{column_prev} = $self->{column};
8405     $self->{column}++;
8406     $self->{nc}
8407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8408     } else {
8409     $self->{set_nc}->($self);
8410     }
8411    
8412     redo A;
8413     } elsif ($self->{nc} == 0x007C or # |
8414     $self->{nc} == 0x002C) { # ,
8415     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8416     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8417    
8418     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8419     $self->{line_prev} = $self->{line};
8420     $self->{column_prev} = $self->{column};
8421     $self->{column}++;
8422     $self->{nc}
8423     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8424     } else {
8425     $self->{set_nc}->($self);
8426     }
8427    
8428     redo A;
8429     } elsif ($self->{nc} == 0x0029) { # )
8430     $self->{group_depth}--;
8431     push @{$self->{ct}->{content}}, chr $self->{nc};
8432     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8433    
8434     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8435     $self->{line_prev} = $self->{line};
8436     $self->{column_prev} = $self->{column};
8437     $self->{column}++;
8438     $self->{nc}
8439     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8440     } else {
8441     $self->{set_nc}->($self);
8442     }
8443    
8444     redo A;
8445     } elsif ($self->{nc} == 0x003E) { # >
8446     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8447     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8448     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8449    
8450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8451     $self->{line_prev} = $self->{line};
8452     $self->{column_prev} = $self->{column};
8453     $self->{column}++;
8454     $self->{nc}
8455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8456     } else {
8457     $self->{set_nc}->($self);
8458     }
8459    
8460     return ($self->{ct}); # ELEMENT
8461     redo A;
8462     } elsif ($self->{nc} == -1) {
8463     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8464     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8465     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8466    
8467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8468     $self->{line_prev} = $self->{line};
8469     $self->{column_prev} = $self->{column};
8470     $self->{column}++;
8471     $self->{nc}
8472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8473     } else {
8474     $self->{set_nc}->($self);
8475     }
8476    
8477     return ($self->{ct}); # ELEMENT
8478     redo A;
8479     } else {
8480     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8481     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8482     $self->{state} = BOGUS_MD_STATE;
8483    
8484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8485     $self->{line_prev} = $self->{line};
8486     $self->{column_prev} = $self->{column};
8487     $self->{column}++;
8488     $self->{nc}
8489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8490     } else {
8491     $self->{set_nc}->($self);
8492     }
8493    
8494     redo A;
8495     }
8496     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8497     if ($is_space->{$self->{nc}}) {
8498     if ($self->{group_depth}) {
8499     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8500     } else {
8501     $self->{state} = AFTER_MD_DEF_STATE;
8502     }
8503    
8504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8505     $self->{line_prev} = $self->{line};
8506     $self->{column_prev} = $self->{column};
8507     $self->{column}++;
8508     $self->{nc}
8509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8510     } else {
8511     $self->{set_nc}->($self);
8512     }
8513    
8514     redo A;
8515     } elsif ($self->{nc} == 0x002A or # *
8516     $self->{nc} == 0x002B or # +
8517     $self->{nc} == 0x003F) { # ?
8518     push @{$self->{ct}->{content}}, chr $self->{nc};
8519     if ($self->{group_depth}) {
8520     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8521     } else {
8522     $self->{state} = AFTER_MD_DEF_STATE;
8523     }
8524    
8525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8526     $self->{line_prev} = $self->{line};
8527     $self->{column_prev} = $self->{column};
8528     $self->{column}++;
8529     $self->{nc}
8530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8531     } else {
8532     $self->{set_nc}->($self);
8533     }
8534    
8535     redo A;
8536     } elsif ($self->{nc} == 0x0029) { # )
8537     if ($self->{group_depth}) {
8538     $self->{group_depth}--;
8539     push @{$self->{ct}->{content}}, chr $self->{nc};
8540     ## Stay in the state.
8541    
8542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8543     $self->{line_prev} = $self->{line};
8544     $self->{column_prev} = $self->{column};
8545     $self->{column}++;
8546     $self->{nc}
8547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8548     } else {
8549     $self->{set_nc}->($self);
8550     }
8551    
8552     redo A;
8553     } else {
8554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8555     $self->{state} = BOGUS_MD_STATE;
8556     ## Reconsume.
8557     redo A;
8558     }
8559     } elsif ($self->{nc} == 0x003E) { # >
8560     if ($self->{group_depth}) {
8561     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8562     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8563     }
8564     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8565    
8566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8567     $self->{line_prev} = $self->{line};
8568     $self->{column_prev} = $self->{column};
8569     $self->{column}++;
8570     $self->{nc}
8571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8572     } else {
8573     $self->{set_nc}->($self);
8574     }
8575    
8576     return ($self->{ct}); # ELEMENT
8577     redo A;
8578     } elsif ($self->{nc} == -1) {
8579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8580     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8581     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8582    
8583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8584     $self->{line_prev} = $self->{line};
8585     $self->{column_prev} = $self->{column};
8586     $self->{column}++;
8587     $self->{nc}
8588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8589     } else {
8590     $self->{set_nc}->($self);
8591     }
8592    
8593     return ($self->{ct}); # ELEMENT
8594     redo A;
8595     } else {
8596     if ($self->{group_depth}) {
8597     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8598     } else {
8599     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8600     $self->{state} = BOGUS_MD_STATE;
8601     }
8602     ## Reconsume.
8603     redo A;
8604     }
8605     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8606 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8607     ## Stay in the state.
8608    
8609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8610     $self->{line_prev} = $self->{line};
8611     $self->{column_prev} = $self->{column};
8612     $self->{column}++;
8613     $self->{nc}
8614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8615     } else {
8616     $self->{set_nc}->($self);
8617     }
8618    
8619     redo A;
8620     } elsif ($self->{nc} == 0x003E) { # >
8621     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8622    
8623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8624     $self->{line_prev} = $self->{line};
8625     $self->{column_prev} = $self->{column};
8626     $self->{column}++;
8627     $self->{nc}
8628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8629     } else {
8630     $self->{set_nc}->($self);
8631     }
8632    
8633 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8634 wakaba 1.18 redo A;
8635     } elsif ($self->{nc} == -1) {
8636     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8637     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8638    
8639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8640     $self->{line_prev} = $self->{line};
8641     $self->{column_prev} = $self->{column};
8642     $self->{column}++;
8643     $self->{nc}
8644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8645     } else {
8646     $self->{set_nc}->($self);
8647     }
8648    
8649 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8650 wakaba 1.18 redo A;
8651     } else {
8652 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8653 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8654     ## Reconsume.
8655     redo A;
8656     }
8657 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8658     if ($self->{nc} == 0x003E) { # >
8659     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8660    
8661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8662     $self->{line_prev} = $self->{line};
8663     $self->{column_prev} = $self->{column};
8664     $self->{column}++;
8665     $self->{nc}
8666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8667     } else {
8668     $self->{set_nc}->($self);
8669     }
8670    
8671     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8672     redo A;
8673     } elsif ($self->{nc} == -1) {
8674     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8675     ## Reconsume.
8676     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8677     redo A;
8678     } else {
8679     ## Stay in the state.
8680    
8681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8682     $self->{line_prev} = $self->{line};
8683     $self->{column_prev} = $self->{column};
8684     $self->{column}++;
8685     $self->{nc}
8686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8687     } else {
8688     $self->{set_nc}->($self);
8689     }
8690    
8691     redo A;
8692     }
8693 wakaba 1.1 } else {
8694     die "$0: $self->{state}: Unknown state";
8695     }
8696     } # A
8697    
8698     die "$0: _get_next_token: unexpected case";
8699     } # _get_next_token
8700    
8701     1;
8702 wakaba 1.29 ## $Date: 2009/07/05 04:38:45 $
8703 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24