/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.27 - (hide annotations) (download)
Thu Jul 2 22:24:28 2009 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.26: +2 -6 lines
++ whatpm/Whatpm/HTML/ChangeLog	2 Jul 2009 22:24:21 -0000
	* Tokenizer.pm.src: Reduced a parse error (HTML5 revision 3194).

2009-07-03  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.27 our $VERSION=do{my @r=(q$Revision: 1.26 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Export setup.  Nothing is exported by default; every token type
## constant can be requested by name, or all of them at once through
## the |:token| tag.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names of the token type constants.  Both |@EXPORT_OK| and the
  ## |:token| tag are derived from this one list.
  my @token_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_names;
  our %EXPORT_TAGS = (token => [@token_names]);
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Numeric identifiers for the |type| field of a token.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose |$token->{tag_name}| is
71     ## set to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## Bit flags describing which kinds of markup are recognized in data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the flags above.  They
## are declared in a separate statement so that the flag constants
## already exist when these values are computed.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87    
88     ## Tokenizer states
89    
## States shared by the HTML and XML tokenizers.  The numbers 1 and 12
## are intentionally unused (see the commented-out entries).
use constant {
  DATA_STATE                                    => 0,
  #ENTITY_DATA_STATE                            => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE              => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                 => 44,
  ENTITY_HASH_STATE            => 45,
  NCR_NUM_STATE                => 46,
  HEXREF_X_STATE               => 47,
  HEXREF_HEX_STATE             => 48,
  ENTITY_NAME_STATE            => 49,
  PCDATA_STATE                 => 50, # "data state" in the spec
};

## XML-only states
use constant {
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
197 wakaba 1.8
198 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
199     ## list and descriptions)
200    
## Bit masks borrowed from the tree constructor.  Both literals denote
## the same numeric value (bit 11 set).
use constant {
  IN_FOREIGN_CONTENT_IM => 0b100000000000,
  FOREIGN_EL            => 0b1_00000000000,
};
203    
204     ## Character reference mappings
205    
## Substitution table for numeric character references: the key is the
## code point written in the reference, the value is the code point
## that replaces it.
my $charref_map = {
  0x0D => 0x000A,

  ## 0x80..0x9F: the targets coincide with the Windows-1252
  ## assignments; positions without an assignment become U+FFFD.
  0x80 => 0x20AC, 0x81 => 0xFFFD, 0x82 => 0x201A, 0x83 => 0x0192,
  0x84 => 0x201E, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021,
  0x88 => 0x02C6, 0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039,
  0x8C => 0x0152, 0x8D => 0xFFFD, 0x8E => 0x017D, 0x8F => 0xFFFD,
  0x90 => 0xFFFD, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
  0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
  0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
  0x9C => 0x0153, 0x9D => 0xFFFD, 0x9E => 0x017E, 0x9F => 0x0178,
}; # $charref_map

{
  ## Code points that are always replaced by U+FFFD.
  my @replaced = (
    0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
    0xD800..0xDFFF,
    0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
    ## The last two code points (U+xxFFFE, U+xxFFFF) of planes 0..16.
    (map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) }
         0x00..0x10),
  );
  @{$charref_map}{@replaced} = (0xFFFD) x @replaced;
}
249    
250     ## Implementations MUST act as if they used the state machine in the spec.
251    
## Puts the tokenizer into its initial condition and fetches the first
## input character.
##
## Argument: the parser object (a hash reference).  Its |set_nc| field
## must already hold a code reference; it is invoked once, with the
## object as its only argument.
##
## Returns the value of the last assignment (the fresh, empty
## pushed-back token list), exactly as the statement order implies.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to consume here; the first input character
  ## always has to be obtained through the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = []; # pushed-back tokens
  # $self->{escape}
} # _initialize_tokenizer
290    
291     ## A token has:
292     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
295     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 wakaba 1.11 ## ->{target} (PI_TOKEN)
297 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
298     ## ->{sysid} (DOCTYPE_TOKEN)
299     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301     ## ->{name}
302     ## ->{value}
303     ## ->{has_reference} == 1 or 0
304 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
305     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311     ## The token's own |->{self_closing}| is used to save the value of
312     ## |$self->{self_closing}| while the token is pushed back to the stack.
313    
314     ## Emitted token MUST immediately be handled by the tree construction state.
315    
316     ## Before each step, UA MAY check to see if either one of the scripts in
317     ## "list of scripts that will execute as soon as possible" or the first
318     ## script in the "list of scripts that will execute asynchronously",
319     ## has completed loading. If one has, then it MUST be executed
320     ## and removed from the list.
321    
322     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323     ## (This requirement was dropped from HTML5 spec, unfortunately.)
324    
## Lookup table of the code points treated as space characters by the
## tokenizer (key: code point, value: true).
my $is_space = {};
$is_space->{$_} = 1 for
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    #0x000B, # LINE TABULATION (VT) -- not a space character
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    #0x000D, # CARRIAGE RETURN (CR) -- not listed
    0x0020; # SPACE (SP)
333    
334     sub _get_next_token ($) {
335     my $self = shift;
336    
337     if ($self->{self_closing}) {
338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339     ## NOTE: The |self_closing| flag is only set by start tag token.
340     ## In addition, when a start tag token is emitted, it is always set to
341     ## |ct|.
342     delete $self->{self_closing};
343     }
344    
345     if (@{$self->{token}}) {
346     $self->{self_closing} = $self->{token}->[0]->{self_closing};
347     return shift @{$self->{token}};
348     }
349    
350     A: {
351     if ($self->{state} == PCDATA_STATE) {
352     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353    
354     if ($self->{nc} == 0x0026) { # &
355    
356     ## NOTE: In the spec, the tokenizer is switched to the
357     ## "entity data state". In this implementation, the tokenizer
358     ## is switched to the |ENTITY_STATE|, which is an implementation
359     ## of the "consume a character reference" algorithm.
360     $self->{entity_add} = -1;
361     $self->{prev_state} = DATA_STATE;
362     $self->{state} = ENTITY_STATE;
363    
364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365     $self->{line_prev} = $self->{line};
366     $self->{column_prev} = $self->{column};
367     $self->{column}++;
368     $self->{nc}
369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370     } else {
371     $self->{set_nc}->($self);
372     }
373    
374     redo A;
375     } elsif ($self->{nc} == 0x003C) { # <
376    
377     $self->{state} = TAG_OPEN_STATE;
378    
379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380     $self->{line_prev} = $self->{line};
381     $self->{column_prev} = $self->{column};
382     $self->{column}++;
383     $self->{nc}
384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385     } else {
386     $self->{set_nc}->($self);
387     }
388    
389     redo A;
390     } elsif ($self->{nc} == -1) {
391    
392     return ({type => END_OF_FILE_TOKEN,
393     line => $self->{line}, column => $self->{column}});
394     last A; ## TODO: ok?
395     } else {
396    
397     #
398     }
399    
400     # Anything else
401     my $token = {type => CHARACTER_TOKEN,
402     data => chr $self->{nc},
403     line => $self->{line}, column => $self->{column},
404     };
405     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406    
407     ## Stay in the state.
408    
409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410     $self->{line_prev} = $self->{line};
411     $self->{column_prev} = $self->{column};
412     $self->{column}++;
413     $self->{nc}
414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415     } else {
416     $self->{set_nc}->($self);
417     }
418    
419     return ($token);
420     redo A;
421     } elsif ($self->{state} == DATA_STATE) {
422     $self->{s_kwd} = '' unless defined $self->{s_kwd};
423     if ($self->{nc} == 0x0026) { # &
424     $self->{s_kwd} = '';
425     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426     not $self->{escape}) {
427    
428     ## NOTE: In the spec, the tokenizer is switched to the
429     ## "entity data state". In this implementation, the tokenizer
430     ## is switched to the |ENTITY_STATE|, which is an implementation
431     ## of the "consume a character reference" algorithm.
432     $self->{entity_add} = -1;
433     $self->{prev_state} = DATA_STATE;
434     $self->{state} = ENTITY_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     #
450     }
451     } elsif ($self->{nc} == 0x002D) { # -
452     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
454 wakaba 1.1
455     $self->{escape} = 1; # unless $self->{escape};
456     $self->{s_kwd} = '--';
457     #
458 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
459 wakaba 1.1
460     $self->{s_kwd} = '--';
461     #
462 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463    
464     $self->{s_kwd} .= '-';
465     #
466 wakaba 1.1 } else {
467    
468 wakaba 1.5 $self->{s_kwd} = '-';
469 wakaba 1.1 #
470     }
471     }
472    
473     #
474     } elsif ($self->{nc} == 0x0021) { # !
475     if (length $self->{s_kwd}) {
476    
477     $self->{s_kwd} .= '!';
478     #
479     } else {
480    
481     #$self->{s_kwd} = '';
482     #
483     }
484     #
485     } elsif ($self->{nc} == 0x003C) { # <
486     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488     not $self->{escape})) {
489    
490     $self->{state} = TAG_OPEN_STATE;
491    
492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493     $self->{line_prev} = $self->{line};
494     $self->{column_prev} = $self->{column};
495     $self->{column}++;
496     $self->{nc}
497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498     } else {
499     $self->{set_nc}->($self);
500     }
501    
502     redo A;
503     } else {
504    
505     $self->{s_kwd} = '';
506     #
507     }
508     } elsif ($self->{nc} == 0x003E) { # >
509     if ($self->{escape} and
510     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511     if ($self->{s_kwd} eq '--') {
512    
513     delete $self->{escape};
514 wakaba 1.5 #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520    
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522     line => $self->{line_prev},
523     column => $self->{column_prev} - 1);
524     #
525 wakaba 1.1 } else {
526    
527 wakaba 1.5 #
528 wakaba 1.1 }
529    
530     $self->{s_kwd} = '';
531     #
532 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
533     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534    
535     $self->{s_kwd} .= ']';
536     } elsif ($self->{s_kwd} eq ']]') {
537    
538     #
539     } else {
540    
541     $self->{s_kwd} = '';
542     }
543     #
544 wakaba 1.1 } elsif ($self->{nc} == -1) {
545    
546     $self->{s_kwd} = '';
547     return ({type => END_OF_FILE_TOKEN,
548     line => $self->{line}, column => $self->{column}});
549     last A; ## TODO: ok?
550     } else {
551    
552     $self->{s_kwd} = '';
553     #
554     }
555    
556     # Anything else
557     my $token = {type => CHARACTER_TOKEN,
558     data => chr $self->{nc},
559     line => $self->{line}, column => $self->{column},
560     };
561 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 wakaba 1.1 length $token->{data})) {
563     $self->{s_kwd} = '';
564     }
565    
566     ## Stay in the data state.
567 wakaba 1.5 if (not $self->{is_xml} and
568     $self->{content_model} == PCDATA_CONTENT_MODEL) {
569 wakaba 1.1
570     $self->{state} = PCDATA_STATE;
571     } else {
572    
573     ## Stay in the state.
574     }
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     return ($token);
587     redo A;
588     } elsif ($self->{state} == TAG_OPEN_STATE) {
589 wakaba 1.10 ## XML5: "tag state".
590    
591 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592     if ($self->{nc} == 0x002F) { # /
593    
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     $self->{state} = CLOSE_TAG_OPEN_STATE;
606     redo A;
607     } elsif ($self->{nc} == 0x0021) { # !
608    
609 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 wakaba 1.1 #
611     } else {
612    
613 wakaba 1.12 $self->{s_kwd} = '';
614 wakaba 1.1 #
615     }
616    
617     ## reconsume
618     $self->{state} = DATA_STATE;
619     return ({type => CHARACTER_TOKEN, data => '<',
620     line => $self->{line_prev},
621     column => $self->{column_prev},
622     });
623     redo A;
624     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625     if ($self->{nc} == 0x0021) { # !
626    
627     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x002F) { # /
641    
642     $self->{state} = CLOSE_TAG_OPEN_STATE;
643    
644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645     $self->{line_prev} = $self->{line};
646     $self->{column_prev} = $self->{column};
647     $self->{column}++;
648     $self->{nc}
649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650     } else {
651     $self->{set_nc}->($self);
652     }
653    
654     redo A;
655     } elsif (0x0041 <= $self->{nc} and
656     $self->{nc} <= 0x005A) { # A..Z
657    
658     $self->{ct}
659     = {type => START_TAG_TOKEN,
660 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 wakaba 1.1 line => $self->{line_prev},
662     column => $self->{column_prev}};
663     $self->{state} = TAG_NAME_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } elsif (0x0061 <= $self->{nc} and
677     $self->{nc} <= 0x007A) { # a..z
678    
679     $self->{ct} = {type => START_TAG_TOKEN,
680     tag_name => chr ($self->{nc}),
681     line => $self->{line_prev},
682     column => $self->{column_prev}};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699     line => $self->{line_prev},
700     column => $self->{column_prev});
701     $self->{state} = DATA_STATE;
702 wakaba 1.5 $self->{s_kwd} = '';
703 wakaba 1.1
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714    
715     return ({type => CHARACTER_TOKEN, data => '<>',
716     line => $self->{line_prev},
717     column => $self->{column_prev},
718     });
719    
720     redo A;
721     } elsif ($self->{nc} == 0x003F) { # ?
722 wakaba 1.8 if ($self->{is_xml}) {
723    
724     $self->{state} = PI_STATE;
725    
726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727     $self->{line_prev} = $self->{line};
728     $self->{column_prev} = $self->{column};
729     $self->{column}++;
730     $self->{nc}
731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732     } else {
733     $self->{set_nc}->($self);
734     }
735    
736     redo A;
737     } else {
738    
739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740     line => $self->{line_prev},
741     column => $self->{column_prev});
742     $self->{state} = BOGUS_COMMENT_STATE;
743     $self->{ct} = {type => COMMENT_TOKEN, data => '',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     };
747     ## $self->{nc} is intentionally left as is
748     redo A;
749     }
750 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751 wakaba 1.1
752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753     line => $self->{line_prev},
754     column => $self->{column_prev});
755     $self->{state} = DATA_STATE;
756 wakaba 1.5 $self->{s_kwd} = '';
757 wakaba 1.1 ## reconsume
758    
759     return ({type => CHARACTER_TOKEN, data => '<',
760     line => $self->{line_prev},
761     column => $self->{column_prev},
762     });
763    
764     redo A;
765 wakaba 1.9 } else {
766     ## XML5: "<:" is a parse error.
767    
768     $self->{ct} = {type => START_TAG_TOKEN,
769     tag_name => chr ($self->{nc}),
770     line => $self->{line_prev},
771     column => $self->{column_prev}};
772     $self->{state} = TAG_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785 wakaba 1.1 }
786     } else {
787     die "$0: $self->{content_model} in tag open";
788     }
789     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790     ## NOTE: The "close tag open state" in the spec is implemented as
791     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793 wakaba 1.10 ## XML5: "end tag state".
794    
795 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797     if (defined $self->{last_stag_name}) {
798     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 wakaba 1.12 $self->{kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     redo A;
802     } else {
803     ## No start tag token has ever been emitted
804     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805    
806     $self->{state} = DATA_STATE;
807 wakaba 1.5 $self->{s_kwd} = '';
808 wakaba 1.1 ## Reconsume.
809     return ({type => CHARACTER_TOKEN, data => '</',
810     line => $l, column => $c,
811     });
812     redo A;
813     }
814     }
815    
816     if (0x0041 <= $self->{nc} and
817     $self->{nc} <= 0x005A) { # A..Z
818    
819     $self->{ct}
820     = {type => END_TAG_TOKEN,
821 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 wakaba 1.1 line => $l, column => $c};
823     $self->{state} = TAG_NAME_STATE;
824    
825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826     $self->{line_prev} = $self->{line};
827     $self->{column_prev} = $self->{column};
828     $self->{column}++;
829     $self->{nc}
830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831     } else {
832     $self->{set_nc}->($self);
833     }
834    
835     redo A;
836     } elsif (0x0061 <= $self->{nc} and
837     $self->{nc} <= 0x007A) { # a..z
838    
839     $self->{ct} = {type => END_TAG_TOKEN,
840     tag_name => chr ($self->{nc}),
841     line => $l, column => $c};
842     $self->{state} = TAG_NAME_STATE;
843    
844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845     $self->{line_prev} = $self->{line};
846     $self->{column_prev} = $self->{column};
847     $self->{column}++;
848     $self->{nc}
849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850     } else {
851     $self->{set_nc}->($self);
852     }
853    
854     redo A;
855     } elsif ($self->{nc} == 0x003E) { # >
856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857     line => $self->{line_prev}, ## "<" in "</>"
858     column => $self->{column_prev} - 1);
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.10 if ($self->{is_xml}) {
862    
863     ## XML5: No parse error.
864    
865     ## NOTE: This parser raises a parse error, since it supports
866     ## XML1, not XML5.
867    
868     ## NOTE: A short end tag token.
869     my $ct = {type => END_TAG_TOKEN,
870     tag_name => '',
871     line => $self->{line_prev},
872     column => $self->{column_prev} - 1,
873     };
874    
875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876     $self->{line_prev} = $self->{line};
877     $self->{column_prev} = $self->{column};
878     $self->{column}++;
879     $self->{nc}
880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881     } else {
882     $self->{set_nc}->($self);
883     }
884    
885     return ($ct);
886     } else {
887    
888    
889 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890     $self->{line_prev} = $self->{line};
891     $self->{column_prev} = $self->{column};
892     $self->{column}++;
893     $self->{nc}
894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895     } else {
896     $self->{set_nc}->($self);
897     }
898    
899 wakaba 1.10 }
900 wakaba 1.1 redo A;
901     } elsif ($self->{nc} == -1) {
902    
903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 wakaba 1.5 $self->{s_kwd} = '';
905 wakaba 1.1 $self->{state} = DATA_STATE;
906     # reconsume
907    
908     return ({type => CHARACTER_TOKEN, data => '</',
909     line => $l, column => $c,
910     });
911    
912     redo A;
913 wakaba 1.10 } elsif (not $self->{is_xml} or
914     $is_space->{$self->{nc}}) {
915 wakaba 1.1
916 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917     line => $self->{line_prev}, # "<" of "</"
918     column => $self->{column_prev} - 1);
919 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
920     $self->{ct} = {type => COMMENT_TOKEN, data => '',
921     line => $self->{line_prev}, # "<" of "</"
922     column => $self->{column_prev} - 1,
923     };
924     ## NOTE: $self->{nc} is intentionally left as is.
925     ## Although the "anything else" case of the spec does not
926     ## explicitly state that the next input character is to be
927     ## reconsumed, it will be included in the |data| of the comment
928     ## token generated from the bogus end tag, as defined in the
929     ## "bogus comment state" entry.
930     redo A;
931 wakaba 1.10 } else {
932     ## XML5: "</:" is a parse error.
933    
934     $self->{ct} = {type => END_TAG_TOKEN,
935     tag_name => chr ($self->{nc}),
936     line => $l, column => $c};
937     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950 wakaba 1.1 }
951     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 wakaba 1.1 if (length $ch) {
954     my $CH = $ch;
955     $ch =~ tr/a-z/A-Z/;
956     my $nch = chr $self->{nc};
957     if ($nch eq $ch or $nch eq $CH) {
958    
959     ## Stay in the state.
960 wakaba 1.12 $self->{kwd} .= $nch;
961 wakaba 1.1
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } else {
974    
975     $self->{state} = DATA_STATE;
976 wakaba 1.5 $self->{s_kwd} = '';
977 wakaba 1.1 ## Reconsume.
978     return ({type => CHARACTER_TOKEN,
979 wakaba 1.12 data => '</' . $self->{kwd},
980 wakaba 1.1 line => $self->{line_prev},
981 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
982 wakaba 1.1 });
983     redo A;
984     }
985     } else { # after "</{tag-name}" (whole of |last_stag_name| matched)
986     unless ($is_space->{$self->{nc}} or
987     {
988     0x003E => 1, # >
989     0x002F => 1, # /
990     -1 => 1, # EOF
991     }->{$self->{nc}}) {
992    
993     ## Reconsume.
994     $self->{state} = DATA_STATE;
995 wakaba 1.5 $self->{s_kwd} = '';
996 wakaba 1.1 return ({type => CHARACTER_TOKEN,
997 wakaba 1.12 data => '</' . $self->{kwd},
998 wakaba 1.1 line => $self->{line_prev},
999 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 wakaba 1.1 });
1001     redo A;
1002     } else {
1003    
1004     $self->{ct}
1005     = {type => END_TAG_TOKEN,
1006     tag_name => $self->{last_stag_name},
1007     line => $self->{line_prev},
1008 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1010     ## Reconsume.
1011     redo A;
1012     }
1013     }
1014     } elsif ($self->{state} == TAG_NAME_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016    
1017     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x003E) { # >
1031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032    
1033     $self->{last_stag_name} = $self->{ct}->{tag_name};
1034     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036     #if ($self->{ct}->{attributes}) {
1037     # ## NOTE: This should never be reached.
1038     # !!! cp (36);
1039     # !!! parse-error (type => 'end tag attribute');
1040     #} else {
1041    
1042     #}
1043     } else {
1044     die "$0: $self->{ct}->{type}: Unknown token type";
1045     }
1046     $self->{state} = DATA_STATE;
1047 wakaba 1.5 $self->{s_kwd} = '';
1048 wakaba 1.1
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059    
1060     return ($self->{ct}); # start tag or end tag
1061    
1062     redo A;
1063     } elsif (0x0041 <= $self->{nc} and
1064     $self->{nc} <= 0x005A) { # A..Z
1065    
1066 wakaba 1.4 $self->{ct}->{tag_name}
1067     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 wakaba 1.1 # start tag or end tag
1069     ## Stay in this state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     } elsif ($self->{nc} == -1) {
1083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085    
1086     $self->{last_stag_name} = $self->{ct}->{tag_name};
1087     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089     #if ($self->{ct}->{attributes}) {
1090     # ## NOTE: This state should never be reached.
1091     # !!! cp (40);
1092     # !!! parse-error (type => 'end tag attribute');
1093     #} else {
1094    
1095     #}
1096     } else {
1097     die "$0: $self->{ct}->{type}: Unknown token type";
1098     }
1099     $self->{state} = DATA_STATE;
1100 wakaba 1.5 $self->{s_kwd} = '';
1101 wakaba 1.1 # reconsume
1102    
1103     return ($self->{ct}); # start tag or end tag
1104    
1105     redo A;
1106     } elsif ($self->{nc} == 0x002F) { # /
1107    
1108     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     } else {
1122    
1123     $self->{ct}->{tag_name} .= chr $self->{nc};
1124     # start tag or end tag
1125     ## Stay in the state
1126    
1127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128     $self->{line_prev} = $self->{line};
1129     $self->{column_prev} = $self->{column};
1130     $self->{column}++;
1131     $self->{nc}
1132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133     } else {
1134     $self->{set_nc}->($self);
1135     }
1136    
1137     redo A;
1138     }
1139     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 wakaba 1.11 ## XML5: "Tag attribute name before state".
1141    
1142 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1143    
1144     ## Stay in the state
1145    
1146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147     $self->{line_prev} = $self->{line};
1148     $self->{column_prev} = $self->{column};
1149     $self->{column}++;
1150     $self->{nc}
1151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152     } else {
1153     $self->{set_nc}->($self);
1154     }
1155    
1156     redo A;
1157     } elsif ($self->{nc} == 0x003E) { # >
1158     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159    
1160     $self->{last_stag_name} = $self->{ct}->{tag_name};
1161     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163     if ($self->{ct}->{attributes}) {
1164    
1165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166     } else {
1167    
1168     }
1169     } else {
1170     die "$0: $self->{ct}->{type}: Unknown token type";
1171     }
1172     $self->{state} = DATA_STATE;
1173 wakaba 1.5 $self->{s_kwd} = '';
1174 wakaba 1.1
1175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176     $self->{line_prev} = $self->{line};
1177     $self->{column_prev} = $self->{column};
1178     $self->{column}++;
1179     $self->{nc}
1180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181     } else {
1182     $self->{set_nc}->($self);
1183     }
1184    
1185    
1186     return ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } elsif (0x0041 <= $self->{nc} and
1190     $self->{nc} <= 0x005A) { # A..Z
1191    
1192     $self->{ca}
1193 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 wakaba 1.1 value => '',
1195     line => $self->{line}, column => $self->{column}};
1196     $self->{state} = ATTRIBUTE_NAME_STATE;
1197    
1198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199     $self->{line_prev} = $self->{line};
1200     $self->{column_prev} = $self->{column};
1201     $self->{column}++;
1202     $self->{nc}
1203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204     } else {
1205     $self->{set_nc}->($self);
1206     }
1207    
1208     redo A;
1209     } elsif ($self->{nc} == 0x002F) { # /
1210    
1211     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212    
1213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214     $self->{line_prev} = $self->{line};
1215     $self->{column_prev} = $self->{column};
1216     $self->{column}++;
1217     $self->{nc}
1218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219     } else {
1220     $self->{set_nc}->($self);
1221     }
1222    
1223     redo A;
1224     } elsif ($self->{nc} == -1) {
1225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227    
1228     $self->{last_stag_name} = $self->{ct}->{tag_name};
1229     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231     if ($self->{ct}->{attributes}) {
1232    
1233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234     } else {
1235    
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 # reconsume
1243    
1244     return ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     if ({
1249     0x0022 => 1, # "
1250     0x0027 => 1, # '
1251     0x003D => 1, # =
1252     }->{$self->{nc}}) {
1253    
1254 wakaba 1.11 ## XML5: Not a parse error.
1255 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256     } else {
1257    
1258 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1259 wakaba 1.1 }
1260     $self->{ca}
1261     = {name => chr ($self->{nc}),
1262     value => '',
1263     line => $self->{line}, column => $self->{column}};
1264     $self->{state} = ATTRIBUTE_NAME_STATE;
1265    
1266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267     $self->{line_prev} = $self->{line};
1268     $self->{column_prev} = $self->{column};
1269     $self->{column}++;
1270     $self->{nc}
1271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272     } else {
1273     $self->{set_nc}->($self);
1274     }
1275    
1276     redo A;
1277     }
1278     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 wakaba 1.11 ## XML5: "Tag attribute name state".
1280    
1281 wakaba 1.1     my $before_leave = sub { ## Run when leaving this state: store the attribute being built ($self->{ca}) on the current token ($self->{ct}), or report it as a duplicate.
1282     if (exists $self->{ct}->{attributes} # start tag or end tag
1283     ->{$self->{ca}->{name}}) { # MUST
1284    
1285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286     ## Discard $self->{ca} (it is not stored; the attribute already recorded under this name is kept) # MUST
1287     } else {
1288    
1289     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290     = $self->{ca};
1291 wakaba 1.11     $self->{ca}->{index} = ++$self->{ct}->{last_index}; ## |index| comes from a per-token counter that is incremented for each attribute stored.
1292 wakaba 1.1     }
1293     }; # $before_leave
1294    
1295     if ($is_space->{$self->{nc}}) {
1296    
1297     $before_leave->();
1298     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003D) { # =
1312    
1313     $before_leave->();
1314     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315    
1316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317     $self->{line_prev} = $self->{line};
1318     $self->{column_prev} = $self->{column};
1319     $self->{column}++;
1320     $self->{nc}
1321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322     } else {
1323     $self->{set_nc}->($self);
1324     }
1325    
1326     redo A;
1327     } elsif ($self->{nc} == 0x003E) { # >
1328 wakaba 1.11 if ($self->{is_xml}) {
1329    
1330     ## XML5: Not a parse error.
1331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332     } else {
1333    
1334     }
1335    
1336 wakaba 1.1 $before_leave->();
1337     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338    
1339     $self->{last_stag_name} = $self->{ct}->{tag_name};
1340     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341    
1342     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343     if ($self->{ct}->{attributes}) {
1344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1
1352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353     $self->{line_prev} = $self->{line};
1354     $self->{column_prev} = $self->{column};
1355     $self->{column}++;
1356     $self->{nc}
1357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358     } else {
1359     $self->{set_nc}->($self);
1360     }
1361    
1362    
1363     return ($self->{ct}); # start tag or end tag
1364    
1365     redo A;
1366     } elsif (0x0041 <= $self->{nc} and
1367     $self->{nc} <= 0x005A) { # A..Z
1368    
1369 wakaba 1.4 $self->{ca}->{name}
1370     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 wakaba 1.1 ## Stay in the state
1372    
1373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374     $self->{line_prev} = $self->{line};
1375     $self->{column_prev} = $self->{column};
1376     $self->{column}++;
1377     $self->{nc}
1378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379     } else {
1380     $self->{set_nc}->($self);
1381     }
1382    
1383     redo A;
1384     } elsif ($self->{nc} == 0x002F) { # /
1385 wakaba 1.11 if ($self->{is_xml}) {
1386    
1387     ## XML5: Not a parse error.
1388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389     } else {
1390    
1391     }
1392 wakaba 1.1
1393     $before_leave->();
1394     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395    
1396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397     $self->{line_prev} = $self->{line};
1398     $self->{column_prev} = $self->{column};
1399     $self->{column}++;
1400     $self->{nc}
1401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402     } else {
1403     $self->{set_nc}->($self);
1404     }
1405    
1406     redo A;
1407     } elsif ($self->{nc} == -1) {
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409     $before_leave->();
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411    
1412     $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415     if ($self->{ct}->{attributes}) {
1416    
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418     } else {
1419     ## NOTE: This state should never be reached.
1420    
1421     }
1422     } else {
1423     die "$0: $self->{ct}->{type}: Unknown token type";
1424     }
1425     $self->{state} = DATA_STATE;
1426 wakaba 1.5 $self->{s_kwd} = '';
1427 wakaba 1.1 # reconsume
1428    
1429     return ($self->{ct}); # start tag or end tag
1430    
1431     redo A;
1432     } else {
1433     if ($self->{nc} == 0x0022 or # "
1434     $self->{nc} == 0x0027) { # '
1435    
1436 wakaba 1.11 ## XML5: Not a parse error.
1437 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{name} .= chr ($self->{nc});
1442     ## Stay in the state
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 wakaba 1.11 ## XML5: "Tag attribute name after state".
1458    
1459 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1460    
1461     ## Stay in the state
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x003D) { # =
1475    
1476     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x003E) { # >
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497    
1498 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526    
1527     return ($self->{ct}); # start tag or end tag
1528    
1529     redo A;
1530     } elsif (0x0041 <= $self->{nc} and
1531     $self->{nc} <= 0x005A) { # A..Z
1532    
1533     $self->{ca}
1534 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 wakaba 1.1 value => '',
1536     line => $self->{line}, column => $self->{column}};
1537     $self->{state} = ATTRIBUTE_NAME_STATE;
1538    
1539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540     $self->{line_prev} = $self->{line};
1541     $self->{column_prev} = $self->{column};
1542     $self->{column}++;
1543     $self->{nc}
1544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545     } else {
1546     $self->{set_nc}->($self);
1547     }
1548    
1549     redo A;
1550     } elsif ($self->{nc} == 0x002F) { # /
1551 wakaba 1.11 if ($self->{is_xml}) {
1552    
1553     ## XML5: Not a parse error.
1554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555     } else {
1556    
1557     }
1558 wakaba 1.1
1559     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589 wakaba 1.5 $self->{s_kwd} = '';
1590 wakaba 1.1 $self->{state} = DATA_STATE;
1591     # reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597 wakaba 1.11 if ($self->{is_xml}) {
1598    
1599     ## XML5: Not a parse error.
1600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601     } else {
1602    
1603     }
1604    
1605 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1606     $self->{nc} == 0x0027) { # '
1607    
1608 wakaba 1.11 ## XML5: Not a parse error.
1609 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610     } else {
1611    
1612     }
1613     $self->{ca}
1614     = {name => chr ($self->{nc}),
1615     value => '',
1616     line => $self->{line}, column => $self->{column}};
1617     $self->{state} = ATTRIBUTE_NAME_STATE;
1618    
1619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620     $self->{line_prev} = $self->{line};
1621     $self->{column_prev} = $self->{column};
1622     $self->{column}++;
1623     $self->{nc}
1624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625     } else {
1626     $self->{set_nc}->($self);
1627     }
1628    
1629     redo A;
1630     }
1631     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 wakaba 1.11 ## XML5: "Tag attribute value before state".
1633    
1634 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1635    
1636     ## Stay in the state
1637    
1638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639     $self->{line_prev} = $self->{line};
1640     $self->{column_prev} = $self->{column};
1641     $self->{column}++;
1642     $self->{nc}
1643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644     } else {
1645     $self->{set_nc}->($self);
1646     }
1647    
1648     redo A;
1649     } elsif ($self->{nc} == 0x0022) { # "
1650    
1651     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652    
1653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654     $self->{line_prev} = $self->{line};
1655     $self->{column_prev} = $self->{column};
1656     $self->{column}++;
1657     $self->{nc}
1658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659     } else {
1660     $self->{set_nc}->($self);
1661     }
1662    
1663     redo A;
1664     } elsif ($self->{nc} == 0x0026) { # &
1665    
1666     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667     ## reconsume
1668     redo A;
1669     } elsif ($self->{nc} == 0x0027) { # '
1670    
1671     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683     redo A;
1684     } elsif ($self->{nc} == 0x003E) { # >
1685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687    
1688     $self->{last_stag_name} = $self->{ct}->{tag_name};
1689     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691     if ($self->{ct}->{attributes}) {
1692    
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694     } else {
1695     ## NOTE: This state should never be reached.
1696    
1697     }
1698     } else {
1699     die "$0: $self->{ct}->{type}: Unknown token type";
1700     }
1701     $self->{state} = DATA_STATE;
1702 wakaba 1.5 $self->{s_kwd} = '';
1703 wakaba 1.1
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714    
1715     return ($self->{ct}); # start tag or end tag
1716    
1717     redo A;
1718     } elsif ($self->{nc} == -1) {
1719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721    
1722     $self->{last_stag_name} = $self->{ct}->{tag_name};
1723     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725     if ($self->{ct}->{attributes}) {
1726    
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728     } else {
1729     ## NOTE: This state should never be reached.
1730    
1731     }
1732     } else {
1733     die "$0: $self->{ct}->{type}: Unknown token type";
1734     }
1735     $self->{state} = DATA_STATE;
1736 wakaba 1.5 $self->{s_kwd} = '';
1737 wakaba 1.1 ## reconsume
1738    
1739     return ($self->{ct}); # start tag or end tag
1740    
1741     redo A;
1742     } else {
1743 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1744 wakaba 1.1
1745 wakaba 1.11 ## XML5: Not a parse error.
1746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 wakaba 1.11 } elsif ($self->{is_xml}) {
1748    
1749     ## XML5: No parse error.
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 wakaba 1.1 } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756    
1757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758     $self->{line_prev} = $self->{line};
1759     $self->{column_prev} = $self->{column};
1760     $self->{column}++;
1761     $self->{nc}
1762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763     } else {
1764     $self->{set_nc}->($self);
1765     }
1766    
1767     redo A;
1768     }
1769     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771     ## ATTLIST attribute value double quoted state".
1772 wakaba 1.11
1773 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1774 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775    
1776     ## XML5: "DOCTYPE ATTLIST name after state".
1777     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779     } else {
1780    
1781     ## XML5: "Tag attribute name before state".
1782     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783     }
1784 wakaba 1.1
1785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786     $self->{line_prev} = $self->{line};
1787     $self->{column_prev} = $self->{column};
1788     $self->{column}++;
1789     $self->{nc}
1790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791     } else {
1792     $self->{set_nc}->($self);
1793     }
1794    
1795     redo A;
1796     } elsif ($self->{nc} == 0x0026) { # &
1797    
1798 wakaba 1.11 ## XML5: Not defined yet.
1799    
1800 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1801     ## "entity in attribute value state". In this implementation, the
1802     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803     ## implementation of the "consume a character reference" algorithm.
1804     $self->{prev_state} = $self->{state};
1805     $self->{entity_add} = 0x0022; # "
1806     $self->{state} = ENTITY_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818     redo A;
1819 wakaba 1.25 } elsif ($self->{is_xml} and
1820     $is_space->{$self->{nc}}) {
1821    
1822     $self->{ca}->{value} .= ' ';
1823     ## Stay in the state.
1824    
1825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826     $self->{line_prev} = $self->{line};
1827     $self->{column_prev} = $self->{column};
1828     $self->{column}++;
1829     $self->{nc}
1830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831     } else {
1832     $self->{set_nc}->($self);
1833     }
1834    
1835     redo A;
1836 wakaba 1.1 } elsif ($self->{nc} == -1) {
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839    
1840     $self->{last_stag_name} = $self->{ct}->{tag_name};
1841 wakaba 1.15
1842     $self->{state} = DATA_STATE;
1843     $self->{s_kwd} = '';
1844     ## reconsume
1845     return ($self->{ct}); # start tag
1846     redo A;
1847 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849     if ($self->{ct}->{attributes}) {
1850    
1851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1852     } else {
1853     ## NOTE: This state should never be reached.
1854    
1855     }
1856 wakaba 1.15
1857     $self->{state} = DATA_STATE;
1858     $self->{s_kwd} = '';
1859     ## reconsume
1860     return ($self->{ct}); # end tag
1861     redo A;
1862     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863     ## XML5: No parse error above; not defined yet.
1864     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866     ## Reconsume.
1867     return ($self->{ct}); # ATTLIST
1868     redo A;
1869 wakaba 1.1 } else {
1870     die "$0: $self->{ct}->{type}: Unknown token type";
1871     }
1872     } else {
1873 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1874 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875    
1876     ## XML5: Not a parse error.
1877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878     } else {
1879    
1880     }
1881 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1882     $self->{read_until}->($self->{ca}->{value},
1883 wakaba 1.25 qq["&<\x09\x0C\x20],
1884 wakaba 1.1 length $self->{ca}->{value});
1885    
1886     ## Stay in the state
1887    
1888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1889     $self->{line_prev} = $self->{line};
1890     $self->{column_prev} = $self->{column};
1891     $self->{column}++;
1892     $self->{nc}
1893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1894     } else {
1895     $self->{set_nc}->($self);
1896     }
1897    
1898     redo A;
1899     }
1900     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902     ## ATTLIST attribute value single quoted state".
1903 wakaba 1.11
1904 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1905 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906    
1907     ## XML5: "DOCTYPE ATTLIST name after state".
1908     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910     } else {
1911    
1912     ## XML5: "Before attribute name state" (sic).
1913     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914     }
1915 wakaba 1.1
1916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917     $self->{line_prev} = $self->{line};
1918     $self->{column_prev} = $self->{column};
1919     $self->{column}++;
1920     $self->{nc}
1921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1922     } else {
1923     $self->{set_nc}->($self);
1924     }
1925    
1926     redo A;
1927     } elsif ($self->{nc} == 0x0026) { # &
1928    
1929 wakaba 1.11 ## XML5: Not defined yet.
1930    
1931 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1932     ## "entity in attribute value state". In this implementation, the
1933     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1934     ## implementation of the "consume a character reference" algorithm.
1935     $self->{entity_add} = 0x0027; # '
1936     $self->{prev_state} = $self->{state};
1937     $self->{state} = ENTITY_STATE;
1938    
1939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940     $self->{line_prev} = $self->{line};
1941     $self->{column_prev} = $self->{column};
1942     $self->{column}++;
1943     $self->{nc}
1944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945     } else {
1946     $self->{set_nc}->($self);
1947     }
1948    
1949     redo A;
1950 wakaba 1.25 } elsif ($self->{is_xml} and
1951     $is_space->{$self->{nc}}) {
1952    
1953     $self->{ca}->{value} .= ' ';
1954     ## Stay in the state.
1955    
1956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957     $self->{line_prev} = $self->{line};
1958     $self->{column_prev} = $self->{column};
1959     $self->{column}++;
1960     $self->{nc}
1961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962     } else {
1963     $self->{set_nc}->($self);
1964     }
1965    
1966     redo A;
1967 wakaba 1.1 } elsif ($self->{nc} == -1) {
1968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970    
1971     $self->{last_stag_name} = $self->{ct}->{tag_name};
1972 wakaba 1.15
1973     $self->{state} = DATA_STATE;
1974     $self->{s_kwd} = '';
1975     ## reconsume
1976     return ($self->{ct}); # start tag
1977     redo A;
1978 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980     if ($self->{ct}->{attributes}) {
1981    
1982     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1983     } else {
1984     ## NOTE: This state should never be reached.
1985    
1986     }
1987 wakaba 1.15
1988     $self->{state} = DATA_STATE;
1989     $self->{s_kwd} = '';
1990     ## reconsume
1991     return ($self->{ct}); # end tag
1992     redo A;
1993     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994     ## XML5: No parse error above; not defined yet.
1995     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997     ## Reconsume.
1998     return ($self->{ct}); # ATTLIST
1999     redo A;
2000 wakaba 1.1 } else {
2001     die "$0: $self->{ct}->{type}: Unknown token type";
2002     }
2003     } else {
2004 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2005 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006    
2007     ## XML5: Not a parse error.
2008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009     } else {
2010    
2011     }
2012 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2013     $self->{read_until}->($self->{ca}->{value},
2014 wakaba 1.25 qq['&<\x09\x0C\x20],
2015 wakaba 1.1 length $self->{ca}->{value});
2016    
2017     ## Stay in the state
2018    
2019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020     $self->{line_prev} = $self->{line};
2021     $self->{column_prev} = $self->{column};
2022     $self->{column}++;
2023     $self->{nc}
2024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025     } else {
2026     $self->{set_nc}->($self);
2027     }
2028    
2029     redo A;
2030     }
2031     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2033    
2034 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2035 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036    
2037     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039     } else {
2040    
2041     ## XML5: "Tag attribute name before state".
2042     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043     }
2044 wakaba 1.1
2045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046     $self->{line_prev} = $self->{line};
2047     $self->{column_prev} = $self->{column};
2048     $self->{column}++;
2049     $self->{nc}
2050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051     } else {
2052     $self->{set_nc}->($self);
2053     }
2054    
2055     redo A;
2056     } elsif ($self->{nc} == 0x0026) { # &
2057    
2058 wakaba 1.11
2059     ## XML5: Not defined yet.
2060    
2061 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2062     ## "entity in attribute value state". In this implementation, the
2063     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2064     ## implementation of the "consume a character reference" algorithm.
2065     $self->{entity_add} = -1;
2066     $self->{prev_state} = $self->{state};
2067     $self->{state} = ENTITY_STATE;
2068    
2069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070     $self->{line_prev} = $self->{line};
2071     $self->{column_prev} = $self->{column};
2072     $self->{column}++;
2073     $self->{nc}
2074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075     } else {
2076     $self->{set_nc}->($self);
2077     }
2078    
2079     redo A;
2080     } elsif ($self->{nc} == 0x003E) { # >
2081     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082    
2083     $self->{last_stag_name} = $self->{ct}->{tag_name};
2084 wakaba 1.15
2085     $self->{state} = DATA_STATE;
2086     $self->{s_kwd} = '';
2087    
2088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089     $self->{line_prev} = $self->{line};
2090     $self->{column_prev} = $self->{column};
2091     $self->{column}++;
2092     $self->{nc}
2093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094     } else {
2095     $self->{set_nc}->($self);
2096     }
2097    
2098     return ($self->{ct}); # start tag
2099     redo A;
2100 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102     if ($self->{ct}->{attributes}) {
2103    
2104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2105     } else {
2106     ## NOTE: This state should never be reached.
2107    
2108     }
2109 wakaba 1.15
2110     $self->{state} = DATA_STATE;
2111     $self->{s_kwd} = '';
2112    
2113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114     $self->{line_prev} = $self->{line};
2115     $self->{column_prev} = $self->{column};
2116     $self->{column}++;
2117     $self->{nc}
2118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2119     } else {
2120     $self->{set_nc}->($self);
2121     }
2122    
2123     return ($self->{ct}); # end tag
2124     redo A;
2125     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128    
2129 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130     $self->{line_prev} = $self->{line};
2131     $self->{column_prev} = $self->{column};
2132     $self->{column}++;
2133     $self->{nc}
2134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135     } else {
2136     $self->{set_nc}->($self);
2137     }
2138    
2139 wakaba 1.15 return ($self->{ct}); # ATTLIST
2140     redo A;
2141     } else {
2142     die "$0: $self->{ct}->{type}: Unknown token type";
2143     }
2144 wakaba 1.1 } elsif ($self->{nc} == -1) {
2145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146    
2147 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2149 wakaba 1.15
2150     $self->{state} = DATA_STATE;
2151     $self->{s_kwd} = '';
2152     ## reconsume
2153     return ($self->{ct}); # start tag
2154     redo A;
2155 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158     if ($self->{ct}->{attributes}) {
2159    
2160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2161     } else {
2162     ## NOTE: This state should never be reached.
2163    
2164     }
2165 wakaba 1.15
2166     $self->{state} = DATA_STATE;
2167     $self->{s_kwd} = '';
2168     ## reconsume
2169     return ($self->{ct}); # end tag
2170     redo A;
2171     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175     ## Reconsume.
2176     return ($self->{ct}); # ATTLIST
2177     redo A;
2178 wakaba 1.1 } else {
2179     die "$0: $self->{ct}->{type}: Unknown token type";
2180     }
2181     } else {
2182     if ({
2183     0x0022 => 1, # "
2184     0x0027 => 1, # '
2185     0x003D => 1, # =
2186 wakaba 1.26 0x003C => 1, # <
2187 wakaba 1.1 }->{$self->{nc}}) {
2188    
2189 wakaba 1.11 ## XML5: Not a parse error.
2190 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2191     } else {
2192    
2193     }
2194     $self->{ca}->{value} .= chr ($self->{nc});
2195     $self->{read_until}->($self->{ca}->{value},
2196 wakaba 1.25 qq["'=& \x09\x0C>],
2197 wakaba 1.1 length $self->{ca}->{value});
2198    
2199     ## Stay in the state
2200    
2201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2202     $self->{line_prev} = $self->{line};
2203     $self->{column_prev} = $self->{column};
2204     $self->{column}++;
2205     $self->{nc}
2206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2207     } else {
2208     $self->{set_nc}->($self);
2209     }
2210    
2211     redo A;
2212     }
2213     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2214     if ($is_space->{$self->{nc}}) {
2215    
2216     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2217    
2218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2219     $self->{line_prev} = $self->{line};
2220     $self->{column_prev} = $self->{column};
2221     $self->{column}++;
2222     $self->{nc}
2223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2224     } else {
2225     $self->{set_nc}->($self);
2226     }
2227    
2228     redo A;
2229     } elsif ($self->{nc} == 0x003E) { # >
2230     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2231    
2232     $self->{last_stag_name} = $self->{ct}->{tag_name};
2233     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2234     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2235     if ($self->{ct}->{attributes}) {
2236    
2237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2238     } else {
2239     ## NOTE: This state should never be reached.
2240    
2241     }
2242     } else {
2243     die "$0: $self->{ct}->{type}: Unknown token type";
2244     }
2245     $self->{state} = DATA_STATE;
2246 wakaba 1.5 $self->{s_kwd} = '';
2247 wakaba 1.1
2248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249     $self->{line_prev} = $self->{line};
2250     $self->{column_prev} = $self->{column};
2251     $self->{column}++;
2252     $self->{nc}
2253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254     } else {
2255     $self->{set_nc}->($self);
2256     }
2257    
2258    
2259     return ($self->{ct}); # start tag or end tag
2260    
2261     redo A;
2262     } elsif ($self->{nc} == 0x002F) { # /
2263    
2264     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2265    
2266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2267     $self->{line_prev} = $self->{line};
2268     $self->{column_prev} = $self->{column};
2269     $self->{column}++;
2270     $self->{nc}
2271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2272     } else {
2273     $self->{set_nc}->($self);
2274     }
2275    
2276     redo A;
2277     } elsif ($self->{nc} == -1) {
2278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2279     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2280    
2281     $self->{last_stag_name} = $self->{ct}->{tag_name};
2282     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2283     if ($self->{ct}->{attributes}) {
2284    
2285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2286     } else {
2287     ## NOTE: This state should never be reached.
2288    
2289     }
2290     } else {
2291     die "$0: $self->{ct}->{type}: Unknown token type";
2292     }
2293     $self->{state} = DATA_STATE;
2294 wakaba 1.5 $self->{s_kwd} = '';
2295 wakaba 1.1 ## Reconsume.
2296     return ($self->{ct}); # start tag or end tag
2297     redo A;
2298     } else {
2299    
2300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2301     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2302     ## reconsume
2303     redo A;
2304     }
2305     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2306 wakaba 1.11 ## XML5: "Empty tag state".
2307    
2308 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2309     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2310    
2311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2312     ## TODO: Different type than slash in start tag
2313     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2314     if ($self->{ct}->{attributes}) {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317     } else {
2318    
2319     }
2320     ## TODO: Test |<title></title/>|
2321     } else {
2322    
2323     $self->{self_closing} = 1;
2324     }
2325    
2326     $self->{state} = DATA_STATE;
2327 wakaba 1.5 $self->{s_kwd} = '';
2328 wakaba 1.1
2329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2330     $self->{line_prev} = $self->{line};
2331     $self->{column_prev} = $self->{column};
2332     $self->{column}++;
2333     $self->{nc}
2334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2335     } else {
2336     $self->{set_nc}->($self);
2337     }
2338    
2339    
2340     return ($self->{ct}); # start tag or end tag
2341    
2342     redo A;
2343     } elsif ($self->{nc} == -1) {
2344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2345     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2346    
2347     $self->{last_stag_name} = $self->{ct}->{tag_name};
2348     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2349     if ($self->{ct}->{attributes}) {
2350    
2351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2352     } else {
2353     ## NOTE: This state should never be reached.
2354    
2355     }
2356     } else {
2357     die "$0: $self->{ct}->{type}: Unknown token type";
2358     }
2359 wakaba 1.11 ## XML5: "Tag attribute name before state".
2360 wakaba 1.1 $self->{state} = DATA_STATE;
2361 wakaba 1.5 $self->{s_kwd} = '';
2362 wakaba 1.1 ## Reconsume.
2363     return ($self->{ct}); # start tag or end tag
2364     redo A;
2365     } else {
2366    
2367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2368     ## TODO: This error type is wrong.
2369     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2370     ## Reconsume.
2371     redo A;
2372     }
2373     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2374 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2375    
2376 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2377     ## consumes characters on a one-by-one basis.
2378    
2379     if ($self->{nc} == 0x003E) { # >
2380 wakaba 1.13 if ($self->{in_subset}) {
2381    
2382     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2383     } else {
2384    
2385     $self->{state} = DATA_STATE;
2386     $self->{s_kwd} = '';
2387     }
2388 wakaba 1.1
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399    
2400     return ($self->{ct}); # comment
2401     redo A;
2402     } elsif ($self->{nc} == -1) {
2403 wakaba 1.13 if ($self->{in_subset}) {
2404    
2405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2406     } else {
2407    
2408     $self->{state} = DATA_STATE;
2409     $self->{s_kwd} = '';
2410     }
2411 wakaba 1.1 ## reconsume
2412    
2413     return ($self->{ct}); # comment
2414     redo A;
2415     } else {
2416    
2417     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2418     $self->{read_until}->($self->{ct}->{data},
2419     q[>],
2420     length $self->{ct}->{data});
2421    
2422     ## Stay in the state.
2423    
2424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2425     $self->{line_prev} = $self->{line};
2426     $self->{column_prev} = $self->{column};
2427     $self->{column}++;
2428     $self->{nc}
2429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2430     } else {
2431     $self->{set_nc}->($self);
2432     }
2433    
2434     redo A;
2435     }
2436     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2437 wakaba 1.14 ## XML5: "Markup declaration state".
2438 wakaba 1.1
2439     if ($self->{nc} == 0x002D) { # -
2440    
2441     $self->{state} = MD_HYPHEN_STATE;
2442    
2443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2444     $self->{line_prev} = $self->{line};
2445     $self->{column_prev} = $self->{column};
2446     $self->{column}++;
2447     $self->{nc}
2448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2449     } else {
2450     $self->{set_nc}->($self);
2451     }
2452    
2453     redo A;
2454     } elsif ($self->{nc} == 0x0044 or # D
2455     $self->{nc} == 0x0064) { # d
2456     ## ASCII case-insensitive.
2457    
2458     $self->{state} = MD_DOCTYPE_STATE;
2459 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2460 wakaba 1.1
2461     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2462     $self->{line_prev} = $self->{line};
2463     $self->{column_prev} = $self->{column};
2464     $self->{column}++;
2465     $self->{nc}
2466     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2467     } else {
2468     $self->{set_nc}->($self);
2469     }
2470    
2471     redo A;
2472 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2473     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2474     $self->{is_xml}) and
2475 wakaba 1.1 $self->{nc} == 0x005B) { # [
2476    
2477     $self->{state} = MD_CDATA_STATE;
2478 wakaba 1.12 $self->{kwd} = '[';
2479 wakaba 1.1
2480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481     $self->{line_prev} = $self->{line};
2482     $self->{column_prev} = $self->{column};
2483     $self->{column}++;
2484     $self->{nc}
2485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2486     } else {
2487     $self->{set_nc}->($self);
2488     }
2489    
2490     redo A;
2491     } else {
2492    
2493     }
2494    
2495     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2496     line => $self->{line_prev},
2497     column => $self->{column_prev} - 1);
2498     ## Reconsume.
2499     $self->{state} = BOGUS_COMMENT_STATE;
2500     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2501     line => $self->{line_prev},
2502     column => $self->{column_prev} - 1,
2503     };
2504     redo A;
2505     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2506     if ($self->{nc} == 0x002D) { # -
2507    
2508     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2509     line => $self->{line_prev},
2510     column => $self->{column_prev} - 2,
2511     };
2512 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2513 wakaba 1.1
2514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515     $self->{line_prev} = $self->{line};
2516     $self->{column_prev} = $self->{column};
2517     $self->{column}++;
2518     $self->{nc}
2519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520     } else {
2521     $self->{set_nc}->($self);
2522     }
2523    
2524     redo A;
2525     } else {
2526    
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2528     line => $self->{line_prev},
2529     column => $self->{column_prev} - 2);
2530     $self->{state} = BOGUS_COMMENT_STATE;
2531     ## Reconsume.
2532     $self->{ct} = {type => COMMENT_TOKEN,
2533     data => '-',
2534     line => $self->{line_prev},
2535     column => $self->{column_prev} - 2,
2536     };
2537     redo A;
2538     }
2539     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2540     ## ASCII case-insensitive.
2541     if ($self->{nc} == [
2542     undef,
2543     0x004F, # O
2544     0x0043, # C
2545     0x0054, # T
2546     0x0059, # Y
2547     0x0050, # P
2548 wakaba 1.12 ]->[length $self->{kwd}] or
2549 wakaba 1.1 $self->{nc} == [
2550     undef,
2551     0x006F, # o
2552     0x0063, # c
2553     0x0074, # t
2554     0x0079, # y
2555     0x0070, # p
2556 wakaba 1.12 ]->[length $self->{kwd}]) {
2557 wakaba 1.1
2558     ## Stay in the state.
2559 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2560 wakaba 1.1
2561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562     $self->{line_prev} = $self->{line};
2563     $self->{column_prev} = $self->{column};
2564     $self->{column}++;
2565     $self->{nc}
2566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2567     } else {
2568     $self->{set_nc}->($self);
2569     }
2570    
2571     redo A;
2572 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2573 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2574     $self->{nc} == 0x0065)) { # e
2575 wakaba 1.12 if ($self->{is_xml} and
2576     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2577 wakaba 1.10
2578     ## XML5: case-sensitive.
2579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2580     text => 'DOCTYPE',
2581     line => $self->{line_prev},
2582     column => $self->{column_prev} - 5);
2583     } else {
2584    
2585     }
2586 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2587     $self->{ct} = {type => DOCTYPE_TOKEN,
2588     quirks => 1,
2589     line => $self->{line_prev},
2590     column => $self->{column_prev} - 7,
2591     };
2592    
2593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2594     $self->{line_prev} = $self->{line};
2595     $self->{column_prev} = $self->{column};
2596     $self->{column}++;
2597     $self->{nc}
2598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2599     } else {
2600     $self->{set_nc}->($self);
2601     }
2602    
2603     redo A;
2604     } else {
2605    
2606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2607     line => $self->{line_prev},
2608 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2609 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2610     ## Reconsume.
2611     $self->{ct} = {type => COMMENT_TOKEN,
2612 wakaba 1.12 data => $self->{kwd},
2613 wakaba 1.1 line => $self->{line_prev},
2614 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2615 wakaba 1.1 };
2616     redo A;
2617     }
2618     } elsif ($self->{state} == MD_CDATA_STATE) {
2619     if ($self->{nc} == {
2620     '[' => 0x0043, # C
2621     '[C' => 0x0044, # D
2622     '[CD' => 0x0041, # A
2623     '[CDA' => 0x0054, # T
2624     '[CDAT' => 0x0041, # A
2625 wakaba 1.12 }->{$self->{kwd}}) {
2626 wakaba 1.1
2627     ## Stay in the state.
2628 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2629 wakaba 1.1
2630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2631     $self->{line_prev} = $self->{line};
2632     $self->{column_prev} = $self->{column};
2633     $self->{column}++;
2634     $self->{nc}
2635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2636     } else {
2637     $self->{set_nc}->($self);
2638     }
2639    
2640     redo A;
2641 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2642 wakaba 1.1 $self->{nc} == 0x005B) { # [
2643 wakaba 1.6 if ($self->{is_xml} and
2644     not $self->{tainted} and
2645     @{$self->{open_elements} or []} == 0) {
2646 wakaba 1.8
2647 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2648     line => $self->{line_prev},
2649     column => $self->{column_prev} - 7);
2650     $self->{tainted} = 1;
2651 wakaba 1.8 } else {
2652    
2653 wakaba 1.6 }
2654    
2655 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2656     data => '',
2657     line => $self->{line_prev},
2658     column => $self->{column_prev} - 7};
2659     $self->{state} = CDATA_SECTION_STATE;
2660    
2661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662     $self->{line_prev} = $self->{line};
2663     $self->{column_prev} = $self->{column};
2664     $self->{column}++;
2665     $self->{nc}
2666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667     } else {
2668     $self->{set_nc}->($self);
2669     }
2670    
2671     redo A;
2672     } else {
2673    
2674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2675     line => $self->{line_prev},
2676 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2677 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2678     ## Reconsume.
2679     $self->{ct} = {type => COMMENT_TOKEN,
2680 wakaba 1.12 data => $self->{kwd},
2681 wakaba 1.1 line => $self->{line_prev},
2682 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2683 wakaba 1.1 };
2684     redo A;
2685     }
2686     } elsif ($self->{state} == COMMENT_START_STATE) {
2687     if ($self->{nc} == 0x002D) { # -
2688    
2689     $self->{state} = COMMENT_START_DASH_STATE;
2690    
2691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2692     $self->{line_prev} = $self->{line};
2693     $self->{column_prev} = $self->{column};
2694     $self->{column}++;
2695     $self->{nc}
2696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2697     } else {
2698     $self->{set_nc}->($self);
2699     }
2700    
2701     redo A;
2702     } elsif ($self->{nc} == 0x003E) { # >
2703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2704 wakaba 1.13 if ($self->{in_subset}) {
2705    
2706     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707     } else {
2708    
2709     $self->{state} = DATA_STATE;
2710     $self->{s_kwd} = '';
2711     }
2712 wakaba 1.1
2713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714     $self->{line_prev} = $self->{line};
2715     $self->{column_prev} = $self->{column};
2716     $self->{column}++;
2717     $self->{nc}
2718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719     } else {
2720     $self->{set_nc}->($self);
2721     }
2722    
2723    
2724     return ($self->{ct}); # comment
2725    
2726     redo A;
2727     } elsif ($self->{nc} == -1) {
2728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729 wakaba 1.13 if ($self->{in_subset}) {
2730    
2731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732     } else {
2733    
2734     $self->{state} = DATA_STATE;
2735     $self->{s_kwd} = '';
2736     }
2737 wakaba 1.1 ## reconsume
2738    
2739     return ($self->{ct}); # comment
2740    
2741     redo A;
2742     } else {
2743    
2744     $self->{ct}->{data} # comment
2745     .= chr ($self->{nc});
2746     $self->{state} = COMMENT_STATE;
2747    
2748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2749     $self->{line_prev} = $self->{line};
2750     $self->{column_prev} = $self->{column};
2751     $self->{column}++;
2752     $self->{nc}
2753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2754     } else {
2755     $self->{set_nc}->($self);
2756     }
2757    
2758     redo A;
2759     }
2760     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2761     if ($self->{nc} == 0x002D) { # -
2762    
2763     $self->{state} = COMMENT_END_STATE;
2764    
2765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2766     $self->{line_prev} = $self->{line};
2767     $self->{column_prev} = $self->{column};
2768     $self->{column}++;
2769     $self->{nc}
2770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2771     } else {
2772     $self->{set_nc}->($self);
2773     }
2774    
2775     redo A;
2776     } elsif ($self->{nc} == 0x003E) { # >
2777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2778 wakaba 1.13 if ($self->{in_subset}) {
2779    
2780     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781     } else {
2782    
2783     $self->{state} = DATA_STATE;
2784     $self->{s_kwd} = '';
2785     }
2786 wakaba 1.1
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797    
2798     return ($self->{ct}); # comment
2799    
2800     redo A;
2801     } elsif ($self->{nc} == -1) {
2802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803 wakaba 1.13 if ($self->{in_subset}) {
2804    
2805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806     } else {
2807    
2808     $self->{state} = DATA_STATE;
2809     $self->{s_kwd} = '';
2810     }
2811 wakaba 1.1 ## reconsume
2812    
2813     return ($self->{ct}); # comment
2814    
2815     redo A;
2816     } else {
2817    
2818     $self->{ct}->{data} # comment
2819     .= '-' . chr ($self->{nc});
2820     $self->{state} = COMMENT_STATE;
2821    
2822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2823     $self->{line_prev} = $self->{line};
2824     $self->{column_prev} = $self->{column};
2825     $self->{column}++;
2826     $self->{nc}
2827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2828     } else {
2829     $self->{set_nc}->($self);
2830     }
2831    
2832     redo A;
2833     }
2834     } elsif ($self->{state} == COMMENT_STATE) {
2835 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2836    
2837 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2838    
2839     $self->{state} = COMMENT_END_DASH_STATE;
2840    
2841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842     $self->{line_prev} = $self->{line};
2843     $self->{column_prev} = $self->{column};
2844     $self->{column}++;
2845     $self->{nc}
2846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847     } else {
2848     $self->{set_nc}->($self);
2849     }
2850    
2851     redo A;
2852     } elsif ($self->{nc} == -1) {
2853     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2854 wakaba 1.13 if ($self->{in_subset}) {
2855    
2856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2857     } else {
2858    
2859     $self->{state} = DATA_STATE;
2860     $self->{s_kwd} = '';
2861     }
2862 wakaba 1.1 ## reconsume
2863    
2864     return ($self->{ct}); # comment
2865    
2866     redo A;
2867     } else {
2868    
2869     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2870     $self->{read_until}->($self->{ct}->{data},
2871     q[-],
2872     length $self->{ct}->{data});
2873    
2874     ## Stay in the state
2875    
2876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877     $self->{line_prev} = $self->{line};
2878     $self->{column_prev} = $self->{column};
2879     $self->{column}++;
2880     $self->{nc}
2881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882     } else {
2883     $self->{set_nc}->($self);
2884     }
2885    
2886     redo A;
2887     }
2888     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2889 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2890 wakaba 1.10
2891 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2892    
2893     $self->{state} = COMMENT_END_STATE;
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } elsif ($self->{nc} == -1) {
2907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2908 wakaba 1.13 if ($self->{in_subset}) {
2909    
2910     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2911     } else {
2912    
2913     $self->{state} = DATA_STATE;
2914     $self->{s_kwd} = '';
2915     }
2916 wakaba 1.1 ## reconsume
2917    
2918     return ($self->{ct}); # comment
2919    
2920     redo A;
2921     } else {
2922    
2923     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2924     $self->{state} = COMMENT_STATE;
2925    
2926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2927     $self->{line_prev} = $self->{line};
2928     $self->{column_prev} = $self->{column};
2929     $self->{column}++;
2930     $self->{nc}
2931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2932     } else {
2933     $self->{set_nc}->($self);
2934     }
2935    
2936     redo A;
2937     }
2938     } elsif ($self->{state} == COMMENT_END_STATE) {
2939 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2940    
2941 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2942 wakaba 1.13 if ($self->{in_subset}) {
2943    
2944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945     } else {
2946    
2947     $self->{state} = DATA_STATE;
2948     $self->{s_kwd} = '';
2949     }
2950 wakaba 1.1
2951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952     $self->{line_prev} = $self->{line};
2953     $self->{column_prev} = $self->{column};
2954     $self->{column}++;
2955     $self->{nc}
2956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2957     } else {
2958     $self->{set_nc}->($self);
2959     }
2960    
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } elsif ($self->{nc} == 0x002D) { # -
2966    
2967 wakaba 1.10 ## XML5: Not a parse error.
2968 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969     line => $self->{line_prev},
2970     column => $self->{column_prev});
2971     $self->{ct}->{data} .= '-'; # comment
2972     ## Stay in the state
2973    
2974     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975     $self->{line_prev} = $self->{line};
2976     $self->{column_prev} = $self->{column};
2977     $self->{column}++;
2978     $self->{nc}
2979     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980     } else {
2981     $self->{set_nc}->($self);
2982     }
2983    
2984     redo A;
2985     } elsif ($self->{nc} == -1) {
2986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2987 wakaba 1.13 if ($self->{in_subset}) {
2988    
2989     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2990     } else {
2991    
2992     $self->{state} = DATA_STATE;
2993     $self->{s_kwd} = '';
2994     }
2995 wakaba 1.1 ## reconsume
2996    
2997     return ($self->{ct}); # comment
2998    
2999     redo A;
3000     } else {
3001    
3002     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3003     $self->{state} = COMMENT_STATE;
3004    
3005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3006     $self->{line_prev} = $self->{line};
3007     $self->{column_prev} = $self->{column};
3008     $self->{column}++;
3009     $self->{nc}
3010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3011     } else {
3012     $self->{set_nc}->($self);
3013     }
3014    
3015     redo A;
3016     }
3017     } elsif ($self->{state} == DOCTYPE_STATE) {
3018     if ($is_space->{$self->{nc}}) {
3019    
3020     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3021    
3022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3023     $self->{line_prev} = $self->{line};
3024     $self->{column_prev} = $self->{column};
3025     $self->{column}++;
3026     $self->{nc}
3027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3028     } else {
3029     $self->{set_nc}->($self);
3030     }
3031    
3032     redo A;
3033     } else {
3034    
3035 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
3036 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3037     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3038     ## reconsume
3039     redo A;
3040     }
3041     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3042 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3043    
3044 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3045    
3046     ## Stay in the state
3047    
3048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3049     $self->{line_prev} = $self->{line};
3050     $self->{column_prev} = $self->{column};
3051     $self->{column}++;
3052     $self->{nc}
3053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3054     } else {
3055     $self->{set_nc}->($self);
3056     }
3057    
3058     redo A;
3059     } elsif ($self->{nc} == 0x003E) { # >
3060    
3061 wakaba 1.12 ## XML5: No parse error.
3062 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3063     $self->{state} = DATA_STATE;
3064 wakaba 1.5 $self->{s_kwd} = '';
3065 wakaba 1.1
3066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067     $self->{line_prev} = $self->{line};
3068     $self->{column_prev} = $self->{column};
3069     $self->{column}++;
3070     $self->{nc}
3071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072     } else {
3073     $self->{set_nc}->($self);
3074     }
3075    
3076    
3077     return ($self->{ct}); # DOCTYPE (quirks)
3078    
3079     redo A;
3080     } elsif ($self->{nc} == -1) {
3081    
3082     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3083     $self->{state} = DATA_STATE;
3084 wakaba 1.5 $self->{s_kwd} = '';
3085 wakaba 1.1 ## reconsume
3086    
3087     return ($self->{ct}); # DOCTYPE (quirks)
3088    
3089     redo A;
3090 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3091    
3092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3093     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3094 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3095     $self->{in_subset} = 1;
3096 wakaba 1.12
3097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3098     $self->{line_prev} = $self->{line};
3099     $self->{column_prev} = $self->{column};
3100     $self->{column}++;
3101     $self->{nc}
3102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3103     } else {
3104     $self->{set_nc}->($self);
3105     }
3106    
3107 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3108 wakaba 1.12 redo A;
3109 wakaba 1.1 } else {
3110    
3111     $self->{ct}->{name} = chr $self->{nc};
3112     delete $self->{ct}->{quirks};
3113     $self->{state} = DOCTYPE_NAME_STATE;
3114    
3115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3116     $self->{line_prev} = $self->{line};
3117     $self->{column_prev} = $self->{column};
3118     $self->{column}++;
3119     $self->{nc}
3120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3121     } else {
3122     $self->{set_nc}->($self);
3123     }
3124    
3125     redo A;
3126     }
3127     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3128 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3129    
3130     ## ISSUE: Redundant "First," in the spec.
3131    
3132 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3133    
3134     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3135    
3136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137     $self->{line_prev} = $self->{line};
3138     $self->{column_prev} = $self->{column};
3139     $self->{column}++;
3140     $self->{nc}
3141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142     } else {
3143     $self->{set_nc}->($self);
3144     }
3145    
3146     redo A;
3147     } elsif ($self->{nc} == 0x003E) { # >
3148    
3149     $self->{state} = DATA_STATE;
3150 wakaba 1.5 $self->{s_kwd} = '';
3151 wakaba 1.1
3152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153     $self->{line_prev} = $self->{line};
3154     $self->{column_prev} = $self->{column};
3155     $self->{column}++;
3156     $self->{nc}
3157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158     } else {
3159     $self->{set_nc}->($self);
3160     }
3161    
3162    
3163     return ($self->{ct}); # DOCTYPE
3164    
3165     redo A;
3166     } elsif ($self->{nc} == -1) {
3167    
3168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3169     $self->{state} = DATA_STATE;
3170 wakaba 1.5 $self->{s_kwd} = '';
3171 wakaba 1.1 ## reconsume
3172    
3173     $self->{ct}->{quirks} = 1;
3174     return ($self->{ct}); # DOCTYPE
3175    
3176     redo A;
3177 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3178    
3179     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3180 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3181     $self->{in_subset} = 1;
3182 wakaba 1.12
3183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3184     $self->{line_prev} = $self->{line};
3185     $self->{column_prev} = $self->{column};
3186     $self->{column}++;
3187     $self->{nc}
3188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3189     } else {
3190     $self->{set_nc}->($self);
3191     }
3192    
3193 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3194 wakaba 1.12 redo A;
3195 wakaba 1.1 } else {
3196    
3197     $self->{ct}->{name}
3198     .= chr ($self->{nc}); # DOCTYPE
3199     ## Stay in the state
3200    
3201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3202     $self->{line_prev} = $self->{line};
3203     $self->{column_prev} = $self->{column};
3204     $self->{column}++;
3205     $self->{nc}
3206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3207     } else {
3208     $self->{set_nc}->($self);
3209     }
3210    
3211     redo A;
3212     }
3213     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3214 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3215     ## state", but implemented differently.
3216    
3217 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3218    
3219     ## Stay in the state
3220    
3221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3222     $self->{line_prev} = $self->{line};
3223     $self->{column_prev} = $self->{column};
3224     $self->{column}++;
3225     $self->{nc}
3226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3227     } else {
3228     $self->{set_nc}->($self);
3229     }
3230    
3231     redo A;
3232     } elsif ($self->{nc} == 0x003E) { # >
3233 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3234    
3235     $self->{state} = DATA_STATE;
3236     $self->{s_kwd} = '';
3237     } else {
3238    
3239     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3240     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3241     }
3242 wakaba 1.1
3243    
3244     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3245     $self->{line_prev} = $self->{line};
3246     $self->{column_prev} = $self->{column};
3247     $self->{column}++;
3248     $self->{nc}
3249     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3250     } else {
3251     $self->{set_nc}->($self);
3252     }
3253    
3254 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3255 wakaba 1.1 redo A;
3256     } elsif ($self->{nc} == -1) {
3257 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3258    
3259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3260     $self->{state} = DATA_STATE;
3261     $self->{s_kwd} = '';
3262     $self->{ct}->{quirks} = 1;
3263     } else {
3264    
3265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3266     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3267     }
3268 wakaba 1.1
3269 wakaba 1.16 ## Reconsume.
3270     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3271 wakaba 1.1 redo A;
3272     } elsif ($self->{nc} == 0x0050 or # P
3273     $self->{nc} == 0x0070) { # p
3274 wakaba 1.12
3275 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3276 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3277 wakaba 1.1
3278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3279     $self->{line_prev} = $self->{line};
3280     $self->{column_prev} = $self->{column};
3281     $self->{column}++;
3282     $self->{nc}
3283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3284     } else {
3285     $self->{set_nc}->($self);
3286     }
3287    
3288     redo A;
3289     } elsif ($self->{nc} == 0x0053 or # S
3290     $self->{nc} == 0x0073) { # s
3291 wakaba 1.12
3292 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3293 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3294    
3295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3296     $self->{line_prev} = $self->{line};
3297     $self->{column_prev} = $self->{column};
3298     $self->{column}++;
3299     $self->{nc}
3300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3301     } else {
3302     $self->{set_nc}->($self);
3303     }
3304    
3305     redo A;
3306 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3307     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3308     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3309    
3310     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3311     $self->{ct}->{value} = ''; # ENTITY
3312    
3313     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3314     $self->{line_prev} = $self->{line};
3315     $self->{column_prev} = $self->{column};
3316     $self->{column}++;
3317     $self->{nc}
3318     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3319     } else {
3320     $self->{set_nc}->($self);
3321     }
3322    
3323     redo A;
3324     } elsif ($self->{nc} == 0x0027 and # '
3325     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3326     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3327    
3328     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3329     $self->{ct}->{value} = ''; # ENTITY
3330    
3331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332     $self->{line_prev} = $self->{line};
3333     $self->{column_prev} = $self->{column};
3334     $self->{column}++;
3335     $self->{nc}
3336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3337     } else {
3338     $self->{set_nc}->($self);
3339     }
3340    
3341     redo A;
3342 wakaba 1.16 } elsif ($self->{is_xml} and
3343     $self->{ct}->{type} == DOCTYPE_TOKEN and
3344     $self->{nc} == 0x005B) { # [
3345 wakaba 1.12
3346     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3347     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3348 wakaba 1.13 $self->{in_subset} = 1;
3349 wakaba 1.1
3350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3351     $self->{line_prev} = $self->{line};
3352     $self->{column_prev} = $self->{column};
3353     $self->{column}++;
3354     $self->{nc}
3355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3356     } else {
3357     $self->{set_nc}->($self);
3358     }
3359    
3360 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3361 wakaba 1.1 redo A;
3362     } else {
3363 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3364    
3365     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3366    
3367     $self->{ct}->{quirks} = 1;
3368     $self->{state} = BOGUS_DOCTYPE_STATE;
3369     } else {
3370    
3371     $self->{state} = BOGUS_MD_STATE;
3372     }
3373 wakaba 1.1
3374    
3375     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3376     $self->{line_prev} = $self->{line};
3377     $self->{column_prev} = $self->{column};
3378     $self->{column}++;
3379     $self->{nc}
3380     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3381     } else {
3382     $self->{set_nc}->($self);
3383     }
3384    
3385     redo A;
3386     }
3387     } elsif ($self->{state} == PUBLIC_STATE) {
3388     ## ASCII case-insensitive
3389     if ($self->{nc} == [
3390     undef,
3391     0x0055, # U
3392     0x0042, # B
3393     0x004C, # L
3394     0x0049, # I
3395 wakaba 1.12 ]->[length $self->{kwd}] or
3396 wakaba 1.1 $self->{nc} == [
3397     undef,
3398     0x0075, # u
3399     0x0062, # b
3400     0x006C, # l
3401     0x0069, # i
3402 wakaba 1.12 ]->[length $self->{kwd}]) {
3403 wakaba 1.1
3404     ## Stay in the state.
3405 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3406 wakaba 1.1
3407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3408     $self->{line_prev} = $self->{line};
3409     $self->{column_prev} = $self->{column};
3410     $self->{column}++;
3411     $self->{nc}
3412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3413     } else {
3414     $self->{set_nc}->($self);
3415     }
3416    
3417     redo A;
3418 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3419 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3420     $self->{nc} == 0x0063)) { # c
3421 wakaba 1.12 if ($self->{is_xml} and
3422     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3423    
3424     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3425     text => 'PUBLIC',
3426     line => $self->{line_prev},
3427     column => $self->{column_prev} - 4);
3428     } else {
3429    
3430     }
3431 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3432    
3433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3434     $self->{line_prev} = $self->{line};
3435     $self->{column_prev} = $self->{column};
3436     $self->{column}++;
3437     $self->{nc}
3438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3439     } else {
3440     $self->{set_nc}->($self);
3441     }
3442    
3443     redo A;
3444     } else {
3445 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3446 wakaba 1.1 line => $self->{line_prev},
3447 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3448 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3449    
3450     $self->{ct}->{quirks} = 1;
3451     $self->{state} = BOGUS_DOCTYPE_STATE;
3452     } else {
3453    
3454     $self->{state} = BOGUS_MD_STATE;
3455     }
3456 wakaba 1.1 ## Reconsume.
3457     redo A;
3458     }
3459     } elsif ($self->{state} == SYSTEM_STATE) {
3460     ## ASCII case-insensitive
3461     if ($self->{nc} == [
3462     undef,
3463     0x0059, # Y
3464     0x0053, # S
3465     0x0054, # T
3466     0x0045, # E
3467 wakaba 1.12 ]->[length $self->{kwd}] or
3468 wakaba 1.1 $self->{nc} == [
3469     undef,
3470     0x0079, # y
3471     0x0073, # s
3472     0x0074, # t
3473     0x0065, # e
3474 wakaba 1.12 ]->[length $self->{kwd}]) {
3475 wakaba 1.1
3476     ## Stay in the state.
3477 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3478 wakaba 1.1
3479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3480     $self->{line_prev} = $self->{line};
3481     $self->{column_prev} = $self->{column};
3482     $self->{column}++;
3483     $self->{nc}
3484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3485     } else {
3486     $self->{set_nc}->($self);
3487     }
3488    
3489     redo A;
3490 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3491 wakaba 1.1 ($self->{nc} == 0x004D or # M
3492     $self->{nc} == 0x006D)) { # m
3493 wakaba 1.12 if ($self->{is_xml} and
3494     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3495    
3496     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3497     text => 'SYSTEM',
3498     line => $self->{line_prev},
3499     column => $self->{column_prev} - 4);
3500     } else {
3501    
3502     }
3503 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3504    
3505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506     $self->{line_prev} = $self->{line};
3507     $self->{column_prev} = $self->{column};
3508     $self->{column}++;
3509     $self->{nc}
3510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511     } else {
3512     $self->{set_nc}->($self);
3513     }
3514    
3515     redo A;
3516     } else {
3517 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3518 wakaba 1.1 line => $self->{line_prev},
3519 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3520 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3521    
3522     $self->{ct}->{quirks} = 1;
3523     $self->{state} = BOGUS_DOCTYPE_STATE;
3524     } else {
3525    
3526     $self->{state} = BOGUS_MD_STATE;
3527     }
3528 wakaba 1.1 ## Reconsume.
3529     redo A;
3530     }
3531     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3532     if ($is_space->{$self->{nc}}) {
3533    
3534     ## Stay in the state
3535    
3536     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3537     $self->{line_prev} = $self->{line};
3538     $self->{column_prev} = $self->{column};
3539     $self->{column}++;
3540     $self->{nc}
3541     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3542     } else {
3543     $self->{set_nc}->($self);
3544     }
3545    
3546     redo A;
3547     } elsif ($self->{nc} eq 0x0022) { # "
3548    
3549     $self->{ct}->{pubid} = ''; # DOCTYPE
3550     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3551    
3552     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3553     $self->{line_prev} = $self->{line};
3554     $self->{column_prev} = $self->{column};
3555     $self->{column}++;
3556     $self->{nc}
3557     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3558     } else {
3559     $self->{set_nc}->($self);
3560     }
3561    
3562     redo A;
3563     } elsif ($self->{nc} eq 0x0027) { # '
3564    
3565     $self->{ct}->{pubid} = ''; # DOCTYPE
3566     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3567    
3568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3569     $self->{line_prev} = $self->{line};
3570     $self->{column_prev} = $self->{column};
3571     $self->{column}++;
3572     $self->{nc}
3573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3574     } else {
3575     $self->{set_nc}->($self);
3576     }
3577    
3578     redo A;
3579     } elsif ($self->{nc} eq 0x003E) { # >
3580 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3581    
3582     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3583    
3584     $self->{state} = DATA_STATE;
3585     $self->{s_kwd} = '';
3586     $self->{ct}->{quirks} = 1;
3587     } else {
3588    
3589     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3590     }
3591 wakaba 1.1
3592    
3593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3594     $self->{line_prev} = $self->{line};
3595     $self->{column_prev} = $self->{column};
3596     $self->{column}++;
3597     $self->{nc}
3598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3599     } else {
3600     $self->{set_nc}->($self);
3601     }
3602    
3603 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3604 wakaba 1.1 redo A;
3605     } elsif ($self->{nc} == -1) {
3606 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3607    
3608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3609     $self->{state} = DATA_STATE;
3610     $self->{s_kwd} = '';
3611     $self->{ct}->{quirks} = 1;
3612     } else {
3613    
3614     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3615     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3616     }
3617 wakaba 1.1
3618     ## reconsume
3619     return ($self->{ct}); # DOCTYPE
3620     redo A;
3621 wakaba 1.16 } elsif ($self->{is_xml} and
3622     $self->{ct}->{type} == DOCTYPE_TOKEN and
3623     $self->{nc} == 0x005B) { # [
3624 wakaba 1.12
3625     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3626     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3627     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3628 wakaba 1.13 $self->{in_subset} = 1;
3629 wakaba 1.12
3630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3631     $self->{line_prev} = $self->{line};
3632     $self->{column_prev} = $self->{column};
3633     $self->{column}++;
3634     $self->{nc}
3635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3636     } else {
3637     $self->{set_nc}->($self);
3638     }
3639    
3640 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3641 wakaba 1.12 redo A;
3642 wakaba 1.1 } else {
3643     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3644    
3645 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3646    
3647     $self->{ct}->{quirks} = 1;
3648     $self->{state} = BOGUS_DOCTYPE_STATE;
3649     } else {
3650    
3651     $self->{state} = BOGUS_MD_STATE;
3652     }
3653    
3654 wakaba 1.1
3655     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3656     $self->{line_prev} = $self->{line};
3657     $self->{column_prev} = $self->{column};
3658     $self->{column}++;
3659     $self->{nc}
3660     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3661     } else {
3662     $self->{set_nc}->($self);
3663     }
3664    
3665     redo A;
3666     }
3667     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3668     if ($self->{nc} == 0x0022) { # "
3669    
3670     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3671    
3672     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3673     $self->{line_prev} = $self->{line};
3674     $self->{column_prev} = $self->{column};
3675     $self->{column}++;
3676     $self->{nc}
3677     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3678     } else {
3679     $self->{set_nc}->($self);
3680     }
3681    
3682     redo A;
3683     } elsif ($self->{nc} == 0x003E) { # >
3684     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3685    
3686 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3687    
3688     $self->{state} = DATA_STATE;
3689     $self->{s_kwd} = '';
3690     $self->{ct}->{quirks} = 1;
3691     } else {
3692    
3693     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3694     }
3695    
3696 wakaba 1.1
3697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3698     $self->{line_prev} = $self->{line};
3699     $self->{column_prev} = $self->{column};
3700     $self->{column}++;
3701     $self->{nc}
3702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3703     } else {
3704     $self->{set_nc}->($self);
3705     }
3706    
3707 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3708 wakaba 1.1 redo A;
3709     } elsif ($self->{nc} == -1) {
3710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3711    
3712 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3713    
3714     $self->{state} = DATA_STATE;
3715     $self->{s_kwd} = '';
3716     $self->{ct}->{quirks} = 1;
3717     } else {
3718    
3719     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3720     }
3721    
3722     ## Reconsume.
3723 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3724     redo A;
3725     } else {
3726    
3727 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3728 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3729     length $self->{ct}->{pubid});
3730    
3731     ## Stay in the state
3732    
3733     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3734     $self->{line_prev} = $self->{line};
3735     $self->{column_prev} = $self->{column};
3736     $self->{column}++;
3737     $self->{nc}
3738     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3739     } else {
3740     $self->{set_nc}->($self);
3741     }
3742    
3743     redo A;
3744     }
3745     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3746     if ($self->{nc} == 0x0027) { # '
3747    
3748     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3749    
3750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3751     $self->{line_prev} = $self->{line};
3752     $self->{column_prev} = $self->{column};
3753     $self->{column}++;
3754     $self->{nc}
3755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3756     } else {
3757     $self->{set_nc}->($self);
3758     }
3759    
3760     redo A;
3761     } elsif ($self->{nc} == 0x003E) { # >
3762     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3763    
3764 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3765    
3766     $self->{state} = DATA_STATE;
3767     $self->{s_kwd} = '';
3768     $self->{ct}->{quirks} = 1;
3769     } else {
3770    
3771     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3772     }
3773    
3774 wakaba 1.1
3775     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3776     $self->{line_prev} = $self->{line};
3777     $self->{column_prev} = $self->{column};
3778     $self->{column}++;
3779     $self->{nc}
3780     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3781     } else {
3782     $self->{set_nc}->($self);
3783     }
3784    
3785 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3786 wakaba 1.1 redo A;
3787     } elsif ($self->{nc} == -1) {
3788     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3789    
3790 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3791    
3792     $self->{state} = DATA_STATE;
3793     $self->{s_kwd} = '';
3794     $self->{ct}->{quirks} = 1;
3795     } else {
3796    
3797     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3798     }
3799    
3800 wakaba 1.1 ## reconsume
3801 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3802 wakaba 1.1 redo A;
3803     } else {
3804    
3805 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3806 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3807     length $self->{ct}->{pubid});
3808    
3809     ## Stay in the state
3810    
3811     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3812     $self->{line_prev} = $self->{line};
3813     $self->{column_prev} = $self->{column};
3814     $self->{column}++;
3815     $self->{nc}
3816     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3817     } else {
3818     $self->{set_nc}->($self);
3819     }
3820    
3821     redo A;
3822     }
3823     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3824     if ($is_space->{$self->{nc}}) {
3825    
3826     ## Stay in the state
3827    
3828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3829     $self->{line_prev} = $self->{line};
3830     $self->{column_prev} = $self->{column};
3831     $self->{column}++;
3832     $self->{nc}
3833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3834     } else {
3835     $self->{set_nc}->($self);
3836     }
3837    
3838     redo A;
3839     } elsif ($self->{nc} == 0x0022) { # "
3840    
3841 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3842 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3843    
3844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3845     $self->{line_prev} = $self->{line};
3846     $self->{column_prev} = $self->{column};
3847     $self->{column}++;
3848     $self->{nc}
3849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3850     } else {
3851     $self->{set_nc}->($self);
3852     }
3853    
3854     redo A;
3855     } elsif ($self->{nc} == 0x0027) { # '
3856    
3857 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3858 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3859    
3860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3861     $self->{line_prev} = $self->{line};
3862     $self->{column_prev} = $self->{column};
3863     $self->{column}++;
3864     $self->{nc}
3865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3866     } else {
3867     $self->{set_nc}->($self);
3868     }
3869    
3870     redo A;
3871     } elsif ($self->{nc} == 0x003E) { # >
3872 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3873     if ($self->{is_xml}) {
3874    
3875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3876     } else {
3877    
3878     }
3879     $self->{state} = DATA_STATE;
3880     $self->{s_kwd} = '';
3881 wakaba 1.12 } else {
3882 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3883    
3884     } else {
3885    
3886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3887     }
3888     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3889 wakaba 1.12 }
3890 wakaba 1.16
3891 wakaba 1.1
3892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3893     $self->{line_prev} = $self->{line};
3894     $self->{column_prev} = $self->{column};
3895     $self->{column}++;
3896     $self->{nc}
3897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3898     } else {
3899     $self->{set_nc}->($self);
3900     }
3901    
3902 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3903 wakaba 1.1 redo A;
3904     } elsif ($self->{nc} == -1) {
3905 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3906    
3907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3908    
3909     $self->{state} = DATA_STATE;
3910     $self->{s_kwd} = '';
3911     $self->{ct}->{quirks} = 1;
3912     } else {
3913     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3914     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3915     }
3916 wakaba 1.1
3917     ## reconsume
3918 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3919 wakaba 1.1 redo A;
3920 wakaba 1.16 } elsif ($self->{is_xml} and
3921     $self->{ct}->{type} == DOCTYPE_TOKEN and
3922     $self->{nc} == 0x005B) { # [
3923 wakaba 1.12
3924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3925     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3926     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3927 wakaba 1.13 $self->{in_subset} = 1;
3928 wakaba 1.12
3929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3930     $self->{line_prev} = $self->{line};
3931     $self->{column_prev} = $self->{column};
3932     $self->{column}++;
3933     $self->{nc}
3934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3935     } else {
3936     $self->{set_nc}->($self);
3937     }
3938    
3939 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3940 wakaba 1.12 redo A;
3941 wakaba 1.1 } else {
3942     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3943    
3944 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3945    
3946     $self->{ct}->{quirks} = 1;
3947     $self->{state} = BOGUS_DOCTYPE_STATE;
3948     } else {
3949    
3950     $self->{state} = BOGUS_MD_STATE;
3951     }
3952    
3953 wakaba 1.1
3954     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3955     $self->{line_prev} = $self->{line};
3956     $self->{column_prev} = $self->{column};
3957     $self->{column}++;
3958     $self->{nc}
3959     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3960     } else {
3961     $self->{set_nc}->($self);
3962     }
3963    
3964     redo A;
3965     }
3966     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3967     if ($is_space->{$self->{nc}}) {
3968    
3969     ## Stay in the state
3970    
3971     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3972     $self->{line_prev} = $self->{line};
3973     $self->{column_prev} = $self->{column};
3974     $self->{column}++;
3975     $self->{nc}
3976     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3977     } else {
3978     $self->{set_nc}->($self);
3979     }
3980    
3981     redo A;
3982     } elsif ($self->{nc} == 0x0022) { # "
3983    
3984     $self->{ct}->{sysid} = ''; # DOCTYPE
3985     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3986    
3987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3988     $self->{line_prev} = $self->{line};
3989     $self->{column_prev} = $self->{column};
3990     $self->{column}++;
3991     $self->{nc}
3992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3993     } else {
3994     $self->{set_nc}->($self);
3995     }
3996    
3997     redo A;
3998     } elsif ($self->{nc} == 0x0027) { # '
3999    
4000     $self->{ct}->{sysid} = ''; # DOCTYPE
4001     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4002    
4003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4004     $self->{line_prev} = $self->{line};
4005     $self->{column_prev} = $self->{column};
4006     $self->{column}++;
4007     $self->{nc}
4008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4009     } else {
4010     $self->{set_nc}->($self);
4011     }
4012    
4013     redo A;
4014     } elsif ($self->{nc} == 0x003E) { # >
4015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4016    
4017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4018     $self->{line_prev} = $self->{line};
4019     $self->{column_prev} = $self->{column};
4020     $self->{column}++;
4021     $self->{nc}
4022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4023     } else {
4024     $self->{set_nc}->($self);
4025     }
4026    
4027    
4028 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4029    
4030     $self->{state} = DATA_STATE;
4031     $self->{s_kwd} = '';
4032     $self->{ct}->{quirks} = 1;
4033     } else {
4034    
4035     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4036     }
4037 wakaba 1.1
4038 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4039 wakaba 1.1 redo A;
4040     } elsif ($self->{nc} == -1) {
4041 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4042    
4043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4044     $self->{state} = DATA_STATE;
4045     $self->{s_kwd} = '';
4046     $self->{ct}->{quirks} = 1;
4047     } else {
4048    
4049     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4050     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4051     }
4052 wakaba 1.1
4053     ## reconsume
4054 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4055 wakaba 1.1 redo A;
4056 wakaba 1.16 } elsif ($self->{is_xml} and
4057     $self->{ct}->{type} == DOCTYPE_TOKEN and
4058     $self->{nc} == 0x005B) { # [
4059 wakaba 1.12
4060     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4061    
4062     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4063     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4064 wakaba 1.13 $self->{in_subset} = 1;
4065 wakaba 1.12
4066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4067     $self->{line_prev} = $self->{line};
4068     $self->{column_prev} = $self->{column};
4069     $self->{column}++;
4070     $self->{nc}
4071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4072     } else {
4073     $self->{set_nc}->($self);
4074     }
4075    
4076 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4077 wakaba 1.12 redo A;
4078 wakaba 1.1 } else {
4079     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4080    
4081 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4082    
4083     $self->{ct}->{quirks} = 1;
4084     $self->{state} = BOGUS_DOCTYPE_STATE;
4085     } else {
4086    
4087     $self->{state} = BOGUS_MD_STATE;
4088     }
4089    
4090 wakaba 1.1
4091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4092     $self->{line_prev} = $self->{line};
4093     $self->{column_prev} = $self->{column};
4094     $self->{column}++;
4095     $self->{nc}
4096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4097     } else {
4098     $self->{set_nc}->($self);
4099     }
4100    
4101     redo A;
4102     }
4103     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4104     if ($self->{nc} == 0x0022) { # "
4105    
4106     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4107    
4108     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4109     $self->{line_prev} = $self->{line};
4110     $self->{column_prev} = $self->{column};
4111     $self->{column}++;
4112     $self->{nc}
4113     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4114     } else {
4115     $self->{set_nc}->($self);
4116     }
4117    
4118     redo A;
4119 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4120 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4121    
4122 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4123    
4124     $self->{state} = DATA_STATE;
4125     $self->{s_kwd} = '';
4126     $self->{ct}->{quirks} = 1;
4127     } else {
4128    
4129     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4130     }
4131    
4132 wakaba 1.1
4133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4134     $self->{line_prev} = $self->{line};
4135     $self->{column_prev} = $self->{column};
4136     $self->{column}++;
4137     $self->{nc}
4138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4139     } else {
4140     $self->{set_nc}->($self);
4141     }
4142    
4143 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4144 wakaba 1.1 redo A;
4145     } elsif ($self->{nc} == -1) {
4146     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4147    
4148 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4149    
4150     $self->{state} = DATA_STATE;
4151     $self->{s_kwd} = '';
4152     $self->{ct}->{quirks} = 1;
4153     } else {
4154    
4155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4156     }
4157    
4158 wakaba 1.1 ## reconsume
4159 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4160 wakaba 1.1 redo A;
4161     } else {
4162    
4163 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4164 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4165     length $self->{ct}->{sysid});
4166    
4167     ## Stay in the state
4168    
4169     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4170     $self->{line_prev} = $self->{line};
4171     $self->{column_prev} = $self->{column};
4172     $self->{column}++;
4173     $self->{nc}
4174     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4175     } else {
4176     $self->{set_nc}->($self);
4177     }
4178    
4179     redo A;
4180     }
4181     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4182     if ($self->{nc} == 0x0027) { # '
4183    
4184     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4185    
4186     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4187     $self->{line_prev} = $self->{line};
4188     $self->{column_prev} = $self->{column};
4189     $self->{column}++;
4190     $self->{nc}
4191     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4192     } else {
4193     $self->{set_nc}->($self);
4194     }
4195    
4196     redo A;
4197 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4198 wakaba 1.1
4199     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4200    
4201     $self->{state} = DATA_STATE;
4202 wakaba 1.5 $self->{s_kwd} = '';
4203 wakaba 1.1
4204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4205     $self->{line_prev} = $self->{line};
4206     $self->{column_prev} = $self->{column};
4207     $self->{column}++;
4208     $self->{nc}
4209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4210     } else {
4211     $self->{set_nc}->($self);
4212     }
4213    
4214    
4215     $self->{ct}->{quirks} = 1;
4216     return ($self->{ct}); # DOCTYPE
4217    
4218     redo A;
4219     } elsif ($self->{nc} == -1) {
4220     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4221    
4222 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4223    
4224     $self->{state} = DATA_STATE;
4225     $self->{s_kwd} = '';
4226     $self->{ct}->{quirks} = 1;
4227     } else {
4228    
4229     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4230     }
4231    
4232 wakaba 1.1 ## reconsume
4233 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4234 wakaba 1.1 redo A;
4235     } else {
4236    
4237 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4238 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4239     length $self->{ct}->{sysid});
4240    
4241     ## Stay in the state
4242    
4243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4244     $self->{line_prev} = $self->{line};
4245     $self->{column_prev} = $self->{column};
4246     $self->{column}++;
4247     $self->{nc}
4248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4249     } else {
4250     $self->{set_nc}->($self);
4251     }
4252    
4253     redo A;
4254     }
4255     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4256     if ($is_space->{$self->{nc}}) {
4257 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4258    
4259     $self->{state} = BEFORE_NDATA_STATE;
4260     } else {
4261    
4262     ## Stay in the state
4263     }
4264 wakaba 1.1
4265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4266     $self->{line_prev} = $self->{line};
4267     $self->{column_prev} = $self->{column};
4268     $self->{column}++;
4269     $self->{nc}
4270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4271     } else {
4272     $self->{set_nc}->($self);
4273     }
4274    
4275     redo A;
4276     } elsif ($self->{nc} == 0x003E) { # >
4277 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4278    
4279     $self->{state} = DATA_STATE;
4280     $self->{s_kwd} = '';
4281     } else {
4282    
4283     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4284     }
4285    
4286 wakaba 1.1
4287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4288     $self->{line_prev} = $self->{line};
4289     $self->{column_prev} = $self->{column};
4290     $self->{column}++;
4291     $self->{nc}
4292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4293     } else {
4294     $self->{set_nc}->($self);
4295     }
4296    
4297 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4298 wakaba 1.1 redo A;
4299 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4300     ($self->{nc} == 0x004E or # N
4301     $self->{nc} == 0x006E)) { # n
4302    
4303     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4304     $self->{state} = NDATA_STATE;
4305     $self->{kwd} = chr $self->{nc};
4306    
4307     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4308     $self->{line_prev} = $self->{line};
4309     $self->{column_prev} = $self->{column};
4310     $self->{column}++;
4311     $self->{nc}
4312     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4313     } else {
4314     $self->{set_nc}->($self);
4315     }
4316    
4317     redo A;
4318 wakaba 1.1 } elsif ($self->{nc} == -1) {
4319 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4320    
4321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4322     $self->{state} = DATA_STATE;
4323     $self->{s_kwd} = '';
4324     $self->{ct}->{quirks} = 1;
4325     } else {
4326    
4327     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4328     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4329     }
4330    
4331 wakaba 1.1 ## reconsume
4332 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4333 wakaba 1.1 redo A;
4334 wakaba 1.16 } elsif ($self->{is_xml} and
4335     $self->{ct}->{type} == DOCTYPE_TOKEN and
4336     $self->{nc} == 0x005B) { # [
4337 wakaba 1.12
4338     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4339     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4340 wakaba 1.13 $self->{in_subset} = 1;
4341 wakaba 1.12
4342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4343     $self->{line_prev} = $self->{line};
4344     $self->{column_prev} = $self->{column};
4345     $self->{column}++;
4346     $self->{nc}
4347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4348     } else {
4349     $self->{set_nc}->($self);
4350     }
4351    
4352 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4353 wakaba 1.12 redo A;
4354 wakaba 1.1 } else {
4355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4356    
4357 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4358    
4359     #$self->{ct}->{quirks} = 1;
4360     $self->{state} = BOGUS_DOCTYPE_STATE;
4361     } else {
4362    
4363     $self->{state} = BOGUS_MD_STATE;
4364     }
4365    
4366 wakaba 1.1
4367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4368     $self->{line_prev} = $self->{line};
4369     $self->{column_prev} = $self->{column};
4370     $self->{column}++;
4371     $self->{nc}
4372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4373     } else {
4374     $self->{set_nc}->($self);
4375     }
4376    
4377     redo A;
4378     }
4379 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4380     if ($is_space->{$self->{nc}}) {
4381    
4382     ## Stay in the state.
4383    
4384     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4385     $self->{line_prev} = $self->{line};
4386     $self->{column_prev} = $self->{column};
4387     $self->{column}++;
4388     $self->{nc}
4389     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4390     } else {
4391     $self->{set_nc}->($self);
4392     }
4393    
4394     redo A;
4395     } elsif ($self->{nc} == 0x003E) { # >
4396    
4397     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4398    
4399     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4400     $self->{line_prev} = $self->{line};
4401     $self->{column_prev} = $self->{column};
4402     $self->{column}++;
4403     $self->{nc}
4404     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4405     } else {
4406     $self->{set_nc}->($self);
4407     }
4408    
4409     return ($self->{ct}); # ENTITY
4410     redo A;
4411     } elsif ($self->{nc} == 0x004E or # N
4412     $self->{nc} == 0x006E) { # n
4413    
4414     $self->{state} = NDATA_STATE;
4415     $self->{kwd} = chr $self->{nc};
4416    
4417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4418     $self->{line_prev} = $self->{line};
4419     $self->{column_prev} = $self->{column};
4420     $self->{column}++;
4421     $self->{nc}
4422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4423     } else {
4424     $self->{set_nc}->($self);
4425     }
4426    
4427     redo A;
4428     } elsif ($self->{nc} == -1) {
4429    
4430     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4431     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4432     ## reconsume
4433     return ($self->{ct}); # ENTITY
4434     redo A;
4435     } else {
4436    
4437     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4438     $self->{state} = BOGUS_MD_STATE;
4439    
4440     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4441     $self->{line_prev} = $self->{line};
4442     $self->{column_prev} = $self->{column};
4443     $self->{column}++;
4444     $self->{nc}
4445     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4446     } else {
4447     $self->{set_nc}->($self);
4448     }
4449    
4450     redo A;
4451     }
4452 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4453     if ($self->{nc} == 0x003E) { # >
4454    
4455     $self->{state} = DATA_STATE;
4456 wakaba 1.5 $self->{s_kwd} = '';
4457 wakaba 1.1
4458     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4459     $self->{line_prev} = $self->{line};
4460     $self->{column_prev} = $self->{column};
4461     $self->{column}++;
4462     $self->{nc}
4463     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4464     } else {
4465     $self->{set_nc}->($self);
4466     }
4467    
4468    
4469     return ($self->{ct}); # DOCTYPE
4470    
4471     redo A;
4472 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4473 wakaba 1.13
4474     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4475     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4476     $self->{in_subset} = 1;
4477    
4478 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4479     $self->{line_prev} = $self->{line};
4480     $self->{column_prev} = $self->{column};
4481     $self->{column}++;
4482     $self->{nc}
4483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4484     } else {
4485     $self->{set_nc}->($self);
4486     }
4487    
4488 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4489     redo A;
4490 wakaba 1.1 } elsif ($self->{nc} == -1) {
4491    
4492     $self->{state} = DATA_STATE;
4493 wakaba 1.5 $self->{s_kwd} = '';
4494 wakaba 1.1 ## reconsume
4495    
4496     return ($self->{ct}); # DOCTYPE
4497    
4498     redo A;
4499     } else {
4500    
4501     my $s = '';
4502 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4503 wakaba 1.1
4504     ## Stay in the state
4505    
4506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4507     $self->{line_prev} = $self->{line};
4508     $self->{column_prev} = $self->{column};
4509     $self->{column}++;
4510     $self->{nc}
4511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4512     } else {
4513     $self->{set_nc}->($self);
4514     }
4515    
4516     redo A;
4517     }
4518     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4519     ## NOTE: "CDATA section state" in the spec is jointly implemented
4520     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4521     ## and |CDATA_SECTION_MSE2_STATE|.
4522 wakaba 1.10
4523     ## XML5: "CDATA state".
4524 wakaba 1.1
4525     if ($self->{nc} == 0x005D) { # ]
4526    
4527     $self->{state} = CDATA_SECTION_MSE1_STATE;
4528    
4529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4530     $self->{line_prev} = $self->{line};
4531     $self->{column_prev} = $self->{column};
4532     $self->{column}++;
4533     $self->{nc}
4534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4535     } else {
4536     $self->{set_nc}->($self);
4537     }
4538    
4539     redo A;
4540     } elsif ($self->{nc} == -1) {
4541 wakaba 1.6 if ($self->{is_xml}) {
4542 wakaba 1.8
4543 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4544 wakaba 1.8 } else {
4545    
4546 wakaba 1.6 }
4547    
4548 wakaba 1.1 $self->{state} = DATA_STATE;
4549 wakaba 1.5 $self->{s_kwd} = '';
4550 wakaba 1.10 ## Reconsume.
4551 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4552    
4553     return ($self->{ct}); # character
4554     } else {
4555    
4556     ## No token to emit. $self->{ct} is discarded.
4557     }
4558     redo A;
4559     } else {
4560    
4561     $self->{ct}->{data} .= chr $self->{nc};
4562     $self->{read_until}->($self->{ct}->{data},
4563     q<]>,
4564     length $self->{ct}->{data});
4565    
4566     ## Stay in the state.
4567    
4568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4569     $self->{line_prev} = $self->{line};
4570     $self->{column_prev} = $self->{column};
4571     $self->{column}++;
4572     $self->{nc}
4573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4574     } else {
4575     $self->{set_nc}->($self);
4576     }
4577    
4578     redo A;
4579     }
4580    
4581     ## ISSUE: "text tokens" in spec.
4582     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4583 wakaba 1.10 ## XML5: "CDATA bracket state".
4584    
4585 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4586    
4587     $self->{state} = CDATA_SECTION_MSE2_STATE;
4588    
4589     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4590     $self->{line_prev} = $self->{line};
4591     $self->{column_prev} = $self->{column};
4592     $self->{column}++;
4593     $self->{nc}
4594     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4595     } else {
4596     $self->{set_nc}->($self);
4597     }
4598    
4599     redo A;
4600     } else {
4601    
4602 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4603 wakaba 1.1 $self->{ct}->{data} .= ']';
4604 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4605 wakaba 1.1 ## Reconsume.
4606     redo A;
4607     }
4608     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4609 wakaba 1.10 ## XML5: "CDATA end state".
4610    
4611 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4612     $self->{state} = DATA_STATE;
4613 wakaba 1.5 $self->{s_kwd} = '';
4614 wakaba 1.1
4615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4616     $self->{line_prev} = $self->{line};
4617     $self->{column_prev} = $self->{column};
4618     $self->{column}++;
4619     $self->{nc}
4620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4621     } else {
4622     $self->{set_nc}->($self);
4623     }
4624    
4625     if (length $self->{ct}->{data}) { # character
4626    
4627     return ($self->{ct}); # character
4628     } else {
4629    
4630     ## No token to emit. $self->{ct} is discarded.
4631     }
4632     redo A;
4633     } elsif ($self->{nc} == 0x005D) { # ]
4634     # character
4635     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4636     ## Stay in the state.
4637    
4638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4639     $self->{line_prev} = $self->{line};
4640     $self->{column_prev} = $self->{column};
4641     $self->{column}++;
4642     $self->{nc}
4643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4644     } else {
4645     $self->{set_nc}->($self);
4646     }
4647    
4648     redo A;
4649     } else {
4650    
4651     $self->{ct}->{data} .= ']]'; # character
4652     $self->{state} = CDATA_SECTION_STATE;
4653 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4654 wakaba 1.1 redo A;
4655     }
4656     } elsif ($self->{state} == ENTITY_STATE) {
4657     if ($is_space->{$self->{nc}} or
4658     {
4659     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4660     $self->{entity_add} => 1,
4661     }->{$self->{nc}}) {
4662 wakaba 1.22 if ($self->{is_xml}) {
4663    
4664     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4665     line => $self->{line_prev},
4666     column => $self->{column_prev}
4667     + ($self->{nc} == -1 ? 1 : 0));
4668     } else {
4669    
4670     ## No error
4671     }
4672 wakaba 1.1 ## Don't consume
4673     ## Return nothing.
4674     #
4675     } elsif ($self->{nc} == 0x0023) { # #
4676    
4677     $self->{state} = ENTITY_HASH_STATE;
4678 wakaba 1.12 $self->{kwd} = '#';
4679 wakaba 1.1
4680     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4681     $self->{line_prev} = $self->{line};
4682     $self->{column_prev} = $self->{column};
4683     $self->{column}++;
4684     $self->{nc}
4685     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4686     } else {
4687     $self->{set_nc}->($self);
4688     }
4689    
4690     redo A;
4691 wakaba 1.22 } elsif ($self->{is_xml} or
4692     (0x0041 <= $self->{nc} and
4693 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4694     (0x0061 <= $self->{nc} and
4695     $self->{nc} <= 0x007A)) { # a..z
4696    
4697     require Whatpm::_NamedEntityList;
4698     $self->{state} = ENTITY_NAME_STATE;
4699 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4700     $self->{entity__value} = $self->{kwd};
4701 wakaba 1.1 $self->{entity__match} = 0;
4702    
4703     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4704     $self->{line_prev} = $self->{line};
4705     $self->{column_prev} = $self->{column};
4706     $self->{column}++;
4707     $self->{nc}
4708     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4709     } else {
4710     $self->{set_nc}->($self);
4711     }
4712    
4713     redo A;
4714     } else {
4715    
4716     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4717     ## Return nothing.
4718     #
4719     }
4720    
4721     ## NOTE: No character is consumed by the "consume a character
4722     ## reference" algorithm. In other words, there is an "&" character
4723     ## that does not introduce a character reference, which would be
4724     ## appended to the parent element or the attribute value in later
4725     ## process of the tokenizer.
4726    
4727     if ($self->{prev_state} == DATA_STATE) {
4728    
4729     $self->{state} = $self->{prev_state};
4730 wakaba 1.5 $self->{s_kwd} = '';
4731 wakaba 1.1 ## Reconsume.
4732     return ({type => CHARACTER_TOKEN, data => '&',
4733     line => $self->{line_prev},
4734     column => $self->{column_prev},
4735     });
4736     redo A;
4737     } else {
4738    
4739     $self->{ca}->{value} .= '&';
4740     $self->{state} = $self->{prev_state};
4741 wakaba 1.5 $self->{s_kwd} = '';
4742 wakaba 1.1 ## Reconsume.
4743     redo A;
4744     }
4745     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4746 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4747 wakaba 1.1
4748     $self->{state} = HEXREF_X_STATE;
4749 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4750 wakaba 1.1
4751     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4752     $self->{line_prev} = $self->{line};
4753     $self->{column_prev} = $self->{column};
4754     $self->{column}++;
4755     $self->{nc}
4756     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4757     } else {
4758     $self->{set_nc}->($self);
4759     }
4760    
4761     redo A;
4762 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4763    
4764     if ($self->{is_xml}) {
4765     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4766     }
4767     $self->{state} = HEXREF_X_STATE;
4768     $self->{kwd} .= chr $self->{nc};
4769    
4770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4771     $self->{line_prev} = $self->{line};
4772     $self->{column_prev} = $self->{column};
4773     $self->{column}++;
4774     $self->{nc}
4775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4776     } else {
4777     $self->{set_nc}->($self);
4778     }
4779    
4780     redo A;
4781 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4782     $self->{nc} <= 0x0039) { # 0..9
4783    
4784     $self->{state} = NCR_NUM_STATE;
4785 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4786 wakaba 1.1
4787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4788     $self->{line_prev} = $self->{line};
4789     $self->{column_prev} = $self->{column};
4790     $self->{column}++;
4791     $self->{nc}
4792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4793     } else {
4794     $self->{set_nc}->($self);
4795     }
4796    
4797     redo A;
4798     } else {
4799     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4800     line => $self->{line_prev},
4801     column => $self->{column_prev} - 1);
4802    
4803     ## NOTE: According to the spec algorithm, nothing is returned,
4804     ## and then "&#" is appended to the parent element or the attribute
4805     ## value in the later processing.
4806    
4807     if ($self->{prev_state} == DATA_STATE) {
4808    
4809     $self->{state} = $self->{prev_state};
4810 wakaba 1.5 $self->{s_kwd} = '';
4811 wakaba 1.1 ## Reconsume.
4812     return ({type => CHARACTER_TOKEN,
4813     data => '&#',
4814     line => $self->{line_prev},
4815     column => $self->{column_prev} - 1,
4816     });
4817     redo A;
4818     } else {
4819    
4820     $self->{ca}->{value} .= '&#';
4821     $self->{state} = $self->{prev_state};
4822 wakaba 1.5 $self->{s_kwd} = '';
4823 wakaba 1.1 ## Reconsume.
4824     redo A;
4825     }
4826     }
4827     } elsif ($self->{state} == NCR_NUM_STATE) {
4828     if (0x0030 <= $self->{nc} and
4829     $self->{nc} <= 0x0039) { # 0..9
4830    
4831 wakaba 1.12 $self->{kwd} *= 10;
4832     $self->{kwd} += $self->{nc} - 0x0030;
4833 wakaba 1.1
4834     ## Stay in the state.
4835    
4836     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4837     $self->{line_prev} = $self->{line};
4838     $self->{column_prev} = $self->{column};
4839     $self->{column}++;
4840     $self->{nc}
4841     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4842     } else {
4843     $self->{set_nc}->($self);
4844     }
4845    
4846     redo A;
4847     } elsif ($self->{nc} == 0x003B) { # ;
4848    
4849    
4850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4851     $self->{line_prev} = $self->{line};
4852     $self->{column_prev} = $self->{column};
4853     $self->{column}++;
4854     $self->{nc}
4855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4856     } else {
4857     $self->{set_nc}->($self);
4858     }
4859    
4860     #
4861     } else {
4862    
4863     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4864     ## Reconsume.
4865     #
4866     }
4867    
4868 wakaba 1.12 my $code = $self->{kwd};
4869 wakaba 1.1 my $l = $self->{line_prev};
4870     my $c = $self->{column_prev};
4871 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4872     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4873     ($self->{is_xml} and $code == 0x0000)) {
4874 wakaba 1.1
4875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4876     text => (sprintf 'U+%04X', $code),
4877     line => $l, column => $c);
4878     $code = $charref_map->{$code};
4879     } elsif ($code > 0x10FFFF) {
4880    
4881     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4882     text => (sprintf 'U-%08X', $code),
4883     line => $l, column => $c);
4884     $code = 0xFFFD;
4885     }
4886    
4887     if ($self->{prev_state} == DATA_STATE) {
4888    
4889     $self->{state} = $self->{prev_state};
4890 wakaba 1.5 $self->{s_kwd} = '';
4891 wakaba 1.1 ## Reconsume.
4892     return ({type => CHARACTER_TOKEN, data => chr $code,
4893 wakaba 1.7 has_reference => 1,
4894 wakaba 1.1 line => $l, column => $c,
4895     });
4896     redo A;
4897     } else {
4898    
4899     $self->{ca}->{value} .= chr $code;
4900     $self->{ca}->{has_reference} = 1;
4901     $self->{state} = $self->{prev_state};
4902 wakaba 1.5 $self->{s_kwd} = '';
4903 wakaba 1.1 ## Reconsume.
4904     redo A;
4905     }
4906     } elsif ($self->{state} == HEXREF_X_STATE) {
4907     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4908     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4909     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4910     # 0..9, A..F, a..f
4911    
4912     $self->{state} = HEXREF_HEX_STATE;
4913 wakaba 1.12 $self->{kwd} = 0;
4914 wakaba 1.1 ## Reconsume.
4915     redo A;
4916     } else {
4917     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4918     line => $self->{line_prev},
4919     column => $self->{column_prev} - 2);
4920    
4921     ## NOTE: According to the spec algorithm, nothing is returned,
4922     ## and then "&#" followed by "X" or "x" is appended to the parent
4923     ## element or the attribute value in the later processing.
4924    
4925     if ($self->{prev_state} == DATA_STATE) {
4926    
4927     $self->{state} = $self->{prev_state};
4928 wakaba 1.5 $self->{s_kwd} = '';
4929 wakaba 1.1 ## Reconsume.
4930     return ({type => CHARACTER_TOKEN,
4931 wakaba 1.12 data => '&' . $self->{kwd},
4932 wakaba 1.1 line => $self->{line_prev},
4933 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4934 wakaba 1.1 });
4935     redo A;
4936     } else {
4937    
4938 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4939 wakaba 1.1 $self->{state} = $self->{prev_state};
4940 wakaba 1.5 $self->{s_kwd} = '';
4941 wakaba 1.1 ## Reconsume.
4942     redo A;
4943     }
4944     }
4945     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4946     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4947     # 0..9
4948    
4949 wakaba 1.12 $self->{kwd} *= 0x10;
4950     $self->{kwd} += $self->{nc} - 0x0030;
4951 wakaba 1.1 ## Stay in the state.
4952    
4953     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4954     $self->{line_prev} = $self->{line};
4955     $self->{column_prev} = $self->{column};
4956     $self->{column}++;
4957     $self->{nc}
4958     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4959     } else {
4960     $self->{set_nc}->($self);
4961     }
4962    
4963     redo A;
4964     } elsif (0x0061 <= $self->{nc} and
4965     $self->{nc} <= 0x0066) { # a..f
4966    
4967 wakaba 1.12 $self->{kwd} *= 0x10;
4968     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4969 wakaba 1.1 ## Stay in the state.
4970    
4971     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4972     $self->{line_prev} = $self->{line};
4973     $self->{column_prev} = $self->{column};
4974     $self->{column}++;
4975     $self->{nc}
4976     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4977     } else {
4978     $self->{set_nc}->($self);
4979     }
4980    
4981     redo A;
4982     } elsif (0x0041 <= $self->{nc} and
4983     $self->{nc} <= 0x0046) { # A..F
4984    
4985 wakaba 1.12 $self->{kwd} *= 0x10;
4986     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4987 wakaba 1.1 ## Stay in the state.
4988    
4989     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4990     $self->{line_prev} = $self->{line};
4991     $self->{column_prev} = $self->{column};
4992     $self->{column}++;
4993     $self->{nc}
4994     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4995     } else {
4996     $self->{set_nc}->($self);
4997     }
4998    
4999     redo A;
5000     } elsif ($self->{nc} == 0x003B) { # ;
5001    
5002    
5003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5004     $self->{line_prev} = $self->{line};
5005     $self->{column_prev} = $self->{column};
5006     $self->{column}++;
5007     $self->{nc}
5008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5009     } else {
5010     $self->{set_nc}->($self);
5011     }
5012    
5013     #
5014     } else {
5015    
5016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5017     line => $self->{line},
5018     column => $self->{column});
5019     ## Reconsume.
5020     #
5021     }
5022    
5023 wakaba 1.12 my $code = $self->{kwd};
5024 wakaba 1.1 my $l = $self->{line_prev};
5025     my $c = $self->{column_prev};
5026 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5027     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5028     ($self->{is_xml} and $code == 0x0000)) {
5029 wakaba 1.1
5030     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5031     text => (sprintf 'U+%04X', $code),
5032     line => $l, column => $c);
5033     $code = $charref_map->{$code};
5034     } elsif ($code > 0x10FFFF) {
5035    
5036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5037     text => (sprintf 'U-%08X', $code),
5038     line => $l, column => $c);
5039     $code = 0xFFFD;
5040     }
5041    
5042     if ($self->{prev_state} == DATA_STATE) {
5043    
5044     $self->{state} = $self->{prev_state};
5045 wakaba 1.5 $self->{s_kwd} = '';
5046 wakaba 1.1 ## Reconsume.
5047     return ({type => CHARACTER_TOKEN, data => chr $code,
5048 wakaba 1.7 has_reference => 1,
5049 wakaba 1.1 line => $l, column => $c,
5050     });
5051     redo A;
5052     } else {
5053    
5054     $self->{ca}->{value} .= chr $code;
5055     $self->{ca}->{has_reference} = 1;
5056     $self->{state} = $self->{prev_state};
5057 wakaba 1.5 $self->{s_kwd} = '';
5058 wakaba 1.1 ## Reconsume.
5059     redo A;
5060     }
5061     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5062 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
5063     $self->{nc} <= 0x005A) or # x
5064     (0x0061 <= $self->{nc} and # a
5065     $self->{nc} <= 0x007A) or # z
5066     (0x0030 <= $self->{nc} and # 0
5067     $self->{nc} <= 0x0039) or # 9
5068 wakaba 1.22 $self->{nc} == 0x003B or # ;
5069     ($self->{is_xml} and
5070     not ($is_space->{$self->{nc}} or
5071     {
5072     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5073     $self->{entity_add} => 1,
5074     }->{$self->{nc}}))) {
5075 wakaba 1.1 our $EntityChar;
5076 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5077 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5078     $self->{ge}->{$self->{kwd}}) {
5079 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5080 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5081     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5082    
5083     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5084     } else {
5085     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5086    
5087     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5088     value => $self->{kwd});
5089     } else {
5090    
5091     }
5092     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5093     }
5094     } else {
5095     if ($self->{is_xml}) {
5096    
5097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5098     value => $self->{kwd},
5099     level => {
5100     'amp;' => $self->{level}->{warn},
5101     'quot;' => $self->{level}->{warn},
5102     'lt;' => $self->{level}->{warn},
5103     'gt;' => $self->{level}->{warn},
5104     'apos;' => $self->{level}->{warn},
5105     }->{$self->{kwd}} ||
5106     $self->{level}->{must});
5107     } else {
5108    
5109     }
5110     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5111     }
5112 wakaba 1.1 $self->{entity__match} = 1;
5113    
5114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5115     $self->{line_prev} = $self->{line};
5116     $self->{column_prev} = $self->{column};
5117     $self->{column}++;
5118     $self->{nc}
5119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5120     } else {
5121     $self->{set_nc}->($self);
5122     }
5123    
5124     #
5125     } else {
5126    
5127 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5128 wakaba 1.1 $self->{entity__match} = -1;
5129     ## Stay in the state.
5130    
5131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5132     $self->{line_prev} = $self->{line};
5133     $self->{column_prev} = $self->{column};
5134     $self->{column}++;
5135     $self->{nc}
5136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5137     } else {
5138     $self->{set_nc}->($self);
5139     }
5140    
5141     redo A;
5142     }
5143     } else {
5144    
5145     $self->{entity__value} .= chr $self->{nc};
5146     $self->{entity__match} *= 2;
5147     ## Stay in the state.
5148    
5149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5150     $self->{line_prev} = $self->{line};
5151     $self->{column_prev} = $self->{column};
5152     $self->{column}++;
5153     $self->{nc}
5154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5155     } else {
5156     $self->{set_nc}->($self);
5157     }
5158    
5159     redo A;
5160     }
5161     }
5162    
5163     my $data;
5164     my $has_ref;
5165     if ($self->{entity__match} > 0) {
5166    
5167     $data = $self->{entity__value};
5168     $has_ref = 1;
5169     #
5170     } elsif ($self->{entity__match} < 0) {
5171     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5172     if ($self->{prev_state} != DATA_STATE and # in attribute
5173     $self->{entity__match} < -1) {
5174    
5175 wakaba 1.12 $data = '&' . $self->{kwd};
5176 wakaba 1.1 #
5177     } else {
5178    
5179     $data = $self->{entity__value};
5180     $has_ref = 1;
5181     #
5182     }
5183     } else {
5184    
5185     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5186     line => $self->{line_prev},
5187 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5188     $data = '&' . $self->{kwd};
5189 wakaba 1.1 #
5190     }
5191    
5192     ## NOTE: In these cases, when a character reference is found,
5193     ## it is consumed and a character token is returned, or, otherwise,
5194     ## nothing is consumed and returned, according to the spec algorithm.
5195     ## In this implementation, anything that has been examined by the
5196     ## tokenizer is appended to the parent element or the attribute value
5197     ## as string, either literal string when no character reference or
5198     ## entity-replaced string otherwise, in this stage, since any characters
5199     ## that would not be consumed are appended in the data state or in an
5200     ## appropriate attribute value state anyway.
5201    
5202     if ($self->{prev_state} == DATA_STATE) {
5203    
5204     $self->{state} = $self->{prev_state};
5205 wakaba 1.5 $self->{s_kwd} = '';
5206 wakaba 1.1 ## Reconsume.
5207     return ({type => CHARACTER_TOKEN,
5208     data => $data,
5209 wakaba 1.7 has_reference => $has_ref,
5210 wakaba 1.1 line => $self->{line_prev},
5211 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5212 wakaba 1.1 });
5213     redo A;
5214     } else {
5215    
5216     $self->{ca}->{value} .= $data;
5217     $self->{ca}->{has_reference} = 1 if $has_ref;
5218     $self->{state} = $self->{prev_state};
5219 wakaba 1.5 $self->{s_kwd} = '';
5220 wakaba 1.1 ## Reconsume.
5221     redo A;
5222     }
5223 wakaba 1.8
5224     ## XML-only states
5225    
5226     } elsif ($self->{state} == PI_STATE) {
5227 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5228    
5229 wakaba 1.8 if ($is_space->{$self->{nc}} or
5230 wakaba 1.14 $self->{nc} == 0x003F or # ?
5231 wakaba 1.8 $self->{nc} == -1) {
5232 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5233     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5234     ## "DOCTYPE pi state": Parse error, switch to the "data
5235     ## state".
5236 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5237     line => $self->{line_prev},
5238     column => $self->{column_prev}
5239     - 1 * ($self->{nc} != -1));
5240     $self->{state} = BOGUS_COMMENT_STATE;
5241     ## Reconsume.
5242     $self->{ct} = {type => COMMENT_TOKEN,
5243     data => '?',
5244     line => $self->{line_prev},
5245     column => $self->{column_prev}
5246     - 1 * ($self->{nc} != -1),
5247     };
5248     redo A;
5249     } else {
5250 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5251 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5252     target => chr $self->{nc},
5253     data => '',
5254     line => $self->{line_prev},
5255     column => $self->{column_prev} - 1,
5256     };
5257     $self->{state} = PI_TARGET_STATE;
5258    
5259     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5260     $self->{line_prev} = $self->{line};
5261     $self->{column_prev} = $self->{column};
5262     $self->{column}++;
5263     $self->{nc}
5264     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5265     } else {
5266     $self->{set_nc}->($self);
5267     }
5268    
5269     redo A;
5270     }
5271     } elsif ($self->{state} == PI_TARGET_STATE) {
5272     if ($is_space->{$self->{nc}}) {
5273     $self->{state} = PI_TARGET_AFTER_STATE;
5274    
5275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5276     $self->{line_prev} = $self->{line};
5277     $self->{column_prev} = $self->{column};
5278     $self->{column}++;
5279     $self->{nc}
5280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5281     } else {
5282     $self->{set_nc}->($self);
5283     }
5284    
5285     redo A;
5286     } elsif ($self->{nc} == -1) {
5287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5288 wakaba 1.13 if ($self->{in_subset}) {
5289     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5290     } else {
5291     $self->{state} = DATA_STATE;
5292     $self->{s_kwd} = '';
5293     }
5294 wakaba 1.8 ## Reconsume.
5295     return ($self->{ct}); # pi
5296     redo A;
5297     } elsif ($self->{nc} == 0x003F) { # ?
5298     $self->{state} = PI_AFTER_STATE;
5299    
5300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5301     $self->{line_prev} = $self->{line};
5302     $self->{column_prev} = $self->{column};
5303     $self->{column}++;
5304     $self->{nc}
5305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5306     } else {
5307     $self->{set_nc}->($self);
5308     }
5309    
5310     redo A;
5311     } else {
5312     ## XML5: typo ("tag name" -> "target")
5313     $self->{ct}->{target} .= chr $self->{nc}; # pi
5314    
5315     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5316     $self->{line_prev} = $self->{line};
5317     $self->{column_prev} = $self->{column};
5318     $self->{column}++;
5319     $self->{nc}
5320     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5321     } else {
5322     $self->{set_nc}->($self);
5323     }
5324    
5325     redo A;
5326     }
5327     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5328     if ($is_space->{$self->{nc}}) {
5329     ## Stay in the state.
5330    
5331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5332     $self->{line_prev} = $self->{line};
5333     $self->{column_prev} = $self->{column};
5334     $self->{column}++;
5335     $self->{nc}
5336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5337     } else {
5338     $self->{set_nc}->($self);
5339     }
5340    
5341     redo A;
5342     } else {
5343     $self->{state} = PI_DATA_STATE;
5344     ## Reprocess.
5345     redo A;
5346     }
5347     } elsif ($self->{state} == PI_DATA_STATE) {
5348     if ($self->{nc} == 0x003F) { # ?
5349     $self->{state} = PI_DATA_AFTER_STATE;
5350    
5351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5352     $self->{line_prev} = $self->{line};
5353     $self->{column_prev} = $self->{column};
5354     $self->{column}++;
5355     $self->{nc}
5356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5357     } else {
5358     $self->{set_nc}->($self);
5359     }
5360    
5361     redo A;
5362     } elsif ($self->{nc} == -1) {
5363     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5364 wakaba 1.13 if ($self->{in_subset}) {
5365 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5366 wakaba 1.13 } else {
5367     $self->{state} = DATA_STATE;
5368     $self->{s_kwd} = '';
5369     }
5370 wakaba 1.8 ## Reprocess.
5371     return ($self->{ct}); # pi
5372     redo A;
5373     } else {
5374     $self->{ct}->{data} .= chr $self->{nc}; # pi
5375     $self->{read_until}->($self->{ct}->{data}, q[?],
5376     length $self->{ct}->{data});
5377     ## Stay in the state.
5378    
5379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5380     $self->{line_prev} = $self->{line};
5381     $self->{column_prev} = $self->{column};
5382     $self->{column}++;
5383     $self->{nc}
5384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5385     } else {
5386     $self->{set_nc}->($self);
5387     }
5388    
5389     ## Reprocess.
5390     redo A;
5391     }
5392     } elsif ($self->{state} == PI_AFTER_STATE) {
5393 wakaba 1.14 ## XML5: Part of "Pi after state".
5394    
5395 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5396 wakaba 1.13 if ($self->{in_subset}) {
5397     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5398     } else {
5399     $self->{state} = DATA_STATE;
5400     $self->{s_kwd} = '';
5401     }
5402 wakaba 1.8
5403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5404     $self->{line_prev} = $self->{line};
5405     $self->{column_prev} = $self->{column};
5406     $self->{column}++;
5407     $self->{nc}
5408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5409     } else {
5410     $self->{set_nc}->($self);
5411     }
5412    
5413     return ($self->{ct}); # pi
5414     redo A;
5415     } elsif ($self->{nc} == 0x003F) { # ?
5416     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5417     line => $self->{line_prev},
5418     column => $self->{column_prev}); ## XML5: no error
5419     $self->{ct}->{data} .= '?';
5420     $self->{state} = PI_DATA_AFTER_STATE;
5421    
5422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5423     $self->{line_prev} = $self->{line};
5424     $self->{column_prev} = $self->{column};
5425     $self->{column}++;
5426     $self->{nc}
5427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5428     } else {
5429     $self->{set_nc}->($self);
5430     }
5431    
5432     redo A;
5433     } else {
5434     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5435     line => $self->{line_prev},
5436     column => $self->{column_prev}
5437     + 1 * ($self->{nc} == -1)); ## XML5: no error
5438     $self->{ct}->{data} .= '?'; ## XML5: not appended
5439     $self->{state} = PI_DATA_STATE;
5440     ## Reprocess.
5441     redo A;
5442     }
5443     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5444 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5445    
5446 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5447 wakaba 1.13 if ($self->{in_subset}) {
5448     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5449     } else {
5450     $self->{state} = DATA_STATE;
5451     $self->{s_kwd} = '';
5452     }
5453 wakaba 1.8
5454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5455     $self->{line_prev} = $self->{line};
5456     $self->{column_prev} = $self->{column};
5457     $self->{column}++;
5458     $self->{nc}
5459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5460     } else {
5461     $self->{set_nc}->($self);
5462     }
5463    
5464     return ($self->{ct}); # pi
5465     redo A;
5466     } elsif ($self->{nc} == 0x003F) { # ?
5467     $self->{ct}->{data} .= '?';
5468     ## Stay in the state.
5469    
5470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5471     $self->{line_prev} = $self->{line};
5472     $self->{column_prev} = $self->{column};
5473     $self->{column}++;
5474     $self->{nc}
5475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5476     } else {
5477     $self->{set_nc}->($self);
5478     }
5479    
5480     redo A;
5481     } else {
5482     $self->{ct}->{data} .= '?'; ## XML5: not appended
5483     $self->{state} = PI_DATA_STATE;
5484     ## Reprocess.
5485     redo A;
5486     }
5487 wakaba 1.12
5488     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5489     if ($self->{nc} == 0x003C) { # <
5490 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5491 wakaba 1.12
5492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5493     $self->{line_prev} = $self->{line};
5494     $self->{column_prev} = $self->{column};
5495     $self->{column}++;
5496     $self->{nc}
5497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5498     } else {
5499     $self->{set_nc}->($self);
5500     }
5501    
5502     redo A;
5503     } elsif ($self->{nc} == 0x0025) { # %
5504     ## XML5: Not defined yet.
5505    
5506     ## TODO:
5507 wakaba 1.24
5508     if (not $self->{stop_processing} and
5509     not $self->{document}->xml_standalone) {
5510     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5511     level => $self->{level}->{info});
5512     $self->{stop_processing} = 1;
5513     }
5514    
5515 wakaba 1.12
5516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5517     $self->{line_prev} = $self->{line};
5518     $self->{column_prev} = $self->{column};
5519     $self->{column}++;
5520     $self->{nc}
5521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5522     } else {
5523     $self->{set_nc}->($self);
5524     }
5525    
5526     redo A;
5527     } elsif ($self->{nc} == 0x005D) { # ]
5528 wakaba 1.13 delete $self->{in_subset};
5529 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5530    
5531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5532     $self->{line_prev} = $self->{line};
5533     $self->{column_prev} = $self->{column};
5534     $self->{column}++;
5535     $self->{nc}
5536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5537     } else {
5538     $self->{set_nc}->($self);
5539     }
5540    
5541     redo A;
5542     } elsif ($is_space->{$self->{nc}}) {
5543     ## Stay in the state.
5544    
5545     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5546     $self->{line_prev} = $self->{line};
5547     $self->{column_prev} = $self->{column};
5548     $self->{column}++;
5549     $self->{nc}
5550     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5551     } else {
5552     $self->{set_nc}->($self);
5553     }
5554    
5555     redo A;
5556     } elsif ($self->{nc} == -1) {
5557     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5558 wakaba 1.13 delete $self->{in_subset};
5559 wakaba 1.12 $self->{state} = DATA_STATE;
5560     $self->{s_kwd} = '';
5561     ## Reconsume.
5562 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5563 wakaba 1.12 redo A;
5564     } else {
5565     unless ($self->{internal_subset_tainted}) {
5566     ## XML5: No parse error.
5567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5568     $self->{internal_subset_tainted} = 1;
5569     }
5570     ## Stay in the state.
5571    
5572     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5573     $self->{line_prev} = $self->{line};
5574     $self->{column_prev} = $self->{column};
5575     $self->{column}++;
5576     $self->{nc}
5577     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5578     } else {
5579     $self->{set_nc}->($self);
5580     }
5581    
5582     redo A;
5583     }
5584     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5585     if ($self->{nc} == 0x003E) { # >
5586     $self->{state} = DATA_STATE;
5587     $self->{s_kwd} = '';
5588    
5589     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5590     $self->{line_prev} = $self->{line};
5591     $self->{column_prev} = $self->{column};
5592     $self->{column}++;
5593     $self->{nc}
5594     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5595     } else {
5596     $self->{set_nc}->($self);
5597     }
5598    
5599 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5600 wakaba 1.12 redo A;
5601     } elsif ($self->{nc} == -1) {
5602     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5603     $self->{state} = DATA_STATE;
5604     $self->{s_kwd} = '';
5605     ## Reconsume.
5606 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5607 wakaba 1.12 redo A;
5608     } else {
5609     ## XML5: No parse error and stay in the state.
5610     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5611    
5612 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5613    
5614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5615     $self->{line_prev} = $self->{line};
5616     $self->{column_prev} = $self->{column};
5617     $self->{column}++;
5618     $self->{nc}
5619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5620     } else {
5621     $self->{set_nc}->($self);
5622     }
5623    
5624     redo A;
5625     }
5626     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5627     if ($self->{nc} == 0x003E) { # >
5628     $self->{state} = DATA_STATE;
5629     $self->{s_kwd} = '';
5630    
5631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5632     $self->{line_prev} = $self->{line};
5633     $self->{column_prev} = $self->{column};
5634     $self->{column}++;
5635     $self->{nc}
5636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5637     } else {
5638     $self->{set_nc}->($self);
5639     }
5640    
5641     return ({type => END_OF_DOCTYPE_TOKEN});
5642     redo A;
5643     } elsif ($self->{nc} == -1) {
5644     $self->{state} = DATA_STATE;
5645     $self->{s_kwd} = '';
5646     ## Reconsume.
5647     return ({type => END_OF_DOCTYPE_TOKEN});
5648     redo A;
5649     } else {
5650     ## Stay in the state.
5651    
5652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5653     $self->{line_prev} = $self->{line};
5654     $self->{column_prev} = $self->{column};
5655     $self->{column}++;
5656     $self->{nc}
5657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5658     } else {
5659     $self->{set_nc}->($self);
5660     }
5661    
5662     redo A;
5663     }
5664     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5665     if ($self->{nc} == 0x0021) { # !
5666 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5667 wakaba 1.13
5668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5669     $self->{line_prev} = $self->{line};
5670     $self->{column_prev} = $self->{column};
5671     $self->{column}++;
5672     $self->{nc}
5673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5674     } else {
5675     $self->{set_nc}->($self);
5676     }
5677    
5678     redo A;
5679     } elsif ($self->{nc} == 0x003F) { # ?
5680     $self->{state} = PI_STATE;
5681    
5682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5683     $self->{line_prev} = $self->{line};
5684     $self->{column_prev} = $self->{column};
5685     $self->{column}++;
5686     $self->{nc}
5687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5688     } else {
5689     $self->{set_nc}->($self);
5690     }
5691    
5692     redo A;
5693     } elsif ($self->{nc} == -1) {
5694     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5695     $self->{state} = DATA_STATE;
5696     $self->{s_kwd} = '';
5697     ## Reconsume.
5698     redo A;
5699     } else {
5700     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5701     line => $self->{line_prev},
5702     column => $self->{column_prev});
5703     $self->{state} = BOGUS_COMMENT_STATE;
5704     $self->{ct} = {type => COMMENT_TOKEN,
5705     data => '',
5706     }; ## NOTE: Will be discarded.
5707 wakaba 1.12
5708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5709     $self->{line_prev} = $self->{line};
5710     $self->{column_prev} = $self->{column};
5711     $self->{column}++;
5712     $self->{nc}
5713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5714     } else {
5715     $self->{set_nc}->($self);
5716     }
5717    
5718     redo A;
5719     }
5720 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5721     ## XML5: "DOCTYPE markup declaration state".
5722    
5723     if ($self->{nc} == 0x002D) { # -
5724     $self->{state} = MD_HYPHEN_STATE;
5725    
5726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5727     $self->{line_prev} = $self->{line};
5728     $self->{column_prev} = $self->{column};
5729     $self->{column}++;
5730     $self->{nc}
5731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5732     } else {
5733     $self->{set_nc}->($self);
5734     }
5735    
5736     redo A;
5737 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5738     $self->{nc} == 0x0065) { # e
5739 wakaba 1.14 $self->{state} = MD_E_STATE;
5740     $self->{kwd} = chr $self->{nc};
5741    
5742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5743     $self->{line_prev} = $self->{line};
5744     $self->{column_prev} = $self->{column};
5745     $self->{column}++;
5746     $self->{nc}
5747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5748     } else {
5749     $self->{set_nc}->($self);
5750     }
5751    
5752     redo A;
5753 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5754     $self->{nc} == 0x0061) { # a
5755 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5756     $self->{kwd} = chr $self->{nc};
5757    
5758     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5759     $self->{line_prev} = $self->{line};
5760     $self->{column_prev} = $self->{column};
5761     $self->{column}++;
5762     $self->{nc}
5763     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5764     } else {
5765     $self->{set_nc}->($self);
5766     }
5767    
5768     redo A;
5769 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5770     $self->{nc} == 0x006E) { # n
5771 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5772     $self->{kwd} = chr $self->{nc};
5773    
5774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5775     $self->{line_prev} = $self->{line};
5776     $self->{column_prev} = $self->{column};
5777     $self->{column}++;
5778     $self->{nc}
5779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5780     } else {
5781     $self->{set_nc}->($self);
5782     }
5783    
5784     redo A;
5785     } else {
5786     #
5787     }
5788    
5789     ## XML5: No parse error.
5790     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5791     line => $self->{line_prev},
5792     column => $self->{column_prev} - 1);
5793     ## Reconsume.
5794     $self->{state} = BOGUS_COMMENT_STATE;
5795     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5796     redo A;
5797     } elsif ($self->{state} == MD_E_STATE) {
5798 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5799     $self->{nc} == 0x006E) { # n
5800 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5801     $self->{kwd} .= chr $self->{nc};
5802    
5803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5804     $self->{line_prev} = $self->{line};
5805     $self->{column_prev} = $self->{column};
5806     $self->{column}++;
5807     $self->{nc}
5808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5809     } else {
5810     $self->{set_nc}->($self);
5811     }
5812    
5813     redo A;
5814 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5815     $self->{nc} == 0x006C) { # l
5816 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5817     $self->{state} = MD_ELEMENT_STATE;
5818     $self->{kwd} .= chr $self->{nc};
5819    
5820     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5821     $self->{line_prev} = $self->{line};
5822     $self->{column_prev} = $self->{column};
5823     $self->{column}++;
5824     $self->{nc}
5825     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5826     } else {
5827     $self->{set_nc}->($self);
5828     }
5829    
5830     redo A;
5831     } else {
5832     ## XML5: No parse error.
5833     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5834     line => $self->{line_prev},
5835     column => $self->{column_prev} - 2
5836     + 1 * ($self->{nc} == -1));
5837     ## Reconsume.
5838     $self->{state} = BOGUS_COMMENT_STATE;
5839     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5840     redo A;
5841     }
5842     } elsif ($self->{state} == MD_ENTITY_STATE) {
5843 wakaba 1.17 if ($self->{nc} == [
5844     undef,
5845     undef,
5846     0x0054, # T
5847     0x0049, # I
5848     0x0054, # T
5849     ]->[length $self->{kwd}] or
5850     $self->{nc} == [
5851     undef,
5852     undef,
5853     0x0074, # t
5854     0x0069, # i
5855     0x0074, # t
5856     ]->[length $self->{kwd}]) {
5857 wakaba 1.14 ## Stay in the state.
5858     $self->{kwd} .= chr $self->{nc};
5859    
5860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5861     $self->{line_prev} = $self->{line};
5862     $self->{column_prev} = $self->{column};
5863     $self->{column}++;
5864     $self->{nc}
5865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5866     } else {
5867     $self->{set_nc}->($self);
5868     }
5869    
5870     redo A;
5871 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5872     ($self->{nc} == 0x0059 or # Y
5873     $self->{nc} == 0x0079)) { # y
5874     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5876     text => 'ENTITY',
5877     line => $self->{line_prev},
5878     column => $self->{column_prev} - 4);
5879     }
5880     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5881 wakaba 1.14 line => $self->{line_prev},
5882     column => $self->{column_prev} - 6};
5883     $self->{state} = DOCTYPE_MD_STATE;
5884    
5885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5886     $self->{line_prev} = $self->{line};
5887     $self->{column_prev} = $self->{column};
5888     $self->{column}++;
5889     $self->{nc}
5890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5891     } else {
5892     $self->{set_nc}->($self);
5893     }
5894    
5895     redo A;
5896     } else {
5897     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5898     line => $self->{line_prev},
5899     column => $self->{column_prev} - 1
5900     - (length $self->{kwd})
5901     + 1 * ($self->{nc} == -1));
5902     $self->{state} = BOGUS_COMMENT_STATE;
5903     ## Reconsume.
5904     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5905     redo A;
5906     }
5907     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5908 wakaba 1.17 if ($self->{nc} == [
5909     undef,
5910     undef,
5911     0x0045, # E
5912     0x004D, # M
5913     0x0045, # E
5914     0x004E, # N
5915     ]->[length $self->{kwd}] or
5916     $self->{nc} == [
5917     undef,
5918     undef,
5919     0x0065, # e
5920     0x006D, # m
5921     0x0065, # e
5922     0x006E, # n
5923     ]->[length $self->{kwd}]) {
5924 wakaba 1.14 ## Stay in the state.
5925     $self->{kwd} .= chr $self->{nc};
5926    
5927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5928     $self->{line_prev} = $self->{line};
5929     $self->{column_prev} = $self->{column};
5930     $self->{column}++;
5931     $self->{nc}
5932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5933     } else {
5934     $self->{set_nc}->($self);
5935     }
5936    
5937     redo A;
5938 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5939     ($self->{nc} == 0x0054 or # T
5940     $self->{nc} == 0x0074)) { # t
5941     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5942     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5943     text => 'ELEMENT',
5944     line => $self->{line_prev},
5945     column => $self->{column_prev} - 5);
5946     }
5947 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5948     line => $self->{line_prev},
5949 wakaba 1.23 column => $self->{column_prev} - 7};
5950 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
5951    
5952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5953     $self->{line_prev} = $self->{line};
5954     $self->{column_prev} = $self->{column};
5955     $self->{column}++;
5956     $self->{nc}
5957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5958     } else {
5959     $self->{set_nc}->($self);
5960     }
5961    
5962     redo A;
5963     } else {
5964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5965     line => $self->{line_prev},
5966     column => $self->{column_prev} - 1
5967     - (length $self->{kwd})
5968     + 1 * ($self->{nc} == -1));
5969     $self->{state} = BOGUS_COMMENT_STATE;
5970     ## Reconsume.
5971     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5972     redo A;
5973     }
5974     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5975 wakaba 1.17 if ($self->{nc} == [
5976     undef,
5977     0x0054, # T
5978     0x0054, # T
5979     0x004C, # L
5980     0x0049, # I
5981     0x0053, # S
5982     ]->[length $self->{kwd}] or
5983     $self->{nc} == [
5984     undef,
5985     0x0074, # t
5986     0x0074, # t
5987     0x006C, # l
5988     0x0069, # i
5989     0x0073, # s
5990     ]->[length $self->{kwd}]) {
5991 wakaba 1.14 ## Stay in the state.
5992     $self->{kwd} .= chr $self->{nc};
5993    
5994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5995     $self->{line_prev} = $self->{line};
5996     $self->{column_prev} = $self->{column};
5997     $self->{column}++;
5998     $self->{nc}
5999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6000     } else {
6001     $self->{set_nc}->($self);
6002     }
6003    
6004     redo A;
6005 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6006     ($self->{nc} == 0x0054 or # T
6007     $self->{nc} == 0x0074)) { # t
6008     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6010     text => 'ATTLIST',
6011     line => $self->{line_prev},
6012     column => $self->{column_prev} - 5);
6013     }
6014 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6015 wakaba 1.15 attrdefs => [],
6016 wakaba 1.14 line => $self->{line_prev},
6017 wakaba 1.23 column => $self->{column_prev} - 7};
6018 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6019    
6020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6021     $self->{line_prev} = $self->{line};
6022     $self->{column_prev} = $self->{column};
6023     $self->{column}++;
6024     $self->{nc}
6025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6026     } else {
6027     $self->{set_nc}->($self);
6028     }
6029    
6030     redo A;
6031     } else {
6032     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6033     line => $self->{line_prev},
6034     column => $self->{column_prev} - 1
6035     - (length $self->{kwd})
6036     + 1 * ($self->{nc} == -1));
6037     $self->{state} = BOGUS_COMMENT_STATE;
6038     ## Reconsume.
6039     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6040     redo A;
6041     }
6042     } elsif ($self->{state} == MD_NOTATION_STATE) {
6043 wakaba 1.17 if ($self->{nc} == [
6044     undef,
6045     0x004F, # O
6046     0x0054, # T
6047     0x0041, # A
6048     0x0054, # T
6049     0x0049, # I
6050     0x004F, # O
6051     ]->[length $self->{kwd}] or
6052     $self->{nc} == [
6053     undef,
6054     0x006F, # o
6055     0x0074, # t
6056     0x0061, # a
6057     0x0074, # t
6058     0x0069, # i
6059     0x006F, # o
6060     ]->[length $self->{kwd}]) {
6061 wakaba 1.14 ## Stay in the state.
6062     $self->{kwd} .= chr $self->{nc};
6063    
6064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6065     $self->{line_prev} = $self->{line};
6066     $self->{column_prev} = $self->{column};
6067     $self->{column}++;
6068     $self->{nc}
6069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6070     } else {
6071     $self->{set_nc}->($self);
6072     }
6073    
6074     redo A;
6075 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6076     ($self->{nc} == 0x004E or # N
6077     $self->{nc} == 0x006E)) { # n
6078     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6079     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6080     text => 'NOTATION',
6081     line => $self->{line_prev},
6082     column => $self->{column_prev} - 6);
6083     }
6084 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6085     line => $self->{line_prev},
6086 wakaba 1.23 column => $self->{column_prev} - 8};
6087 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6088    
6089     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6090     $self->{line_prev} = $self->{line};
6091     $self->{column_prev} = $self->{column};
6092     $self->{column}++;
6093     $self->{nc}
6094     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6095     } else {
6096     $self->{set_nc}->($self);
6097     }
6098    
6099     redo A;
6100     } else {
6101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6102     line => $self->{line_prev},
6103     column => $self->{column_prev} - 1
6104     - (length $self->{kwd})
6105     + 1 * ($self->{nc} == -1));
6106     $self->{state} = BOGUS_COMMENT_STATE;
6107     ## Reconsume.
6108     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6109     redo A;
6110     }
6111     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6112     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6113     ## "DOCTYPE NOTATION state".
6114    
6115     if ($is_space->{$self->{nc}}) {
6116     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6117     $self->{state} = BEFORE_MD_NAME_STATE;
6118    
6119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6120     $self->{line_prev} = $self->{line};
6121     $self->{column_prev} = $self->{column};
6122     $self->{column}++;
6123     $self->{nc}
6124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6125     } else {
6126     $self->{set_nc}->($self);
6127     }
6128    
6129     redo A;
6130     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6131     $self->{nc} == 0x0025) { # %
6132     ## XML5: Switch to the "DOCTYPE bogus comment state".
6133     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6134     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6135    
6136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6137     $self->{line_prev} = $self->{line};
6138     $self->{column_prev} = $self->{column};
6139     $self->{column}++;
6140     $self->{nc}
6141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6142     } else {
6143     $self->{set_nc}->($self);
6144     }
6145    
6146     redo A;
6147     } elsif ($self->{nc} == -1) {
6148     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6149     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6150     ## Reconsume.
6151     redo A;
6152     } elsif ($self->{nc} == 0x003E) { # >
6153     ## XML5: Switch to the "DOCTYPE bogus comment state".
6154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6156    
6157     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6158     $self->{line_prev} = $self->{line};
6159     $self->{column_prev} = $self->{column};
6160     $self->{column}++;
6161     $self->{nc}
6162     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6163     } else {
6164     $self->{set_nc}->($self);
6165     }
6166    
6167     redo A;
6168     } else {
6169     ## XML5: Switch to the "DOCTYPE bogus comment state".
6170     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6171     $self->{state} = BEFORE_MD_NAME_STATE;
6172     redo A;
6173     }
6174     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6175     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6176     ## before state", "DOCTYPE ATTLIST name before state".
6177    
6178     if ($is_space->{$self->{nc}}) {
6179     ## Stay in the state.
6180    
6181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6182     $self->{line_prev} = $self->{line};
6183     $self->{column_prev} = $self->{column};
6184     $self->{column}++;
6185     $self->{nc}
6186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6187     } else {
6188     $self->{set_nc}->($self);
6189     }
6190    
6191     redo A;
6192     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6193     $self->{nc} == 0x0025) { # %
6194     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6195    
6196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6197     $self->{line_prev} = $self->{line};
6198     $self->{column_prev} = $self->{column};
6199     $self->{column}++;
6200     $self->{nc}
6201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6202     } else {
6203     $self->{set_nc}->($self);
6204     }
6205    
6206     redo A;
6207     } elsif ($self->{nc} == 0x003E) { # >
6208     ## XML5: Same as "Anything else".
6209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6210     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6211    
6212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6213     $self->{line_prev} = $self->{line};
6214     $self->{column_prev} = $self->{column};
6215     $self->{column}++;
6216     $self->{nc}
6217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6218     } else {
6219     $self->{set_nc}->($self);
6220     }
6221    
6222     redo A;
6223     } elsif ($self->{nc} == -1) {
6224     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6226     ## Reconsume.
6227     redo A;
6228     } else {
6229     ## XML5: [ATTLIST] Not defined yet.
6230     $self->{ct}->{name} .= chr $self->{nc};
6231     $self->{state} = MD_NAME_STATE;
6232    
6233     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6234     $self->{line_prev} = $self->{line};
6235     $self->{column_prev} = $self->{column};
6236     $self->{column}++;
6237     $self->{nc}
6238     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6239     } else {
6240     $self->{set_nc}->($self);
6241     }
6242    
6243     redo A;
6244     }
6245     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6246     if ($is_space->{$self->{nc}}) {
6247     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6248     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6249     $self->{state} = BEFORE_MD_NAME_STATE;
6250 wakaba 1.8
6251 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6252     $self->{line_prev} = $self->{line};
6253     $self->{column_prev} = $self->{column};
6254     $self->{column}++;
6255     $self->{nc}
6256     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6257     } else {
6258     $self->{set_nc}->($self);
6259     }
6260    
6261     redo A;
6262     } elsif ($self->{nc} == 0x003E) { # >
6263     ## XML5: Same as "Anything else".
6264     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6265     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6266    
6267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6268     $self->{line_prev} = $self->{line};
6269     $self->{column_prev} = $self->{column};
6270     $self->{column}++;
6271     $self->{nc}
6272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6273     } else {
6274     $self->{set_nc}->($self);
6275     }
6276    
6277     redo A;
6278     } elsif ($self->{nc} == -1) {
6279     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6280     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6281     ## Reconsume.
6282     redo A;
6283     } else {
6284     ## XML5: No parse error.
6285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6286     $self->{state} = BOGUS_COMMENT_STATE;
6287     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6288     ## Reconsume.
6289     redo A;
6290     }
6291     } elsif ($self->{state} == MD_NAME_STATE) {
6292     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6293    
6294     if ($is_space->{$self->{nc}}) {
6295 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6296     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6297     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6298 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6299 wakaba 1.16 } else { # ENTITY/NOTATION
6300     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6301     }
6302 wakaba 1.14
6303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304     $self->{line_prev} = $self->{line};
6305     $self->{column_prev} = $self->{column};
6306     $self->{column}++;
6307     $self->{nc}
6308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309     } else {
6310     $self->{set_nc}->($self);
6311     }
6312    
6313     redo A;
6314     } elsif ($self->{nc} == 0x003E) { # >
6315     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6316     #
6317     } else {
6318 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6319 wakaba 1.14 }
6320     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6321    
6322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6323     $self->{line_prev} = $self->{line};
6324     $self->{column_prev} = $self->{column};
6325     $self->{column}++;
6326     $self->{nc}
6327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6328     } else {
6329     $self->{set_nc}->($self);
6330     }
6331    
6332     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6333     redo A;
6334     } elsif ($self->{nc} == -1) {
6335     ## XML5: [ATTLIST] No parse error.
6336     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6337     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6338     ## Reconsume.
6339     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6340     redo A;
6341     } else {
6342     ## XML5: [ATTLIST] Not defined yet.
6343     $self->{ct}->{name} .= chr $self->{nc};
6344     ## Stay in the state.
6345    
6346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6347     $self->{line_prev} = $self->{line};
6348     $self->{column_prev} = $self->{column};
6349     $self->{column}++;
6350     $self->{nc}
6351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6352     } else {
6353     $self->{set_nc}->($self);
6354     }
6355    
6356     redo A;
6357     }
6358     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6359     if ($is_space->{$self->{nc}}) {
6360     ## Stay in the state.
6361    
6362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6363     $self->{line_prev} = $self->{line};
6364     $self->{column_prev} = $self->{column};
6365     $self->{column}++;
6366     $self->{nc}
6367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6368     } else {
6369     $self->{set_nc}->($self);
6370     }
6371    
6372     redo A;
6373     } elsif ($self->{nc} == 0x003E) { # >
6374     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6375    
6376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6377     $self->{line_prev} = $self->{line};
6378     $self->{column_prev} = $self->{column};
6379     $self->{column}++;
6380     $self->{nc}
6381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6382     } else {
6383     $self->{set_nc}->($self);
6384     }
6385    
6386     return ($self->{ct}); # ATTLIST
6387     redo A;
6388     } elsif ($self->{nc} == -1) {
6389     ## XML5: No parse error.
6390     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6392 wakaba 1.15 return ($self->{ct});
6393 wakaba 1.14 redo A;
6394     } else {
6395     ## XML5: Not defined yet.
6396 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6397     tokens => [],
6398     line => $self->{line}, column => $self->{column}};
6399     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6400    
6401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6402     $self->{line_prev} = $self->{line};
6403     $self->{column_prev} = $self->{column};
6404     $self->{column}++;
6405     $self->{nc}
6406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6407     } else {
6408     $self->{set_nc}->($self);
6409     }
6410    
6411     redo A;
6412     }
6413     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6414     if ($is_space->{$self->{nc}}) {
6415     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6416    
6417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6418     $self->{line_prev} = $self->{line};
6419     $self->{column_prev} = $self->{column};
6420     $self->{column}++;
6421     $self->{nc}
6422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6423     } else {
6424     $self->{set_nc}->($self);
6425     }
6426    
6427     redo A;
6428     } elsif ($self->{nc} == 0x003E) { # >
6429     ## XML5: Same as "anything else".
6430     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6431     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6432    
6433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6434     $self->{line_prev} = $self->{line};
6435     $self->{column_prev} = $self->{column};
6436     $self->{column}++;
6437     $self->{nc}
6438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6439     } else {
6440     $self->{set_nc}->($self);
6441     }
6442    
6443     return ($self->{ct}); # ATTLIST
6444     redo A;
6445     } elsif ($self->{nc} == 0x0028) { # (
6446     ## XML5: Same as "anything else".
6447     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6448     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6449    
6450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6451     $self->{line_prev} = $self->{line};
6452     $self->{column_prev} = $self->{column};
6453     $self->{column}++;
6454     $self->{nc}
6455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6456     } else {
6457     $self->{set_nc}->($self);
6458     }
6459    
6460     redo A;
6461     } elsif ($self->{nc} == -1) {
6462     ## XML5: No parse error.
6463     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6464     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6465    
6466     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6467     $self->{line_prev} = $self->{line};
6468     $self->{column_prev} = $self->{column};
6469     $self->{column}++;
6470     $self->{nc}
6471     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6472     } else {
6473     $self->{set_nc}->($self);
6474     }
6475    
6476     return ($self->{ct}); # ATTLIST
6477     redo A;
6478     } else {
6479     ## XML5: Not defined yet.
6480     $self->{ca}->{name} .= chr $self->{nc};
6481     ## Stay in the state.
6482    
6483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6484     $self->{line_prev} = $self->{line};
6485     $self->{column_prev} = $self->{column};
6486     $self->{column}++;
6487     $self->{nc}
6488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6489     } else {
6490     $self->{set_nc}->($self);
6491     }
6492    
6493 wakaba 1.14 redo A;
6494     }
6495 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6496     if ($is_space->{$self->{nc}}) {
6497     ## Stay in the state.
6498    
6499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6500     $self->{line_prev} = $self->{line};
6501     $self->{column_prev} = $self->{column};
6502     $self->{column}++;
6503     $self->{nc}
6504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6505     } else {
6506     $self->{set_nc}->($self);
6507     }
6508    
6509     redo A;
6510     } elsif ($self->{nc} == 0x003E) { # >
6511     ## XML5: Same as "anything else".
6512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6513     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6514    
6515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6516     $self->{line_prev} = $self->{line};
6517     $self->{column_prev} = $self->{column};
6518     $self->{column}++;
6519     $self->{nc}
6520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6521     } else {
6522     $self->{set_nc}->($self);
6523     }
6524    
6525     return ($self->{ct}); # ATTLIST
6526     redo A;
6527     } elsif ($self->{nc} == 0x0028) { # (
6528     ## XML5: Same as "anything else".
6529     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6530    
6531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6532     $self->{line_prev} = $self->{line};
6533     $self->{column_prev} = $self->{column};
6534     $self->{column}++;
6535     $self->{nc}
6536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6537     } else {
6538     $self->{set_nc}->($self);
6539     }
6540    
6541     redo A;
6542     } elsif ($self->{nc} == -1) {
6543     ## XML5: No parse error.
6544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6545     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6546    
6547     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548     $self->{line_prev} = $self->{line};
6549     $self->{column_prev} = $self->{column};
6550     $self->{column}++;
6551     $self->{nc}
6552     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553     } else {
6554     $self->{set_nc}->($self);
6555     }
6556    
6557     return ($self->{ct});
6558     redo A;
6559     } else {
6560     ## XML5: Not defined yet.
6561     $self->{ca}->{type} = chr $self->{nc};
6562     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6563    
6564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6565     $self->{line_prev} = $self->{line};
6566     $self->{column_prev} = $self->{column};
6567     $self->{column}++;
6568     $self->{nc}
6569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6570     } else {
6571     $self->{set_nc}->($self);
6572     }
6573    
6574     redo A;
6575     }
6576     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6577     if ($is_space->{$self->{nc}}) {
6578     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6579    
6580     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6581     $self->{line_prev} = $self->{line};
6582     $self->{column_prev} = $self->{column};
6583     $self->{column}++;
6584     $self->{nc}
6585     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6586     } else {
6587     $self->{set_nc}->($self);
6588     }
6589    
6590     redo A;
6591     } elsif ($self->{nc} == 0x0023) { # #
6592     ## XML5: Same as "anything else".
6593     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6594     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6595    
6596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6597     $self->{line_prev} = $self->{line};
6598     $self->{column_prev} = $self->{column};
6599     $self->{column}++;
6600     $self->{nc}
6601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6602     } else {
6603     $self->{set_nc}->($self);
6604     }
6605    
6606     redo A;
6607     } elsif ($self->{nc} == 0x0022) { # "
6608     ## XML5: Same as "anything else".
6609     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6610     $self->{ca}->{value} = '';
6611     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6612    
6613     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6614     $self->{line_prev} = $self->{line};
6615     $self->{column_prev} = $self->{column};
6616     $self->{column}++;
6617     $self->{nc}
6618     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6619     } else {
6620     $self->{set_nc}->($self);
6621     }
6622    
6623     redo A;
6624     } elsif ($self->{nc} == 0x0027) { # '
6625     ## XML5: Same as "anything else".
6626     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6627     $self->{ca}->{value} = '';
6628     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6629    
6630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6631     $self->{line_prev} = $self->{line};
6632     $self->{column_prev} = $self->{column};
6633     $self->{column}++;
6634     $self->{nc}
6635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6636     } else {
6637     $self->{set_nc}->($self);
6638     }
6639    
6640     redo A;
6641     } elsif ($self->{nc} == 0x003E) { # >
6642     ## XML5: Same as "anything else".
6643     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6644     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6645    
6646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6647     $self->{line_prev} = $self->{line};
6648     $self->{column_prev} = $self->{column};
6649     $self->{column}++;
6650     $self->{nc}
6651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6652     } else {
6653     $self->{set_nc}->($self);
6654     }
6655    
6656     return ($self->{ct}); # ATTLIST
6657     redo A;
6658     } elsif ($self->{nc} == 0x0028) { # (
6659     ## XML5: Same as "anything else".
6660     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6661     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6662    
6663     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6664     $self->{line_prev} = $self->{line};
6665     $self->{column_prev} = $self->{column};
6666     $self->{column}++;
6667     $self->{nc}
6668     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6669     } else {
6670     $self->{set_nc}->($self);
6671     }
6672    
6673     redo A;
6674     } elsif ($self->{nc} == -1) {
6675     ## XML5: No parse error.
6676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6677     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6678    
6679     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6680     $self->{line_prev} = $self->{line};
6681     $self->{column_prev} = $self->{column};
6682     $self->{column}++;
6683     $self->{nc}
6684     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6685     } else {
6686     $self->{set_nc}->($self);
6687     }
6688    
6689     return ($self->{ct});
6690     redo A;
6691     } else {
6692     ## XML5: Not defined yet.
6693     $self->{ca}->{type} .= chr $self->{nc};
6694     ## Stay in the state.
6695    
6696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6697     $self->{line_prev} = $self->{line};
6698     $self->{column_prev} = $self->{column};
6699     $self->{column}++;
6700     $self->{nc}
6701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6702     } else {
6703     $self->{set_nc}->($self);
6704     }
6705    
6706     redo A;
6707     }
6708     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6709     if ($is_space->{$self->{nc}}) {
6710     ## Stay in the state.
6711    
6712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6713     $self->{line_prev} = $self->{line};
6714     $self->{column_prev} = $self->{column};
6715     $self->{column}++;
6716     $self->{nc}
6717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6718     } else {
6719     $self->{set_nc}->($self);
6720     }
6721    
6722     redo A;
6723     } elsif ($self->{nc} == 0x0028) { # (
6724     ## XML5: Same as "anything else".
6725     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6726    
6727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6728     $self->{line_prev} = $self->{line};
6729     $self->{column_prev} = $self->{column};
6730     $self->{column}++;
6731     $self->{nc}
6732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6733     } else {
6734     $self->{set_nc}->($self);
6735     }
6736    
6737     redo A;
6738     } elsif ($self->{nc} == 0x0023) { # #
6739     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6740    
6741     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6742     $self->{line_prev} = $self->{line};
6743     $self->{column_prev} = $self->{column};
6744     $self->{column}++;
6745     $self->{nc}
6746     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6747     } else {
6748     $self->{set_nc}->($self);
6749     }
6750    
6751     redo A;
6752     } elsif ($self->{nc} == 0x0022) { # "
6753     ## XML5: Same as "anything else".
6754     $self->{ca}->{value} = '';
6755     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6756    
6757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6758     $self->{line_prev} = $self->{line};
6759     $self->{column_prev} = $self->{column};
6760     $self->{column}++;
6761     $self->{nc}
6762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6763     } else {
6764     $self->{set_nc}->($self);
6765     }
6766    
6767     redo A;
6768     } elsif ($self->{nc} == 0x0027) { # '
6769     ## XML5: Same as "anything else".
6770     $self->{ca}->{value} = '';
6771     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6772    
6773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6774     $self->{line_prev} = $self->{line};
6775     $self->{column_prev} = $self->{column};
6776     $self->{column}++;
6777     $self->{nc}
6778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6779     } else {
6780     $self->{set_nc}->($self);
6781     }
6782    
6783     redo A;
6784     } elsif ($self->{nc} == 0x003E) { # >
6785     ## XML5: Same as "anything else".
6786     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6787     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6788    
6789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790     $self->{line_prev} = $self->{line};
6791     $self->{column_prev} = $self->{column};
6792     $self->{column}++;
6793     $self->{nc}
6794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795     } else {
6796     $self->{set_nc}->($self);
6797     }
6798    
6799     return ($self->{ct}); # ATTLIST
6800     redo A;
6801     } elsif ($self->{nc} == -1) {
6802     ## XML5: No parse error.
6803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6804     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6805    
6806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6807     $self->{line_prev} = $self->{line};
6808     $self->{column_prev} = $self->{column};
6809     $self->{column}++;
6810     $self->{nc}
6811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6812     } else {
6813     $self->{set_nc}->($self);
6814     }
6815    
6816     return ($self->{ct});
6817     redo A;
6818     } else {
6819     ## XML5: Switch to the "DOCTYPE bogus comment state".
6820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6821     $self->{ca}->{value} = '';
6822     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6823     ## Reconsume.
6824     redo A;
6825     }
6826     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6827     if ($is_space->{$self->{nc}}) {
6828     ## Stay in the state.
6829    
6830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6831     $self->{line_prev} = $self->{line};
6832     $self->{column_prev} = $self->{column};
6833     $self->{column}++;
6834     $self->{nc}
6835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6836     } else {
6837     $self->{set_nc}->($self);
6838     }
6839    
6840     redo A;
6841     } elsif ($self->{nc} == 0x007C) { # |
6842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6843     ## Stay in the state.
6844    
6845     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6846     $self->{line_prev} = $self->{line};
6847     $self->{column_prev} = $self->{column};
6848     $self->{column}++;
6849     $self->{nc}
6850     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6851     } else {
6852     $self->{set_nc}->($self);
6853     }
6854    
6855     redo A;
6856     } elsif ($self->{nc} == 0x0029) { # )
6857     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6858     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6859    
6860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6861     $self->{line_prev} = $self->{line};
6862     $self->{column_prev} = $self->{column};
6863     $self->{column}++;
6864     $self->{nc}
6865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6866     } else {
6867     $self->{set_nc}->($self);
6868     }
6869    
6870     redo A;
6871     } elsif ($self->{nc} == 0x003E) { # >
6872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6873     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6874    
6875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876     $self->{line_prev} = $self->{line};
6877     $self->{column_prev} = $self->{column};
6878     $self->{column}++;
6879     $self->{nc}
6880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881     } else {
6882     $self->{set_nc}->($self);
6883     }
6884    
6885     return ($self->{ct}); # ATTLIST
6886     redo A;
6887     } elsif ($self->{nc} == -1) {
6888     ## XML5: No parse error.
6889     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6890     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6891    
6892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6893     $self->{line_prev} = $self->{line};
6894     $self->{column_prev} = $self->{column};
6895     $self->{column}++;
6896     $self->{nc}
6897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6898     } else {
6899     $self->{set_nc}->($self);
6900     }
6901    
6902     return ($self->{ct});
6903     redo A;
6904     } else {
6905     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6906     $self->{state} = ALLOWED_TOKEN_STATE;
6907    
6908     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6909     $self->{line_prev} = $self->{line};
6910     $self->{column_prev} = $self->{column};
6911     $self->{column}++;
6912     $self->{nc}
6913     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6914     } else {
6915     $self->{set_nc}->($self);
6916     }
6917    
6918     redo A;
6919     }
6920     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6921     if ($is_space->{$self->{nc}}) {
6922     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6923    
6924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6925     $self->{line_prev} = $self->{line};
6926     $self->{column_prev} = $self->{column};
6927     $self->{column}++;
6928     $self->{nc}
6929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6930     } else {
6931     $self->{set_nc}->($self);
6932     }
6933    
6934     redo A;
6935     } elsif ($self->{nc} == 0x007C) { # |
6936     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6937    
6938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6939     $self->{line_prev} = $self->{line};
6940     $self->{column_prev} = $self->{column};
6941     $self->{column}++;
6942     $self->{nc}
6943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6944     } else {
6945     $self->{set_nc}->($self);
6946     }
6947    
6948     redo A;
6949     } elsif ($self->{nc} == 0x0029) { # )
6950     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6951    
6952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6953     $self->{line_prev} = $self->{line};
6954     $self->{column_prev} = $self->{column};
6955     $self->{column}++;
6956     $self->{nc}
6957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6958     } else {
6959     $self->{set_nc}->($self);
6960     }
6961    
6962     redo A;
6963     } elsif ($self->{nc} == 0x003E) { # >
6964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6965     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6966    
6967     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6968     $self->{line_prev} = $self->{line};
6969     $self->{column_prev} = $self->{column};
6970     $self->{column}++;
6971     $self->{nc}
6972     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6973     } else {
6974     $self->{set_nc}->($self);
6975     }
6976    
6977     return ($self->{ct}); # ATTLIST
6978     redo A;
6979     } elsif ($self->{nc} == -1) {
6980     ## XML5: No parse error.
6981     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6982     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6983    
6984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6985     $self->{line_prev} = $self->{line};
6986     $self->{column_prev} = $self->{column};
6987     $self->{column}++;
6988     $self->{nc}
6989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6990     } else {
6991     $self->{set_nc}->($self);
6992     }
6993    
6994     return ($self->{ct});
6995     redo A;
6996     } else {
6997     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6998     ## Stay in the state.
6999    
7000     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7001     $self->{line_prev} = $self->{line};
7002     $self->{column_prev} = $self->{column};
7003     $self->{column}++;
7004     $self->{nc}
7005     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7006     } else {
7007     $self->{set_nc}->($self);
7008     }
7009    
7010     redo A;
7011     }
7012     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7013     if ($is_space->{$self->{nc}}) {
7014     ## Stay in the state.
7015    
7016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7017     $self->{line_prev} = $self->{line};
7018     $self->{column_prev} = $self->{column};
7019     $self->{column}++;
7020     $self->{nc}
7021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7022     } else {
7023     $self->{set_nc}->($self);
7024     }
7025    
7026     redo A;
7027     } elsif ($self->{nc} == 0x007C) { # |
7028     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7029    
7030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7031     $self->{line_prev} = $self->{line};
7032     $self->{column_prev} = $self->{column};
7033     $self->{column}++;
7034     $self->{nc}
7035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7036     } else {
7037     $self->{set_nc}->($self);
7038     }
7039    
7040     redo A;
7041     } elsif ($self->{nc} == 0x0029) { # )
7042     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7043    
7044     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7045     $self->{line_prev} = $self->{line};
7046     $self->{column_prev} = $self->{column};
7047     $self->{column}++;
7048     $self->{nc}
7049     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7050     } else {
7051     $self->{set_nc}->($self);
7052     }
7053    
7054     redo A;
7055     } elsif ($self->{nc} == 0x003E) { # >
7056     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7057     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7058    
7059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7060     $self->{line_prev} = $self->{line};
7061     $self->{column_prev} = $self->{column};
7062     $self->{column}++;
7063     $self->{nc}
7064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7065     } else {
7066     $self->{set_nc}->($self);
7067     }
7068    
7069     return ($self->{ct}); # ATTLIST
7070     redo A;
7071     } elsif ($self->{nc} == -1) {
7072     ## XML5: No parse error.
7073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7074     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7075    
7076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077     $self->{line_prev} = $self->{line};
7078     $self->{column_prev} = $self->{column};
7079     $self->{column}++;
7080     $self->{nc}
7081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082     } else {
7083     $self->{set_nc}->($self);
7084     }
7085    
7086     return ($self->{ct});
7087     redo A;
7088     } else {
7089     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7090     line => $self->{line_prev},
7091     column => $self->{column_prev});
7092     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7093     $self->{state} = ALLOWED_TOKEN_STATE;
7094    
7095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7096     $self->{line_prev} = $self->{line};
7097     $self->{column_prev} = $self->{column};
7098     $self->{column}++;
7099     $self->{nc}
7100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7101     } else {
7102     $self->{set_nc}->($self);
7103     }
7104    
7105     redo A;
7106     }
7107     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7108     if ($is_space->{$self->{nc}}) {
7109     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7110    
7111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7112     $self->{line_prev} = $self->{line};
7113     $self->{column_prev} = $self->{column};
7114     $self->{column}++;
7115     $self->{nc}
7116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7117     } else {
7118     $self->{set_nc}->($self);
7119     }
7120    
7121     redo A;
7122     } elsif ($self->{nc} == 0x0023) { # #
7123     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7124     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7125    
7126     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7127     $self->{line_prev} = $self->{line};
7128     $self->{column_prev} = $self->{column};
7129     $self->{column}++;
7130     $self->{nc}
7131     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7132     } else {
7133     $self->{set_nc}->($self);
7134     }
7135    
7136     redo A;
7137     } elsif ($self->{nc} == 0x0022) { # "
7138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7139     $self->{ca}->{value} = '';
7140     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7141    
7142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7143     $self->{line_prev} = $self->{line};
7144     $self->{column_prev} = $self->{column};
7145     $self->{column}++;
7146     $self->{nc}
7147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7148     } else {
7149     $self->{set_nc}->($self);
7150     }
7151    
7152     redo A;
7153     } elsif ($self->{nc} == 0x0027) { # '
7154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7155     $self->{ca}->{value} = '';
7156     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7157    
7158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7159     $self->{line_prev} = $self->{line};
7160     $self->{column_prev} = $self->{column};
7161     $self->{column}++;
7162     $self->{nc}
7163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7164     } else {
7165     $self->{set_nc}->($self);
7166     }
7167    
7168     redo A;
7169     } elsif ($self->{nc} == 0x003E) { # >
7170     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7171     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7172    
7173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7174     $self->{line_prev} = $self->{line};
7175     $self->{column_prev} = $self->{column};
7176     $self->{column}++;
7177     $self->{nc}
7178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7179     } else {
7180     $self->{set_nc}->($self);
7181     }
7182    
7183     return ($self->{ct}); # ATTLIST
7184     redo A;
7185     } elsif ($self->{nc} == -1) {
7186     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7187     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7188    
7189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7190     $self->{line_prev} = $self->{line};
7191     $self->{column_prev} = $self->{column};
7192     $self->{column}++;
7193     $self->{nc}
7194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7195     } else {
7196     $self->{set_nc}->($self);
7197     }
7198    
7199     return ($self->{ct});
7200     redo A;
7201     } else {
7202     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7203     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7204     ## Reconsume.
7205     redo A;
7206     }
7207     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7208     if ($is_space->{$self->{nc}}) {
7209     ## Stay in the state.
7210    
7211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7212     $self->{line_prev} = $self->{line};
7213     $self->{column_prev} = $self->{column};
7214     $self->{column}++;
7215     $self->{nc}
7216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7217     } else {
7218     $self->{set_nc}->($self);
7219     }
7220    
7221     redo A;
7222     } elsif ($self->{nc} == 0x0023) { # #
7223     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7224    
7225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7226     $self->{line_prev} = $self->{line};
7227     $self->{column_prev} = $self->{column};
7228     $self->{column}++;
7229     $self->{nc}
7230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7231     } else {
7232     $self->{set_nc}->($self);
7233     }
7234    
7235     redo A;
7236     } elsif ($self->{nc} == 0x0022) { # "
7237     $self->{ca}->{value} = '';
7238     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7239    
7240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7241     $self->{line_prev} = $self->{line};
7242     $self->{column_prev} = $self->{column};
7243     $self->{column}++;
7244     $self->{nc}
7245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7246     } else {
7247     $self->{set_nc}->($self);
7248     }
7249    
7250     redo A;
7251     } elsif ($self->{nc} == 0x0027) { # '
7252     $self->{ca}->{value} = '';
7253     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7254    
7255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7256     $self->{line_prev} = $self->{line};
7257     $self->{column_prev} = $self->{column};
7258     $self->{column}++;
7259     $self->{nc}
7260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7261     } else {
7262     $self->{set_nc}->($self);
7263     }
7264    
7265     redo A;
7266     } elsif ($self->{nc} == 0x003E) { # >
7267     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7268     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7269    
7270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271     $self->{line_prev} = $self->{line};
7272     $self->{column_prev} = $self->{column};
7273     $self->{column}++;
7274     $self->{nc}
7275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276     } else {
7277     $self->{set_nc}->($self);
7278     }
7279    
7280     return ($self->{ct}); # ATTLIST
7281     redo A;
7282     } elsif ($self->{nc} == -1) {
7283     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7284     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7285    
7286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7287     $self->{line_prev} = $self->{line};
7288     $self->{column_prev} = $self->{column};
7289     $self->{column}++;
7290     $self->{nc}
7291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7292     } else {
7293     $self->{set_nc}->($self);
7294     }
7295    
7296     return ($self->{ct});
7297     redo A;
7298     } else {
7299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7300     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7301     ## Reconsume.
7302     redo A;
7303     }
7304     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7305     if ($is_space->{$self->{nc}}) {
7306     ## XML5: No parse error.
7307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7308 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7309 wakaba 1.15 ## Reconsume.
7310     redo A;
7311     } elsif ($self->{nc} == 0x0022) { # "
7312     ## XML5: Same as "anything else".
7313     $self->{ca}->{value} = '';
7314     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7315    
7316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7317     $self->{line_prev} = $self->{line};
7318     $self->{column_prev} = $self->{column};
7319     $self->{column}++;
7320     $self->{nc}
7321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7322     } else {
7323     $self->{set_nc}->($self);
7324     }
7325    
7326     redo A;
7327     } elsif ($self->{nc} == 0x0027) { # '
7328     ## XML5: Same as "anything else".
7329     $self->{ca}->{value} = '';
7330     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7331    
7332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7333     $self->{line_prev} = $self->{line};
7334     $self->{column_prev} = $self->{column};
7335     $self->{column}++;
7336     $self->{nc}
7337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7338     } else {
7339     $self->{set_nc}->($self);
7340     }
7341    
7342     redo A;
7343     } elsif ($self->{nc} == 0x003E) { # >
7344     ## XML5: Same as "anything else".
7345     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7346     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7347    
7348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7349     $self->{line_prev} = $self->{line};
7350     $self->{column_prev} = $self->{column};
7351     $self->{column}++;
7352     $self->{nc}
7353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7354     } else {
7355     $self->{set_nc}->($self);
7356     }
7357    
7358     return ($self->{ct}); # ATTLIST
7359     redo A;
7360     } elsif ($self->{nc} == -1) {
7361     ## XML5: No parse error.
7362     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7364    
7365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7366     $self->{line_prev} = $self->{line};
7367     $self->{column_prev} = $self->{column};
7368     $self->{column}++;
7369     $self->{nc}
7370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7371     } else {
7372     $self->{set_nc}->($self);
7373     }
7374    
7375     return ($self->{ct});
7376     redo A;
7377     } else {
7378     $self->{ca}->{default} = chr $self->{nc};
7379     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7380    
7381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7382     $self->{line_prev} = $self->{line};
7383     $self->{column_prev} = $self->{column};
7384     $self->{column}++;
7385     $self->{nc}
7386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7387     } else {
7388     $self->{set_nc}->($self);
7389     }
7390    
7391     redo A;
7392     }
7393     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7394     if ($is_space->{$self->{nc}}) {
7395     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7396    
7397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7398     $self->{line_prev} = $self->{line};
7399     $self->{column_prev} = $self->{column};
7400     $self->{column}++;
7401     $self->{nc}
7402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7403     } else {
7404     $self->{set_nc}->($self);
7405     }
7406    
7407     redo A;
7408     } elsif ($self->{nc} == 0x0022) { # "
7409     ## XML5: Same as "anything else".
7410     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7411     $self->{ca}->{value} = '';
7412     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7413    
7414     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7415     $self->{line_prev} = $self->{line};
7416     $self->{column_prev} = $self->{column};
7417     $self->{column}++;
7418     $self->{nc}
7419     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7420     } else {
7421     $self->{set_nc}->($self);
7422     }
7423    
7424     redo A;
7425     } elsif ($self->{nc} == 0x0027) { # '
7426     ## XML5: Same as "anything else".
7427     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7428     $self->{ca}->{value} = '';
7429     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7430    
7431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7432     $self->{line_prev} = $self->{line};
7433     $self->{column_prev} = $self->{column};
7434     $self->{column}++;
7435     $self->{nc}
7436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7437     } else {
7438     $self->{set_nc}->($self);
7439     }
7440    
7441     redo A;
7442     } elsif ($self->{nc} == 0x003E) { # >
7443     ## XML5: Same as "anything else".
7444     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7445     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7446    
7447     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7448     $self->{line_prev} = $self->{line};
7449     $self->{column_prev} = $self->{column};
7450     $self->{column}++;
7451     $self->{nc}
7452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7453     } else {
7454     $self->{set_nc}->($self);
7455     }
7456    
7457     return ($self->{ct}); # ATTLIST
7458     redo A;
7459     } elsif ($self->{nc} == -1) {
7460     ## XML5: No parse error.
7461     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7462     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7463     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7464    
7465     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7466     $self->{line_prev} = $self->{line};
7467     $self->{column_prev} = $self->{column};
7468     $self->{column}++;
7469     $self->{nc}
7470     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7471     } else {
7472     $self->{set_nc}->($self);
7473     }
7474    
7475     return ($self->{ct});
7476     redo A;
7477     } else {
7478     $self->{ca}->{default} .= chr $self->{nc};
7479     ## Stay in the state.
7480    
7481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7482     $self->{line_prev} = $self->{line};
7483     $self->{column_prev} = $self->{column};
7484     $self->{column}++;
7485     $self->{nc}
7486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7487     } else {
7488     $self->{set_nc}->($self);
7489     }
7490    
7491     redo A;
7492     }
7493     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7494     if ($is_space->{$self->{nc}}) {
7495     ## Stay in the state.
7496    
7497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7498     $self->{line_prev} = $self->{line};
7499     $self->{column_prev} = $self->{column};
7500     $self->{column}++;
7501     $self->{nc}
7502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7503     } else {
7504     $self->{set_nc}->($self);
7505     }
7506    
7507     redo A;
7508     } elsif ($self->{nc} == 0x0022) { # "
7509     $self->{ca}->{value} = '';
7510     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7511    
7512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7513     $self->{line_prev} = $self->{line};
7514     $self->{column_prev} = $self->{column};
7515     $self->{column}++;
7516     $self->{nc}
7517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7518     } else {
7519     $self->{set_nc}->($self);
7520     }
7521    
7522     redo A;
7523     } elsif ($self->{nc} == 0x0027) { # '
7524     $self->{ca}->{value} = '';
7525     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7526    
7527     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7528     $self->{line_prev} = $self->{line};
7529     $self->{column_prev} = $self->{column};
7530     $self->{column}++;
7531     $self->{nc}
7532     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7533     } else {
7534     $self->{set_nc}->($self);
7535     }
7536    
7537     redo A;
7538     } elsif ($self->{nc} == 0x003E) { # >
7539     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7541    
7542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7543     $self->{line_prev} = $self->{line};
7544     $self->{column_prev} = $self->{column};
7545     $self->{column}++;
7546     $self->{nc}
7547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7548     } else {
7549     $self->{set_nc}->($self);
7550     }
7551    
7552     return ($self->{ct}); # ATTLIST
7553     redo A;
7554     } elsif ($self->{nc} == -1) {
7555     ## XML5: No parse error.
7556     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7557     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7559    
7560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7561     $self->{line_prev} = $self->{line};
7562     $self->{column_prev} = $self->{column};
7563     $self->{column}++;
7564     $self->{nc}
7565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7566     } else {
7567     $self->{set_nc}->($self);
7568     }
7569    
7570     return ($self->{ct});
7571     redo A;
7572     } else {
7573     ## XML5: Not defined yet.
7574     if ($self->{ca}->{default} eq 'FIXED') {
7575     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7576     } else {
7577     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7578     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7579     }
7580     ## Reconsume.
7581     redo A;
7582     }
7583     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7584     if ($is_space->{$self->{nc}} or
7585     $self->{nc} == -1 or
7586     $self->{nc} == 0x003E) { # >
7587     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7588     ## Reconsume.
7589     redo A;
7590     } else {
7591     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7592     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7593     ## Reconsume.
7594     redo A;
7595 wakaba 1.16 }
7596 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7597     ## ASCII case-insensitive
7598     if ($self->{nc} == [
7599     undef,
7600     0x0044, # D
7601     0x0041, # A
7602     0x0054, # T
7603     ]->[length $self->{kwd}] or
7604     $self->{nc} == [
7605     undef,
7606     0x0064, # d
7607     0x0061, # a
7608     0x0074, # t
7609     ]->[length $self->{kwd}]) {
7610    
7611     ## Stay in the state.
7612     $self->{kwd} .= chr $self->{nc};
7613    
7614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7615     $self->{line_prev} = $self->{line};
7616     $self->{column_prev} = $self->{column};
7617     $self->{column}++;
7618     $self->{nc}
7619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7620     } else {
7621     $self->{set_nc}->($self);
7622     }
7623    
7624     redo A;
7625     } elsif ((length $self->{kwd}) == 4 and
7626     ($self->{nc} == 0x0041 or # A
7627     $self->{nc} == 0x0061)) { # a
7628     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7629    
7630     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7631     text => 'NDATA',
7632     line => $self->{line_prev},
7633     column => $self->{column_prev} - 4);
7634     } else {
7635    
7636     }
7637     $self->{state} = AFTER_NDATA_STATE;
7638    
7639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7640     $self->{line_prev} = $self->{line};
7641     $self->{column_prev} = $self->{column};
7642     $self->{column}++;
7643     $self->{nc}
7644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7645     } else {
7646     $self->{set_nc}->($self);
7647     }
7648    
7649     redo A;
7650     } else {
7651     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7652     line => $self->{line_prev},
7653     column => $self->{column_prev} + 1
7654     - length $self->{kwd});
7655    
7656     $self->{state} = BOGUS_MD_STATE;
7657     ## Reconsume.
7658     redo A;
7659     }
7660     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7661     if ($is_space->{$self->{nc}}) {
7662     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7663    
7664     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7665     $self->{line_prev} = $self->{line};
7666     $self->{column_prev} = $self->{column};
7667     $self->{column}++;
7668     $self->{nc}
7669     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7670     } else {
7671     $self->{set_nc}->($self);
7672     }
7673    
7674     redo A;
7675     } elsif ($self->{nc} == 0x003E) { # >
7676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7677     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7678    
7679     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7680     $self->{line_prev} = $self->{line};
7681     $self->{column_prev} = $self->{column};
7682     $self->{column}++;
7683     $self->{nc}
7684     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7685     } else {
7686     $self->{set_nc}->($self);
7687     }
7688    
7689     return ($self->{ct}); # ENTITY
7690     redo A;
7691     } elsif ($self->{nc} == -1) {
7692     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7693     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7694    
7695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7696     $self->{line_prev} = $self->{line};
7697     $self->{column_prev} = $self->{column};
7698     $self->{column}++;
7699     $self->{nc}
7700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7701     } else {
7702     $self->{set_nc}->($self);
7703     }
7704    
7705     return ($self->{ct}); # ENTITY
7706     redo A;
7707     } else {
7708     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7709     line => $self->{line_prev},
7710     column => $self->{column_prev} + 1
7711     - length $self->{kwd});
7712     $self->{state} = BOGUS_MD_STATE;
7713     ## Reconsume.
7714     redo A;
7715     }
7716     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7717     if ($is_space->{$self->{nc}}) {
7718     ## Stay in the state.
7719    
7720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7721     $self->{line_prev} = $self->{line};
7722     $self->{column_prev} = $self->{column};
7723     $self->{column}++;
7724     $self->{nc}
7725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7726     } else {
7727     $self->{set_nc}->($self);
7728     }
7729    
7730     redo A;
7731     } elsif ($self->{nc} == 0x003E) { # >
7732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7733     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7734    
7735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7736     $self->{line_prev} = $self->{line};
7737     $self->{column_prev} = $self->{column};
7738     $self->{column}++;
7739     $self->{nc}
7740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7741     } else {
7742     $self->{set_nc}->($self);
7743     }
7744    
7745     return ($self->{ct}); # ENTITY
7746     redo A;
7747     } elsif ($self->{nc} == -1) {
7748     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7749     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7750    
7751     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7752     $self->{line_prev} = $self->{line};
7753     $self->{column_prev} = $self->{column};
7754     $self->{column}++;
7755     $self->{nc}
7756     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7757     } else {
7758     $self->{set_nc}->($self);
7759     }
7760    
7761     return ($self->{ct}); # ENTITY
7762     redo A;
7763     } else {
7764     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7765     $self->{state} = NOTATION_NAME_STATE;
7766    
7767     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7768     $self->{line_prev} = $self->{line};
7769     $self->{column_prev} = $self->{column};
7770     $self->{column}++;
7771     $self->{nc}
7772     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7773     } else {
7774     $self->{set_nc}->($self);
7775     }
7776    
7777     redo A;
7778     }
7779     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7780     if ($is_space->{$self->{nc}}) {
7781 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7782 wakaba 1.18
7783     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7784     $self->{line_prev} = $self->{line};
7785     $self->{column_prev} = $self->{column};
7786     $self->{column}++;
7787     $self->{nc}
7788     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7789     } else {
7790     $self->{set_nc}->($self);
7791     }
7792    
7793     redo A;
7794     } elsif ($self->{nc} == 0x003E) { # >
7795     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7796    
7797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7798     $self->{line_prev} = $self->{line};
7799     $self->{column_prev} = $self->{column};
7800     $self->{column}++;
7801     $self->{nc}
7802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7803     } else {
7804     $self->{set_nc}->($self);
7805     }
7806    
7807     return ($self->{ct}); # ENTITY
7808     redo A;
7809     } elsif ($self->{nc} == -1) {
7810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7811     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7812    
7813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7814     $self->{line_prev} = $self->{line};
7815     $self->{column_prev} = $self->{column};
7816     $self->{column}++;
7817     $self->{nc}
7818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7819     } else {
7820     $self->{set_nc}->($self);
7821     }
7822    
7823     return ($self->{ct}); # ENTITY
7824     redo A;
7825     } else {
7826     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7827     ## Stay in the state.
7828    
7829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7830     $self->{line_prev} = $self->{line};
7831     $self->{column_prev} = $self->{column};
7832     $self->{column}++;
7833     $self->{nc}
7834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7835     } else {
7836     $self->{set_nc}->($self);
7837     }
7838    
7839     redo A;
7840     }
7841 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7842     if ($self->{nc} == 0x0022) { # "
7843 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7844 wakaba 1.19
7845     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7846     $self->{line_prev} = $self->{line};
7847     $self->{column_prev} = $self->{column};
7848     $self->{column}++;
7849     $self->{nc}
7850     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7851     } else {
7852     $self->{set_nc}->($self);
7853     }
7854    
7855     redo A;
7856     } elsif ($self->{nc} == 0x0026) { # &
7857     $self->{prev_state} = $self->{state};
7858     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7859     $self->{entity_add} = 0x0022; # "
7860    
7861     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7862     $self->{line_prev} = $self->{line};
7863     $self->{column_prev} = $self->{column};
7864     $self->{column}++;
7865     $self->{nc}
7866     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7867     } else {
7868     $self->{set_nc}->($self);
7869     }
7870    
7871     redo A;
7872     ## TODO: %
7873     } elsif ($self->{nc} == -1) {
7874     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7875     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7876     ## Reconsume.
7877     return ($self->{ct}); # ENTITY
7878     redo A;
7879     } else {
7880     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7881    
7882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7883     $self->{line_prev} = $self->{line};
7884     $self->{column_prev} = $self->{column};
7885     $self->{column}++;
7886     $self->{nc}
7887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7888     } else {
7889     $self->{set_nc}->($self);
7890     }
7891    
7892     redo A;
7893     }
7894     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7895     if ($self->{nc} == 0x0027) { # '
7896 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7897 wakaba 1.19
7898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7899     $self->{line_prev} = $self->{line};
7900     $self->{column_prev} = $self->{column};
7901     $self->{column}++;
7902     $self->{nc}
7903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7904     } else {
7905     $self->{set_nc}->($self);
7906     }
7907    
7908     redo A;
7909     } elsif ($self->{nc} == 0x0026) { # &
7910     $self->{prev_state} = $self->{state};
7911     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7912     $self->{entity_add} = 0x0027; # '
7913    
7914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7915     $self->{line_prev} = $self->{line};
7916     $self->{column_prev} = $self->{column};
7917     $self->{column}++;
7918     $self->{nc}
7919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7920     } else {
7921     $self->{set_nc}->($self);
7922     }
7923    
7924     redo A;
7925     ## TODO: %
7926     } elsif ($self->{nc} == -1) {
7927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7928     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7929     ## Reconsume.
7930     return ($self->{ct}); # ENTITY
7931     redo A;
7932     } else {
7933     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7934    
7935     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7936     $self->{line_prev} = $self->{line};
7937     $self->{column_prev} = $self->{column};
7938     $self->{column}++;
7939     $self->{nc}
7940     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7941     } else {
7942     $self->{set_nc}->($self);
7943     }
7944    
7945     redo A;
7946     }
7947     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7948     if ($is_space->{$self->{nc}} or
7949     {
7950     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7951     $self->{entity_add} => 1,
7952     }->{$self->{nc}}) {
7953 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7954     line => $self->{line_prev},
7955     column => $self->{column_prev}
7956     + ($self->{nc} == -1 ? 1 : 0));
7957 wakaba 1.19 ## Don't consume
7958     ## Return nothing.
7959     #
7960     } elsif ($self->{nc} == 0x0023) { # #
7961     $self->{ca} = $self->{ct};
7962     $self->{state} = ENTITY_HASH_STATE;
7963     $self->{kwd} = '#';
7964    
7965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7966     $self->{line_prev} = $self->{line};
7967     $self->{column_prev} = $self->{column};
7968     $self->{column}++;
7969     $self->{nc}
7970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7971     } else {
7972     $self->{set_nc}->($self);
7973     }
7974    
7975     redo A;
7976     } else {
7977     #
7978     }
7979    
7980     $self->{ct}->{value} .= '&';
7981     $self->{state} = $self->{prev_state};
7982     ## Reconsume.
7983     redo A;
7984 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7985     if ($is_space->{$self->{nc}}) {
7986     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7987    
7988     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7989     $self->{line_prev} = $self->{line};
7990     $self->{column_prev} = $self->{column};
7991     $self->{column}++;
7992     $self->{nc}
7993     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7994     } else {
7995     $self->{set_nc}->($self);
7996     }
7997    
7998     redo A;
7999     } elsif ($self->{nc} == 0x0028) { # (
8000     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8001     $self->{ct}->{content} = ['('];
8002     $self->{group_depth} = 1;
8003    
8004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8005     $self->{line_prev} = $self->{line};
8006     $self->{column_prev} = $self->{column};
8007     $self->{column}++;
8008     $self->{nc}
8009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8010     } else {
8011     $self->{set_nc}->($self);
8012     }
8013    
8014     redo A;
8015     } elsif ($self->{nc} == 0x003E) { # >
8016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8017     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8018    
8019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8020     $self->{line_prev} = $self->{line};
8021     $self->{column_prev} = $self->{column};
8022     $self->{column}++;
8023     $self->{nc}
8024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8025     } else {
8026     $self->{set_nc}->($self);
8027     }
8028    
8029     return ($self->{ct}); # ELEMENT
8030     redo A;
8031     } elsif ($self->{nc} == -1) {
8032     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8033     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8034    
8035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8036     $self->{line_prev} = $self->{line};
8037     $self->{column_prev} = $self->{column};
8038     $self->{column}++;
8039     $self->{nc}
8040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8041     } else {
8042     $self->{set_nc}->($self);
8043     }
8044    
8045     return ($self->{ct}); # ELEMENT
8046     redo A;
8047     } else {
8048     $self->{ct}->{content} = [chr $self->{nc}];
8049     $self->{state} = CONTENT_KEYWORD_STATE;
8050    
8051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8052     $self->{line_prev} = $self->{line};
8053     $self->{column_prev} = $self->{column};
8054     $self->{column}++;
8055     $self->{nc}
8056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8057     } else {
8058     $self->{set_nc}->($self);
8059     }
8060    
8061     redo A;
8062     }
8063     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8064     if ($is_space->{$self->{nc}}) {
8065     $self->{state} = AFTER_MD_DEF_STATE;
8066    
8067     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8068     $self->{line_prev} = $self->{line};
8069     $self->{column_prev} = $self->{column};
8070     $self->{column}++;
8071     $self->{nc}
8072     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8073     } else {
8074     $self->{set_nc}->($self);
8075     }
8076    
8077     redo A;
8078     } elsif ($self->{nc} == 0x003E) { # >
8079     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8080    
8081     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8082     $self->{line_prev} = $self->{line};
8083     $self->{column_prev} = $self->{column};
8084     $self->{column}++;
8085     $self->{nc}
8086     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8087     } else {
8088     $self->{set_nc}->($self);
8089     }
8090    
8091     return ($self->{ct}); # ELEMENT
8092     redo A;
8093     } elsif ($self->{nc} == -1) {
8094     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8095     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8096    
8097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8098     $self->{line_prev} = $self->{line};
8099     $self->{column_prev} = $self->{column};
8100     $self->{column}++;
8101     $self->{nc}
8102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8103     } else {
8104     $self->{set_nc}->($self);
8105     }
8106    
8107     return ($self->{ct}); # ELEMENT
8108     redo A;
8109     } else {
8110     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8111     ## Stay in the state.
8112    
8113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8114     $self->{line_prev} = $self->{line};
8115     $self->{column_prev} = $self->{column};
8116     $self->{column}++;
8117     $self->{nc}
8118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8119     } else {
8120     $self->{set_nc}->($self);
8121     }
8122    
8123     redo A;
8124     }
8125     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8126     if ($is_space->{$self->{nc}}) {
8127     ## Stay in the state.
8128    
8129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8130     $self->{line_prev} = $self->{line};
8131     $self->{column_prev} = $self->{column};
8132     $self->{column}++;
8133     $self->{nc}
8134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8135     } else {
8136     $self->{set_nc}->($self);
8137     }
8138    
8139     redo A;
8140     } elsif ($self->{nc} == 0x0028) { # (
8141     $self->{group_depth}++;
8142     push @{$self->{ct}->{content}}, chr $self->{nc};
8143     ## Stay in the state.
8144    
8145     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8146     $self->{line_prev} = $self->{line};
8147     $self->{column_prev} = $self->{column};
8148     $self->{column}++;
8149     $self->{nc}
8150     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8151     } else {
8152     $self->{set_nc}->($self);
8153     }
8154    
8155     redo A;
8156     } elsif ($self->{nc} == 0x007C or # |
8157     $self->{nc} == 0x002C) { # ,
8158     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8159     ## Stay in the state.
8160    
8161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8162     $self->{line_prev} = $self->{line};
8163     $self->{column_prev} = $self->{column};
8164     $self->{column}++;
8165     $self->{nc}
8166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8167     } else {
8168     $self->{set_nc}->($self);
8169     }
8170    
8171     redo A;
8172     } elsif ($self->{nc} == 0x0029) { # )
8173     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8174     push @{$self->{ct}->{content}}, chr $self->{nc};
8175     $self->{group_depth}--;
8176     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8177    
8178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8179     $self->{line_prev} = $self->{line};
8180     $self->{column_prev} = $self->{column};
8181     $self->{column}++;
8182     $self->{nc}
8183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8184     } else {
8185     $self->{set_nc}->($self);
8186     }
8187    
8188     redo A;
8189     } elsif ($self->{nc} == 0x003E) { # >
8190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8191     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8192     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8193    
8194     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8195     $self->{line_prev} = $self->{line};
8196     $self->{column_prev} = $self->{column};
8197     $self->{column}++;
8198     $self->{nc}
8199     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8200     } else {
8201     $self->{set_nc}->($self);
8202     }
8203    
8204     return ($self->{ct}); # ELEMENT
8205     redo A;
8206     } elsif ($self->{nc} == -1) {
8207     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8208     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8210    
8211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8212     $self->{line_prev} = $self->{line};
8213     $self->{column_prev} = $self->{column};
8214     $self->{column}++;
8215     $self->{nc}
8216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8217     } else {
8218     $self->{set_nc}->($self);
8219     }
8220    
8221     return ($self->{ct}); # ELEMENT
8222     redo A;
8223     } else {
8224     push @{$self->{ct}->{content}}, chr $self->{nc};
8225     $self->{state} = CM_ELEMENT_NAME_STATE;
8226    
8227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8228     $self->{line_prev} = $self->{line};
8229     $self->{column_prev} = $self->{column};
8230     $self->{column}++;
8231     $self->{nc}
8232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8233     } else {
8234     $self->{set_nc}->($self);
8235     }
8236    
8237     redo A;
8238     }
8239     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8240     if ($is_space->{$self->{nc}}) {
8241     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8242    
8243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8244     $self->{line_prev} = $self->{line};
8245     $self->{column_prev} = $self->{column};
8246     $self->{column}++;
8247     $self->{nc}
8248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8249     } else {
8250     $self->{set_nc}->($self);
8251     }
8252    
8253     redo A;
8254     } elsif ($self->{nc} == 0x002A or # *
8255     $self->{nc} == 0x002B or # +
8256     $self->{nc} == 0x003F) { # ?
8257     push @{$self->{ct}->{content}}, chr $self->{nc};
8258     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8259    
8260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8261     $self->{line_prev} = $self->{line};
8262     $self->{column_prev} = $self->{column};
8263     $self->{column}++;
8264     $self->{nc}
8265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8266     } else {
8267     $self->{set_nc}->($self);
8268     }
8269    
8270     redo A;
8271     } elsif ($self->{nc} == 0x007C or # |
8272     $self->{nc} == 0x002C) { # ,
8273     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8274     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8275    
8276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8277     $self->{line_prev} = $self->{line};
8278     $self->{column_prev} = $self->{column};
8279     $self->{column}++;
8280     $self->{nc}
8281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8282     } else {
8283     $self->{set_nc}->($self);
8284     }
8285    
8286     redo A;
8287     } elsif ($self->{nc} == 0x0029) { # )
8288     $self->{group_depth}--;
8289     push @{$self->{ct}->{content}}, chr $self->{nc};
8290     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8291    
8292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8293     $self->{line_prev} = $self->{line};
8294     $self->{column_prev} = $self->{column};
8295     $self->{column}++;
8296     $self->{nc}
8297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8298     } else {
8299     $self->{set_nc}->($self);
8300     }
8301    
8302     redo A;
8303     } elsif ($self->{nc} == 0x003E) { # >
8304     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8305     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8307    
8308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8309     $self->{line_prev} = $self->{line};
8310     $self->{column_prev} = $self->{column};
8311     $self->{column}++;
8312     $self->{nc}
8313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8314     } else {
8315     $self->{set_nc}->($self);
8316     }
8317    
8318     return ($self->{ct}); # ELEMENT
8319     redo A;
8320     } elsif ($self->{nc} == -1) {
8321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8322     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8323     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8324    
8325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8326     $self->{line_prev} = $self->{line};
8327     $self->{column_prev} = $self->{column};
8328     $self->{column}++;
8329     $self->{nc}
8330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8331     } else {
8332     $self->{set_nc}->($self);
8333     }
8334    
8335     return ($self->{ct}); # ELEMENT
8336     redo A;
8337     } else {
8338     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8339     ## Stay in the state.
8340    
8341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342     $self->{line_prev} = $self->{line};
8343     $self->{column_prev} = $self->{column};
8344     $self->{column}++;
8345     $self->{nc}
8346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347     } else {
8348     $self->{set_nc}->($self);
8349     }
8350    
8351     redo A;
8352     }
8353     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8354     if ($is_space->{$self->{nc}}) {
8355     ## Stay in the state.
8356    
8357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358     $self->{line_prev} = $self->{line};
8359     $self->{column_prev} = $self->{column};
8360     $self->{column}++;
8361     $self->{nc}
8362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363     } else {
8364     $self->{set_nc}->($self);
8365     }
8366    
8367     redo A;
8368     } elsif ($self->{nc} == 0x007C or # |
8369     $self->{nc} == 0x002C) { # ,
8370     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8371     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8372    
8373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8374     $self->{line_prev} = $self->{line};
8375     $self->{column_prev} = $self->{column};
8376     $self->{column}++;
8377     $self->{nc}
8378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8379     } else {
8380     $self->{set_nc}->($self);
8381     }
8382    
8383     redo A;
8384     } elsif ($self->{nc} == 0x0029) { # )
8385     $self->{group_depth}--;
8386     push @{$self->{ct}->{content}}, chr $self->{nc};
8387     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8388    
8389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8390     $self->{line_prev} = $self->{line};
8391     $self->{column_prev} = $self->{column};
8392     $self->{column}++;
8393     $self->{nc}
8394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8395     } else {
8396     $self->{set_nc}->($self);
8397     }
8398    
8399     redo A;
8400     } elsif ($self->{nc} == 0x003E) { # >
8401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8402     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8403     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8404    
8405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8406     $self->{line_prev} = $self->{line};
8407     $self->{column_prev} = $self->{column};
8408     $self->{column}++;
8409     $self->{nc}
8410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8411     } else {
8412     $self->{set_nc}->($self);
8413     }
8414    
8415     return ($self->{ct}); # ELEMENT
8416     redo A;
8417     } elsif ($self->{nc} == -1) {
8418     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8419     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8420     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8421    
8422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423     $self->{line_prev} = $self->{line};
8424     $self->{column_prev} = $self->{column};
8425     $self->{column}++;
8426     $self->{nc}
8427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428     } else {
8429     $self->{set_nc}->($self);
8430     }
8431    
8432     return ($self->{ct}); # ELEMENT
8433     redo A;
8434     } else {
8435     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8436     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8437     $self->{state} = BOGUS_MD_STATE;
8438    
8439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440     $self->{line_prev} = $self->{line};
8441     $self->{column_prev} = $self->{column};
8442     $self->{column}++;
8443     $self->{nc}
8444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445     } else {
8446     $self->{set_nc}->($self);
8447     }
8448    
8449     redo A;
8450     }
8451     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8452     if ($is_space->{$self->{nc}}) {
8453     if ($self->{group_depth}) {
8454     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8455     } else {
8456     $self->{state} = AFTER_MD_DEF_STATE;
8457     }
8458    
8459     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8460     $self->{line_prev} = $self->{line};
8461     $self->{column_prev} = $self->{column};
8462     $self->{column}++;
8463     $self->{nc}
8464     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8465     } else {
8466     $self->{set_nc}->($self);
8467     }
8468    
8469     redo A;
8470     } elsif ($self->{nc} == 0x002A or # *
8471     $self->{nc} == 0x002B or # +
8472     $self->{nc} == 0x003F) { # ?
8473     push @{$self->{ct}->{content}}, chr $self->{nc};
8474     if ($self->{group_depth}) {
8475     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8476     } else {
8477     $self->{state} = AFTER_MD_DEF_STATE;
8478     }
8479    
8480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8481     $self->{line_prev} = $self->{line};
8482     $self->{column_prev} = $self->{column};
8483     $self->{column}++;
8484     $self->{nc}
8485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8486     } else {
8487     $self->{set_nc}->($self);
8488     }
8489    
8490     redo A;
8491     } elsif ($self->{nc} == 0x0029) { # )
8492     if ($self->{group_depth}) {
8493     $self->{group_depth}--;
8494     push @{$self->{ct}->{content}}, chr $self->{nc};
8495     ## Stay in the state.
8496    
8497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8498     $self->{line_prev} = $self->{line};
8499     $self->{column_prev} = $self->{column};
8500     $self->{column}++;
8501     $self->{nc}
8502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8503     } else {
8504     $self->{set_nc}->($self);
8505     }
8506    
8507     redo A;
8508     } else {
8509     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8510     $self->{state} = BOGUS_MD_STATE;
8511     ## Reconsume.
8512     redo A;
8513     }
8514     } elsif ($self->{nc} == 0x003E) { # >
8515     if ($self->{group_depth}) {
8516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8517     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518     }
8519     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8520    
8521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8522     $self->{line_prev} = $self->{line};
8523     $self->{column_prev} = $self->{column};
8524     $self->{column}++;
8525     $self->{nc}
8526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8527     } else {
8528     $self->{set_nc}->($self);
8529     }
8530    
8531     return ($self->{ct}); # ELEMENT
8532     redo A;
8533     } elsif ($self->{nc} == -1) {
8534     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8535     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8536     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8537    
8538     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8539     $self->{line_prev} = $self->{line};
8540     $self->{column_prev} = $self->{column};
8541     $self->{column}++;
8542     $self->{nc}
8543     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8544     } else {
8545     $self->{set_nc}->($self);
8546     }
8547    
8548     return ($self->{ct}); # ELEMENT
8549     redo A;
8550     } else {
8551     if ($self->{group_depth}) {
8552     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8553     } else {
8554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8555     $self->{state} = BOGUS_MD_STATE;
8556     }
8557     ## Reconsume.
8558     redo A;
8559     }
8560     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8561 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8562     ## Stay in the state.
8563    
8564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8565     $self->{line_prev} = $self->{line};
8566     $self->{column_prev} = $self->{column};
8567     $self->{column}++;
8568     $self->{nc}
8569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8570     } else {
8571     $self->{set_nc}->($self);
8572     }
8573    
8574     redo A;
8575     } elsif ($self->{nc} == 0x003E) { # >
8576     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8577    
8578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8579     $self->{line_prev} = $self->{line};
8580     $self->{column_prev} = $self->{column};
8581     $self->{column}++;
8582     $self->{nc}
8583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8584     } else {
8585     $self->{set_nc}->($self);
8586     }
8587    
8588 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8589 wakaba 1.18 redo A;
8590     } elsif ($self->{nc} == -1) {
8591     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8592     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8593    
8594     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8595     $self->{line_prev} = $self->{line};
8596     $self->{column_prev} = $self->{column};
8597     $self->{column}++;
8598     $self->{nc}
8599     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8600     } else {
8601     $self->{set_nc}->($self);
8602     }
8603    
8604 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8605 wakaba 1.18 redo A;
8606     } else {
8607 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8608 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8609     ## Reconsume.
8610     redo A;
8611     }
8612 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8613     if ($self->{nc} == 0x003E) { # >
8614     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8615    
8616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617     $self->{line_prev} = $self->{line};
8618     $self->{column_prev} = $self->{column};
8619     $self->{column}++;
8620     $self->{nc}
8621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622     } else {
8623     $self->{set_nc}->($self);
8624     }
8625    
8626     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8627     redo A;
8628     } elsif ($self->{nc} == -1) {
8629     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8630     ## Reconsume.
8631     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8632     redo A;
8633     } else {
8634     ## Stay in the state.
8635    
8636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8637     $self->{line_prev} = $self->{line};
8638     $self->{column_prev} = $self->{column};
8639     $self->{column}++;
8640     $self->{nc}
8641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8642     } else {
8643     $self->{set_nc}->($self);
8644     }
8645    
8646     redo A;
8647     }
8648 wakaba 1.1 } else {
8649     die "$0: $self->{state}: Unknown state";
8650     }
8651     } # A
8652    
8653     die "$0: _get_next_token: unexpected case";
8654     } # _get_next_token
8655    
8656     1; # A module file must end with a true value so that `require`/`use` succeeds.
8657 wakaba 1.27 ## $Date: 2009/07/02 21:42:43 $
8658 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24