/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.32 - (hide annotations) (download)
Sat Sep 5 09:57:55 2009 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.31: +100 -5 lines
++ whatpm/t/ChangeLog	5 Sep 2009 09:57:06 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Added test cases for "comment end space
	state" (HTML5 revision 3195).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:57:45 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	space state" (HTML5 revision 3195).

package Whatpm::HTML::Tokenizer;
use strict;
## The version string is built from the digits of the CVS |Revision|
## keyword: the first number, a dot, then each remaining number padded
## to two digits (so "1.31" yields |1.31|).
our $VERSION=do{my @r=(q$Revision: 1.31 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Exporter setup.  This block makes the package an |Exporter| subclass
## and lists the token type constants as importable on request, either
## by name or all at once through the |:token| tag.  |@EXPORT| is not
## set here.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  our @EXPORT_OK = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  ## The |:token| tag names exactly the constants listed in
  ## |@EXPORT_OK|, in the same order; copying the list keeps the two
  ## from drifting apart.
  our %EXPORT_TAGS = (
    token => [@EXPORT_OK],
  );
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
## Token types
##
## Each constant is a distinct small integer used as the |type| field
## of a token.  The empty prototype |()| lets Perl treat these subs as
## inlinable constants.

sub DOCTYPE_TOKEN          () { 1 }  ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN          () { 2 }
sub START_TAG_TOKEN        () { 3 }
sub END_TAG_TOKEN          () { 4 }
sub END_OF_FILE_TOKEN      () { 5 }
sub CHARACTER_TOKEN        () { 6 }
sub PI_TOKEN               () { 7 }  ## NOTE: XML only.
sub ABORT_TOKEN            () { 8 }  ## NOTE: For internal processing.
sub END_OF_DOCTYPE_TOKEN   () { 9 }  ## NOTE: XML only.
sub ATTLIST_TOKEN          () { 10 } ## NOTE: XML only.
sub ELEMENT_TOKEN          () { 11 } ## NOTE: XML only.
sub GENERAL_ENTITY_TOKEN   () { 12 } ## NOTE: XML only.
sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
sub NOTATION_TOKEN         () { 14 } ## NOTE: XML only.
64 wakaba 1.12
## XML5: XML5 has an "empty tag token".  In this implementation, it is
## represented as a start tag token with the |$self->{self_closing}|
## flag set to true.

## XML5: XML5 has a "short end tag token".  In this implementation, it
## is represented as an end tag token whose |$token->{tag_name}| field
## is set to the empty string.
72 wakaba 1.1
package Whatpm::HTML;

## Pull the token type constants (the |:token| export tag of
## |Whatpm::HTML::Tokenizer|) into this package at compile time, so
## that the code below can use them as bare names.
BEGIN { Whatpm::HTML::Tokenizer->import (':token') }

## Content model flags
##
## The three |CM_*| constants are single-bit flags; the four
## |*_CONTENT_MODEL| constants are the combinations of those bits that
## the tokenizer stores in |$self->{content_model}|.

sub CM_ENTITY         () { 0b001 } # & markup in data
sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 0b100 } # < markup in data (any)

sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_FULL_MARKUP }
87    
## Tokenizer states
##
## Each constant is the integer stored in |$self->{state}| while the
## tokenizer is in that state.  Numbers 1 and 12 are not in use (their
## definitions are commented out).  The two states numbered 102 and 103
## are listed next to the other comment states but carry the highest
## numbers defined in this list; "## LAST" marks the highest one.

sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_BANG_STATE () { 102 }
sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }
sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
## XML-only states
##
## Further values for |$self->{state}|, numbered consecutively from 51
## to 101.
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
199 wakaba 1.8
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: As written here, both literals evaluate to the same number
## (2048, i.e. only bit 11 set); the underscore in the second literal
## is just a digit separator.

sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
sub FOREIGN_EL            () { 0b1_00000000000 }
205    
## Character reference mappings
##
## |$charref_map| maps a code point to the code point that replaces it.
## NOTE(review): presumably consulted when a numeric character
## reference is resolved; the lookup itself is outside this part of the
## file.

my $charref_map = {
  0x0D => 0x000A,
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Everything below maps to U+FFFD: the listed control code points,
## U+007F, the surrogate range, U+FDD0..U+FDDF, and the last two code
## points (xFFFE and xFFFF) of each of the 17 planes 0x00..0x10.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00..0x10);
251    
## Implementations MUST act as if they used the state machine described
## in the spec.
253    
## Resets the tokenizer fields of the object to their initial values
## and then asks for the first input character.
##
## Invoked with the object as the only argument.  The |set_nc| field
## must already hold a code reference; it is called once, with the
## object as its argument.  The value of the last statement (the new,
## empty |token| array reference) is what the sub returns.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to take here; the first input character always
  ## comes from the |set_nc| callback.
  $self->{set_nc}->($self);

  ## Queue of tokens waiting to be returned; starts out empty.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
292    
293     ## A token has:
294     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
295 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
296 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
297     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
298 wakaba 1.11 ## ->{target} (PI_TOKEN)
299 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
300     ## ->{sysid} (DOCTYPE_TOKEN)
301     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
302     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
303     ## ->{name}
304     ## ->{value}
305     ## ->{has_reference} == 1 or 0
306 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
307     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
308 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
309 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
310 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
311    
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
## A token's own |->{self_closing}| field is used to save the value of
## |$self->{self_closing}| while the token is pushed back to the stack.
315    
316     ## Emitted token MUST immediately be handled by the tree construction state.
317    
318     ## Before each step, UA MAY check to see if either one of the scripts in
319     ## "list of scripts that will execute as soon as possible" or the first
320     ## script in the "list of scripts that will execute asynchronously",
321     ## has completed loading. If one has, then it MUST be executed
322     ## and removed from the list.
323    
324     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
325     ## (This requirement was dropped from HTML5 spec, unfortunately.)
326    
## Lookup table of the code points treated as space characters: a code
## point is a space iff |$is_space->{$code}| is true.  U+000B and
## U+000D are deliberately left out (their entries are commented out).
my $is_space = {
  0x0009 => 1, # CHARACTER TABULATION (HT)
  0x000A => 1, # LINE FEED (LF)
  #0x000B => 0, # LINE TABULATION (VT)
  0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
  #0x000D => 1, # CARRIAGE RETURN (CR)
  0x0020 => 1, # SPACE (SP)
};
335    
336     sub _get_next_token ($) {
337     my $self = shift;
338    
339     if ($self->{self_closing}) {
340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
341     ## NOTE: The |self_closing| flag is only set by start tag token.
342     ## In addition, when a start tag token is emitted, it is always set to
343     ## |ct|.
344     delete $self->{self_closing};
345     }
346    
347     if (@{$self->{token}}) {
348     $self->{self_closing} = $self->{token}->[0]->{self_closing};
349     return shift @{$self->{token}};
350     }
351    
352     A: {
353     if ($self->{state} == PCDATA_STATE) {
354     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
355    
356     if ($self->{nc} == 0x0026) { # &
357    
358     ## NOTE: In the spec, the tokenizer is switched to the
359     ## "entity data state". In this implementation, the tokenizer
360     ## is switched to the |ENTITY_STATE|, which is an implementation
361     ## of the "consume a character reference" algorithm.
362     $self->{entity_add} = -1;
363     $self->{prev_state} = DATA_STATE;
364     $self->{state} = ENTITY_STATE;
365    
366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
367     $self->{line_prev} = $self->{line};
368     $self->{column_prev} = $self->{column};
369     $self->{column}++;
370     $self->{nc}
371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
372     } else {
373     $self->{set_nc}->($self);
374     }
375    
376     redo A;
377     } elsif ($self->{nc} == 0x003C) { # <
378    
379     $self->{state} = TAG_OPEN_STATE;
380    
381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
382     $self->{line_prev} = $self->{line};
383     $self->{column_prev} = $self->{column};
384     $self->{column}++;
385     $self->{nc}
386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
387     } else {
388     $self->{set_nc}->($self);
389     }
390    
391     redo A;
392     } elsif ($self->{nc} == -1) {
393    
394     return ({type => END_OF_FILE_TOKEN,
395     line => $self->{line}, column => $self->{column}});
396     last A; ## TODO: ok?
397     } else {
398    
399     #
400     }
401    
402     # Anything else
403     my $token = {type => CHARACTER_TOKEN,
404     data => chr $self->{nc},
405     line => $self->{line}, column => $self->{column},
406     };
407     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
408    
409     ## Stay in the state.
410    
411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
412     $self->{line_prev} = $self->{line};
413     $self->{column_prev} = $self->{column};
414     $self->{column}++;
415     $self->{nc}
416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
417     } else {
418     $self->{set_nc}->($self);
419     }
420    
421     return ($token);
422     redo A;
423     } elsif ($self->{state} == DATA_STATE) {
424     $self->{s_kwd} = '' unless defined $self->{s_kwd};
425     if ($self->{nc} == 0x0026) { # &
426     $self->{s_kwd} = '';
427     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
428     not $self->{escape}) {
429    
430     ## NOTE: In the spec, the tokenizer is switched to the
431     ## "entity data state". In this implementation, the tokenizer
432     ## is switched to the |ENTITY_STATE|, which is an implementation
433     ## of the "consume a character reference" algorithm.
434     $self->{entity_add} = -1;
435     $self->{prev_state} = DATA_STATE;
436     $self->{state} = ENTITY_STATE;
437    
438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
439     $self->{line_prev} = $self->{line};
440     $self->{column_prev} = $self->{column};
441     $self->{column}++;
442     $self->{nc}
443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
444     } else {
445     $self->{set_nc}->($self);
446     }
447    
448     redo A;
449     } else {
450    
451     #
452     }
453     } elsif ($self->{nc} == 0x002D) { # -
454     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
455 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
456 wakaba 1.1
457     $self->{escape} = 1; # unless $self->{escape};
458     $self->{s_kwd} = '--';
459     #
460 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
461 wakaba 1.1
462     $self->{s_kwd} = '--';
463     #
464 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
465    
466     $self->{s_kwd} .= '-';
467     #
468 wakaba 1.1 } else {
469    
470 wakaba 1.5 $self->{s_kwd} = '-';
471 wakaba 1.1 #
472     }
473     }
474    
475     #
476     } elsif ($self->{nc} == 0x0021) { # !
477     if (length $self->{s_kwd}) {
478    
479     $self->{s_kwd} .= '!';
480     #
481     } else {
482    
483     #$self->{s_kwd} = '';
484     #
485     }
486     #
487     } elsif ($self->{nc} == 0x003C) { # <
488     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
489     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
490     not $self->{escape})) {
491    
492     $self->{state} = TAG_OPEN_STATE;
493    
494     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
495     $self->{line_prev} = $self->{line};
496     $self->{column_prev} = $self->{column};
497     $self->{column}++;
498     $self->{nc}
499     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
500     } else {
501     $self->{set_nc}->($self);
502     }
503    
504     redo A;
505     } else {
506    
507     $self->{s_kwd} = '';
508     #
509     }
510     } elsif ($self->{nc} == 0x003E) { # >
511     if ($self->{escape} and
512     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
513     if ($self->{s_kwd} eq '--') {
514    
515     delete $self->{escape};
516 wakaba 1.5 #
517 wakaba 1.1 } else {
518    
519 wakaba 1.5 #
520 wakaba 1.1 }
521 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
522    
523     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
524     line => $self->{line_prev},
525     column => $self->{column_prev} - 1);
526     #
527 wakaba 1.1 } else {
528    
529 wakaba 1.5 #
530 wakaba 1.1 }
531    
532     $self->{s_kwd} = '';
533     #
534 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
535     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
536    
537     $self->{s_kwd} .= ']';
538     } elsif ($self->{s_kwd} eq ']]') {
539    
540     #
541     } else {
542    
543     $self->{s_kwd} = '';
544     }
545     #
546 wakaba 1.1 } elsif ($self->{nc} == -1) {
547    
548     $self->{s_kwd} = '';
549     return ({type => END_OF_FILE_TOKEN,
550     line => $self->{line}, column => $self->{column}});
551     last A; ## TODO: ok?
552     } else {
553    
554     $self->{s_kwd} = '';
555     #
556     }
557    
558     # Anything else
559     my $token = {type => CHARACTER_TOKEN,
560     data => chr $self->{nc},
561     line => $self->{line}, column => $self->{column},
562     };
563 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
564 wakaba 1.1 length $token->{data})) {
565     $self->{s_kwd} = '';
566     }
567    
568     ## Stay in the data state.
569 wakaba 1.5 if (not $self->{is_xml} and
570     $self->{content_model} == PCDATA_CONTENT_MODEL) {
571 wakaba 1.1
572     $self->{state} = PCDATA_STATE;
573     } else {
574    
575     ## Stay in the state.
576     }
577    
578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
579     $self->{line_prev} = $self->{line};
580     $self->{column_prev} = $self->{column};
581     $self->{column}++;
582     $self->{nc}
583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
584     } else {
585     $self->{set_nc}->($self);
586     }
587    
588     return ($token);
589     redo A;
590     } elsif ($self->{state} == TAG_OPEN_STATE) {
591 wakaba 1.10 ## XML5: "tag state".
592    
593 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
594     if ($self->{nc} == 0x002F) { # /
595    
596    
597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
598     $self->{line_prev} = $self->{line};
599     $self->{column_prev} = $self->{column};
600     $self->{column}++;
601     $self->{nc}
602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
603     } else {
604     $self->{set_nc}->($self);
605     }
606    
607     $self->{state} = CLOSE_TAG_OPEN_STATE;
608     redo A;
609     } elsif ($self->{nc} == 0x0021) { # !
610    
611 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
612 wakaba 1.1 #
613     } else {
614    
615 wakaba 1.12 $self->{s_kwd} = '';
616 wakaba 1.1 #
617     }
618    
619     ## reconsume
620     $self->{state} = DATA_STATE;
621     return ({type => CHARACTER_TOKEN, data => '<',
622     line => $self->{line_prev},
623     column => $self->{column_prev},
624     });
625     redo A;
626     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
627     if ($self->{nc} == 0x0021) { # !
628    
629     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
630    
631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
632     $self->{line_prev} = $self->{line};
633     $self->{column_prev} = $self->{column};
634     $self->{column}++;
635     $self->{nc}
636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
637     } else {
638     $self->{set_nc}->($self);
639     }
640    
641     redo A;
642     } elsif ($self->{nc} == 0x002F) { # /
643    
644     $self->{state} = CLOSE_TAG_OPEN_STATE;
645    
646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
647     $self->{line_prev} = $self->{line};
648     $self->{column_prev} = $self->{column};
649     $self->{column}++;
650     $self->{nc}
651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
652     } else {
653     $self->{set_nc}->($self);
654     }
655    
656     redo A;
657     } elsif (0x0041 <= $self->{nc} and
658     $self->{nc} <= 0x005A) { # A..Z
659    
660     $self->{ct}
661     = {type => START_TAG_TOKEN,
662 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
663 wakaba 1.1 line => $self->{line_prev},
664     column => $self->{column_prev}};
665     $self->{state} = TAG_NAME_STATE;
666    
667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668     $self->{line_prev} = $self->{line};
669     $self->{column_prev} = $self->{column};
670     $self->{column}++;
671     $self->{nc}
672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673     } else {
674     $self->{set_nc}->($self);
675     }
676    
677     redo A;
678     } elsif (0x0061 <= $self->{nc} and
679     $self->{nc} <= 0x007A) { # a..z
680    
681     $self->{ct} = {type => START_TAG_TOKEN,
682     tag_name => chr ($self->{nc}),
683     line => $self->{line_prev},
684     column => $self->{column_prev}};
685     $self->{state} = TAG_NAME_STATE;
686    
687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
688     $self->{line_prev} = $self->{line};
689     $self->{column_prev} = $self->{column};
690     $self->{column}++;
691     $self->{nc}
692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
693     } else {
694     $self->{set_nc}->($self);
695     }
696    
697     redo A;
698     } elsif ($self->{nc} == 0x003E) { # >
699    
700     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
701     line => $self->{line_prev},
702     column => $self->{column_prev});
703     $self->{state} = DATA_STATE;
704 wakaba 1.5 $self->{s_kwd} = '';
705 wakaba 1.1
706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
707     $self->{line_prev} = $self->{line};
708     $self->{column_prev} = $self->{column};
709     $self->{column}++;
710     $self->{nc}
711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
712     } else {
713     $self->{set_nc}->($self);
714     }
715    
716    
717     return ({type => CHARACTER_TOKEN, data => '<>',
718     line => $self->{line_prev},
719     column => $self->{column_prev},
720     });
721    
722     redo A;
723     } elsif ($self->{nc} == 0x003F) { # ?
724 wakaba 1.8 if ($self->{is_xml}) {
725    
726     $self->{state} = PI_STATE;
727    
728     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
729     $self->{line_prev} = $self->{line};
730     $self->{column_prev} = $self->{column};
731     $self->{column}++;
732     $self->{nc}
733     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
734     } else {
735     $self->{set_nc}->($self);
736     }
737    
738     redo A;
739     } else {
740    
741     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
742     line => $self->{line_prev},
743     column => $self->{column_prev});
744     $self->{state} = BOGUS_COMMENT_STATE;
745     $self->{ct} = {type => COMMENT_TOKEN, data => '',
746     line => $self->{line_prev},
747     column => $self->{column_prev},
748     };
749     ## $self->{nc} is intentionally left as is
750     redo A;
751     }
752 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
753 wakaba 1.1
754     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
755     line => $self->{line_prev},
756     column => $self->{column_prev});
757     $self->{state} = DATA_STATE;
758 wakaba 1.5 $self->{s_kwd} = '';
759 wakaba 1.1 ## reconsume
760    
761     return ({type => CHARACTER_TOKEN, data => '<',
762     line => $self->{line_prev},
763     column => $self->{column_prev},
764     });
765    
766     redo A;
767 wakaba 1.9 } else {
768     ## XML5: "<:" is a parse error.
769    
770     $self->{ct} = {type => START_TAG_TOKEN,
771     tag_name => chr ($self->{nc}),
772     line => $self->{line_prev},
773     column => $self->{column_prev}};
774     $self->{state} = TAG_NAME_STATE;
775    
776     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
777     $self->{line_prev} = $self->{line};
778     $self->{column_prev} = $self->{column};
779     $self->{column}++;
780     $self->{nc}
781     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
782     } else {
783     $self->{set_nc}->($self);
784     }
785    
786     redo A;
787 wakaba 1.1 }
788     } else {
789     die "$0: $self->{content_model} in tag open";
790     }
791     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
792     ## NOTE: The "close tag open state" in the spec is implemented as
793     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
794    
795 wakaba 1.10 ## XML5: "end tag state".
796    
797 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
798     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
799     if (defined $self->{last_stag_name}) {
800     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
801 wakaba 1.12 $self->{kwd} = '';
802 wakaba 1.1 ## Reconsume.
803     redo A;
804     } else {
805     ## No start tag token has ever been emitted
806     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
807    
808     $self->{state} = DATA_STATE;
809 wakaba 1.5 $self->{s_kwd} = '';
810 wakaba 1.1 ## Reconsume.
811     return ({type => CHARACTER_TOKEN, data => '</',
812     line => $l, column => $c,
813     });
814     redo A;
815     }
816     }
817    
818     if (0x0041 <= $self->{nc} and
819     $self->{nc} <= 0x005A) { # A..Z
820    
821     $self->{ct}
822     = {type => END_TAG_TOKEN,
823 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
824 wakaba 1.1 line => $l, column => $c};
825     $self->{state} = TAG_NAME_STATE;
826    
827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
828     $self->{line_prev} = $self->{line};
829     $self->{column_prev} = $self->{column};
830     $self->{column}++;
831     $self->{nc}
832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
833     } else {
834     $self->{set_nc}->($self);
835     }
836    
837     redo A;
838     } elsif (0x0061 <= $self->{nc} and
839     $self->{nc} <= 0x007A) { # a..z
840    
841     $self->{ct} = {type => END_TAG_TOKEN,
842     tag_name => chr ($self->{nc}),
843     line => $l, column => $c};
844     $self->{state} = TAG_NAME_STATE;
845    
846     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
847     $self->{line_prev} = $self->{line};
848     $self->{column_prev} = $self->{column};
849     $self->{column}++;
850     $self->{nc}
851     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
852     } else {
853     $self->{set_nc}->($self);
854     }
855    
856     redo A;
857     } elsif ($self->{nc} == 0x003E) { # >
858     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
859     line => $self->{line_prev}, ## "<" in "</>"
860     column => $self->{column_prev} - 1);
861     $self->{state} = DATA_STATE;
862 wakaba 1.5 $self->{s_kwd} = '';
863 wakaba 1.10 if ($self->{is_xml}) {
864    
865     ## XML5: No parse error.
866    
867     ## NOTE: This parser raises a parse error, since it supports
868     ## XML1, not XML5.
869    
870     ## NOTE: A short end tag token.
871     my $ct = {type => END_TAG_TOKEN,
872     tag_name => '',
873     line => $self->{line_prev},
874     column => $self->{column_prev} - 1,
875     };
876    
877     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
878     $self->{line_prev} = $self->{line};
879     $self->{column_prev} = $self->{column};
880     $self->{column}++;
881     $self->{nc}
882     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
883     } else {
884     $self->{set_nc}->($self);
885     }
886    
887     return ($ct);
888     } else {
889    
890    
891 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
892     $self->{line_prev} = $self->{line};
893     $self->{column_prev} = $self->{column};
894     $self->{column}++;
895     $self->{nc}
896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
897     } else {
898     $self->{set_nc}->($self);
899     }
900    
901 wakaba 1.10 }
902 wakaba 1.1 redo A;
903     } elsif ($self->{nc} == -1) {
904    
905     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
906 wakaba 1.5 $self->{s_kwd} = '';
907 wakaba 1.1 $self->{state} = DATA_STATE;
908     # reconsume
909    
910     return ({type => CHARACTER_TOKEN, data => '</',
911     line => $l, column => $c,
912     });
913    
914     redo A;
915 wakaba 1.10 } elsif (not $self->{is_xml} or
916     $is_space->{$self->{nc}}) {
917 wakaba 1.1
918 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
919     line => $self->{line_prev}, # "<" of "</"
920     column => $self->{column_prev} - 1);
921 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
922     $self->{ct} = {type => COMMENT_TOKEN, data => '',
923     line => $self->{line_prev}, # "<" of "</"
924     column => $self->{column_prev} - 1,
925     };
926     ## NOTE: $self->{nc} is intentionally left as is.
927     ## Although the "anything else" case of the spec does not explicitly
928     ## state that the next input character is to be reconsumed,
929     ## it will be included in the |data| of the comment token
930     ## generated from the bogus end tag, as defined in the
931     ## "bogus comment state" entry.
932     redo A;
933 wakaba 1.10 } else {
934     ## XML5: "</:" is a parse error.
935    
936     $self->{ct} = {type => END_TAG_TOKEN,
937     tag_name => chr ($self->{nc}),
938     line => $l, column => $c};
939     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
940    
941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
942     $self->{line_prev} = $self->{line};
943     $self->{column_prev} = $self->{column};
944     $self->{column}++;
945     $self->{nc}
946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
947     } else {
948     $self->{set_nc}->($self);
949     }
950    
951     redo A;
952 wakaba 1.1 }
953     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
954 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
955 wakaba 1.1 if (length $ch) {
956     my $CH = $ch;
957     $ch =~ tr/a-z/A-Z/;
958     my $nch = chr $self->{nc};
959     if ($nch eq $ch or $nch eq $CH) {
960    
961     ## Stay in the state.
962 wakaba 1.12 $self->{kwd} .= $nch;
963 wakaba 1.1
964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
965     $self->{line_prev} = $self->{line};
966     $self->{column_prev} = $self->{column};
967     $self->{column}++;
968     $self->{nc}
969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
970     } else {
971     $self->{set_nc}->($self);
972     }
973    
974     redo A;
975     } else {
976    
977     $self->{state} = DATA_STATE;
978 wakaba 1.5 $self->{s_kwd} = '';
979 wakaba 1.1 ## Reconsume.
980     return ({type => CHARACTER_TOKEN,
981 wakaba 1.12 data => '</' . $self->{kwd},
982 wakaba 1.1 line => $self->{line_prev},
983 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
984 wakaba 1.1 });
985     redo A;
986     }
987     } else { # after "<{tag-name}"
988     unless ($is_space->{$self->{nc}} or
989     {
990     0x003E => 1, # >
991     0x002F => 1, # /
992     -1 => 1, # EOF
993     }->{$self->{nc}}) {
994    
995     ## Reconsume.
996     $self->{state} = DATA_STATE;
997 wakaba 1.5 $self->{s_kwd} = '';
998 wakaba 1.1 return ({type => CHARACTER_TOKEN,
999 wakaba 1.12 data => '</' . $self->{kwd},
1000 wakaba 1.1 line => $self->{line_prev},
1001 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1002 wakaba 1.1 });
1003     redo A;
1004     } else {
1005    
1006     $self->{ct}
1007     = {type => END_TAG_TOKEN,
1008     tag_name => $self->{last_stag_name},
1009     line => $self->{line_prev},
1010 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1011 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1012     ## Reconsume.
1013     redo A;
1014     }
1015     }
1016     } elsif ($self->{state} == TAG_NAME_STATE) {
1017     if ($is_space->{$self->{nc}}) {
1018    
1019     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1020    
1021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1022     $self->{line_prev} = $self->{line};
1023     $self->{column_prev} = $self->{column};
1024     $self->{column}++;
1025     $self->{nc}
1026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1027     } else {
1028     $self->{set_nc}->($self);
1029     }
1030    
1031     redo A;
1032     } elsif ($self->{nc} == 0x003E) { # >
1033     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1034    
1035     $self->{last_stag_name} = $self->{ct}->{tag_name};
1036     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1037     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1038     #if ($self->{ct}->{attributes}) {
1039     # ## NOTE: This should never be reached.
1040     # !!! cp (36);
1041     # !!! parse-error (type => 'end tag attribute');
1042     #} else {
1043    
1044     #}
1045     } else {
1046     die "$0: $self->{ct}->{type}: Unknown token type";
1047     }
1048     $self->{state} = DATA_STATE;
1049 wakaba 1.5 $self->{s_kwd} = '';
1050 wakaba 1.1
1051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052     $self->{line_prev} = $self->{line};
1053     $self->{column_prev} = $self->{column};
1054     $self->{column}++;
1055     $self->{nc}
1056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1057     } else {
1058     $self->{set_nc}->($self);
1059     }
1060    
1061    
1062     return ($self->{ct}); # start tag or end tag
1063    
1064     redo A;
1065     } elsif (0x0041 <= $self->{nc} and
1066     $self->{nc} <= 0x005A) { # A..Z
1067    
1068 wakaba 1.4 $self->{ct}->{tag_name}
1069     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1070 wakaba 1.1 # start tag or end tag
1071     ## Stay in this state
1072    
1073     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1074     $self->{line_prev} = $self->{line};
1075     $self->{column_prev} = $self->{column};
1076     $self->{column}++;
1077     $self->{nc}
1078     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1079     } else {
1080     $self->{set_nc}->($self);
1081     }
1082    
1083     redo A;
1084     } elsif ($self->{nc} == -1) {
1085     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1086     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1087    
1088     $self->{last_stag_name} = $self->{ct}->{tag_name};
1089     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1090     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091     #if ($self->{ct}->{attributes}) {
1092     # ## NOTE: This state should never be reached.
1093     # !!! cp (40);
1094     # !!! parse-error (type => 'end tag attribute');
1095     #} else {
1096    
1097     #}
1098     } else {
1099     die "$0: $self->{ct}->{type}: Unknown token type";
1100     }
1101     $self->{state} = DATA_STATE;
1102 wakaba 1.5 $self->{s_kwd} = '';
1103 wakaba 1.1 # reconsume
1104    
1105     return ($self->{ct}); # start tag or end tag
1106    
1107     redo A;
1108     } elsif ($self->{nc} == 0x002F) { # /
1109    
1110     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1111    
1112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1113     $self->{line_prev} = $self->{line};
1114     $self->{column_prev} = $self->{column};
1115     $self->{column}++;
1116     $self->{nc}
1117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1118     } else {
1119     $self->{set_nc}->($self);
1120     }
1121    
1122     redo A;
1123     } else {
1124    
1125     $self->{ct}->{tag_name} .= chr $self->{nc};
1126     # start tag or end tag
1127     ## Stay in the state
1128    
1129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1130     $self->{line_prev} = $self->{line};
1131     $self->{column_prev} = $self->{column};
1132     $self->{column}++;
1133     $self->{nc}
1134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1135     } else {
1136     $self->{set_nc}->($self);
1137     }
1138    
1139     redo A;
1140     }
1141     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1142 wakaba 1.11 ## XML5: "Tag attribute name before state".
1143    
1144 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1145    
1146     ## Stay in the state
1147    
1148     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1149     $self->{line_prev} = $self->{line};
1150     $self->{column_prev} = $self->{column};
1151     $self->{column}++;
1152     $self->{nc}
1153     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1154     } else {
1155     $self->{set_nc}->($self);
1156     }
1157    
1158     redo A;
1159     } elsif ($self->{nc} == 0x003E) { # >
1160     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1161    
1162     $self->{last_stag_name} = $self->{ct}->{tag_name};
1163     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1164     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1165     if ($self->{ct}->{attributes}) {
1166    
1167     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1168     } else {
1169    
1170     }
1171     } else {
1172     die "$0: $self->{ct}->{type}: Unknown token type";
1173     }
1174     $self->{state} = DATA_STATE;
1175 wakaba 1.5 $self->{s_kwd} = '';
1176 wakaba 1.1
1177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1178     $self->{line_prev} = $self->{line};
1179     $self->{column_prev} = $self->{column};
1180     $self->{column}++;
1181     $self->{nc}
1182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1183     } else {
1184     $self->{set_nc}->($self);
1185     }
1186    
1187    
1188     return ($self->{ct}); # start tag or end tag
1189    
1190     redo A;
1191     } elsif (0x0041 <= $self->{nc} and
1192     $self->{nc} <= 0x005A) { # A..Z
1193    
1194     $self->{ca}
1195 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1196 wakaba 1.1 value => '',
1197     line => $self->{line}, column => $self->{column}};
1198     $self->{state} = ATTRIBUTE_NAME_STATE;
1199    
1200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1201     $self->{line_prev} = $self->{line};
1202     $self->{column_prev} = $self->{column};
1203     $self->{column}++;
1204     $self->{nc}
1205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1206     } else {
1207     $self->{set_nc}->($self);
1208     }
1209    
1210     redo A;
1211     } elsif ($self->{nc} == 0x002F) { # /
1212    
1213     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1214    
1215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1216     $self->{line_prev} = $self->{line};
1217     $self->{column_prev} = $self->{column};
1218     $self->{column}++;
1219     $self->{nc}
1220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1221     } else {
1222     $self->{set_nc}->($self);
1223     }
1224    
1225     redo A;
1226     } elsif ($self->{nc} == -1) {
1227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1228     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1229    
1230     $self->{last_stag_name} = $self->{ct}->{tag_name};
1231     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1232     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1233     if ($self->{ct}->{attributes}) {
1234    
1235     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1236     } else {
1237    
1238     }
1239     } else {
1240     die "$0: $self->{ct}->{type}: Unknown token type";
1241     }
1242     $self->{state} = DATA_STATE;
1243 wakaba 1.5 $self->{s_kwd} = '';
1244 wakaba 1.1 # reconsume
1245    
1246     return ($self->{ct}); # start tag or end tag
1247    
1248     redo A;
1249     } else {
1250     if ({
1251     0x0022 => 1, # "
1252     0x0027 => 1, # '
1253 wakaba 1.30 0x003C => 1, # <
1254 wakaba 1.1 0x003D => 1, # =
1255     }->{$self->{nc}}) {
1256    
1257 wakaba 1.11 ## XML5: Not a parse error.
1258 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1259     } else {
1260    
1261 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1262 wakaba 1.1 }
1263     $self->{ca}
1264     = {name => chr ($self->{nc}),
1265     value => '',
1266     line => $self->{line}, column => $self->{column}};
1267     $self->{state} = ATTRIBUTE_NAME_STATE;
1268    
1269     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1270     $self->{line_prev} = $self->{line};
1271     $self->{column_prev} = $self->{column};
1272     $self->{column}++;
1273     $self->{nc}
1274     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1275     } else {
1276     $self->{set_nc}->($self);
1277     }
1278    
1279     redo A;
1280     }
1281     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1282 wakaba 1.11 ## XML5: "Tag attribute name state".
1283    
1284 wakaba 1.1 my $before_leave = sub {
1285     if (exists $self->{ct}->{attributes} # start tag or end tag
1286     ->{$self->{ca}->{name}}) { # MUST
1287    
1288     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1289     ## Discard $self->{ca} # MUST
1290     } else {
1291    
1292     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1293     = $self->{ca};
1294 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1295 wakaba 1.1 }
1296     }; # $before_leave
1297    
1298     if ($is_space->{$self->{nc}}) {
1299    
1300     $before_leave->();
1301     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1302    
1303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1304     $self->{line_prev} = $self->{line};
1305     $self->{column_prev} = $self->{column};
1306     $self->{column}++;
1307     $self->{nc}
1308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1309     } else {
1310     $self->{set_nc}->($self);
1311     }
1312    
1313     redo A;
1314     } elsif ($self->{nc} == 0x003D) { # =
1315    
1316     $before_leave->();
1317     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1318    
1319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1320     $self->{line_prev} = $self->{line};
1321     $self->{column_prev} = $self->{column};
1322     $self->{column}++;
1323     $self->{nc}
1324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1325     } else {
1326     $self->{set_nc}->($self);
1327     }
1328    
1329     redo A;
1330     } elsif ($self->{nc} == 0x003E) { # >
1331 wakaba 1.11 if ($self->{is_xml}) {
1332    
1333     ## XML5: Not a parse error.
1334     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1335     } else {
1336    
1337     }
1338    
1339 wakaba 1.1 $before_leave->();
1340     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1341    
1342     $self->{last_stag_name} = $self->{ct}->{tag_name};
1343     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1344    
1345     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1346     if ($self->{ct}->{attributes}) {
1347     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1348     }
1349     } else {
1350     die "$0: $self->{ct}->{type}: Unknown token type";
1351     }
1352     $self->{state} = DATA_STATE;
1353 wakaba 1.5 $self->{s_kwd} = '';
1354 wakaba 1.1
1355     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1356     $self->{line_prev} = $self->{line};
1357     $self->{column_prev} = $self->{column};
1358     $self->{column}++;
1359     $self->{nc}
1360     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1361     } else {
1362     $self->{set_nc}->($self);
1363     }
1364    
1365    
1366     return ($self->{ct}); # start tag or end tag
1367    
1368     redo A;
1369     } elsif (0x0041 <= $self->{nc} and
1370     $self->{nc} <= 0x005A) { # A..Z
1371    
1372 wakaba 1.4 $self->{ca}->{name}
1373     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1374 wakaba 1.1 ## Stay in the state
1375    
1376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1377     $self->{line_prev} = $self->{line};
1378     $self->{column_prev} = $self->{column};
1379     $self->{column}++;
1380     $self->{nc}
1381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1382     } else {
1383     $self->{set_nc}->($self);
1384     }
1385    
1386     redo A;
1387     } elsif ($self->{nc} == 0x002F) { # /
1388 wakaba 1.11 if ($self->{is_xml}) {
1389    
1390     ## XML5: Not a parse error.
1391     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1392     } else {
1393    
1394     }
1395 wakaba 1.1
1396     $before_leave->();
1397     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1398    
1399     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1400     $self->{line_prev} = $self->{line};
1401     $self->{column_prev} = $self->{column};
1402     $self->{column}++;
1403     $self->{nc}
1404     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1405     } else {
1406     $self->{set_nc}->($self);
1407     }
1408    
1409     redo A;
1410     } elsif ($self->{nc} == -1) {
1411     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1412     $before_leave->();
1413     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1414    
1415     $self->{last_stag_name} = $self->{ct}->{tag_name};
1416     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1417     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1418     if ($self->{ct}->{attributes}) {
1419    
1420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1421     } else {
1422     ## NOTE: This state should never be reached.
1423    
1424     }
1425     } else {
1426     die "$0: $self->{ct}->{type}: Unknown token type";
1427     }
1428     $self->{state} = DATA_STATE;
1429 wakaba 1.5 $self->{s_kwd} = '';
1430 wakaba 1.1 # reconsume
1431    
1432     return ($self->{ct}); # start tag or end tag
1433    
1434     redo A;
1435     } else {
1436 wakaba 1.30 if ({
1437     0x0022 => 1, # "
1438     0x0027 => 1, # '
1439     0x003C => 1, # <
1440     }->{$self->{nc}}) {
1441 wakaba 1.1
1442 wakaba 1.11 ## XML5: Not a parse error.
1443 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1444     } else {
1445    
1446     }
1447     $self->{ca}->{name} .= chr ($self->{nc});
1448     ## Stay in the state
1449    
1450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1451     $self->{line_prev} = $self->{line};
1452     $self->{column_prev} = $self->{column};
1453     $self->{column}++;
1454     $self->{nc}
1455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1456     } else {
1457     $self->{set_nc}->($self);
1458     }
1459    
1460     redo A;
1461     }
1462     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1463 wakaba 1.11 ## XML5: "Tag attribute name after state".
1464    
1465 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1466    
1467     ## Stay in the state
1468    
1469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1470     $self->{line_prev} = $self->{line};
1471     $self->{column_prev} = $self->{column};
1472     $self->{column}++;
1473     $self->{nc}
1474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1475     } else {
1476     $self->{set_nc}->($self);
1477     }
1478    
1479     redo A;
1480     } elsif ($self->{nc} == 0x003D) { # =
1481    
1482     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1483    
1484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1485     $self->{line_prev} = $self->{line};
1486     $self->{column_prev} = $self->{column};
1487     $self->{column}++;
1488     $self->{nc}
1489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1490     } else {
1491     $self->{set_nc}->($self);
1492     }
1493    
1494     redo A;
1495     } elsif ($self->{nc} == 0x003E) { # >
1496 wakaba 1.11 if ($self->{is_xml}) {
1497    
1498     ## XML5: Not a parse error.
1499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1500     } else {
1501    
1502     }
1503    
1504 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1505    
1506     $self->{last_stag_name} = $self->{ct}->{tag_name};
1507     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1508     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1509     if ($self->{ct}->{attributes}) {
1510    
1511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1512     } else {
1513     ## NOTE: This state should never be reached.
1514    
1515     }
1516     } else {
1517     die "$0: $self->{ct}->{type}: Unknown token type";
1518     }
1519     $self->{state} = DATA_STATE;
1520 wakaba 1.5 $self->{s_kwd} = '';
1521 wakaba 1.1
1522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1523     $self->{line_prev} = $self->{line};
1524     $self->{column_prev} = $self->{column};
1525     $self->{column}++;
1526     $self->{nc}
1527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1528     } else {
1529     $self->{set_nc}->($self);
1530     }
1531    
1532    
1533     return ($self->{ct}); # start tag or end tag
1534    
1535     redo A;
1536     } elsif (0x0041 <= $self->{nc} and
1537     $self->{nc} <= 0x005A) { # A..Z
1538    
1539     $self->{ca}
1540 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1541 wakaba 1.1 value => '',
1542     line => $self->{line}, column => $self->{column}};
1543     $self->{state} = ATTRIBUTE_NAME_STATE;
1544    
1545     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1546     $self->{line_prev} = $self->{line};
1547     $self->{column_prev} = $self->{column};
1548     $self->{column}++;
1549     $self->{nc}
1550     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1551     } else {
1552     $self->{set_nc}->($self);
1553     }
1554    
1555     redo A;
1556     } elsif ($self->{nc} == 0x002F) { # /
1557 wakaba 1.11 if ($self->{is_xml}) {
1558    
1559     ## XML5: Not a parse error.
1560     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1561     } else {
1562    
1563     }
1564 wakaba 1.1
1565     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1566    
1567     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1568     $self->{line_prev} = $self->{line};
1569     $self->{column_prev} = $self->{column};
1570     $self->{column}++;
1571     $self->{nc}
1572     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1573     } else {
1574     $self->{set_nc}->($self);
1575     }
1576    
1577     redo A;
1578     } elsif ($self->{nc} == -1) {
1579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1580     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1581    
1582     $self->{last_stag_name} = $self->{ct}->{tag_name};
1583     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1584     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1585     if ($self->{ct}->{attributes}) {
1586    
1587     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1588     } else {
1589     ## NOTE: This state should never be reached.
1590    
1591     }
1592     } else {
1593     die "$0: $self->{ct}->{type}: Unknown token type";
1594     }
1595 wakaba 1.5 $self->{s_kwd} = '';
1596 wakaba 1.1 $self->{state} = DATA_STATE;
1597     # reconsume
1598    
1599     return ($self->{ct}); # start tag or end tag
1600    
1601     redo A;
1602     } else {
1603 wakaba 1.11 if ($self->{is_xml}) {
1604    
1605     ## XML5: Not a parse error.
1606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1607     } else {
1608    
1609     }
1610    
1611 wakaba 1.30 if ({
1612     0x0022 => 1, # "
1613     0x0027 => 1, # '
1614     0x003C => 1, # <
1615     }->{$self->{nc}}) {
1616 wakaba 1.1
1617 wakaba 1.11 ## XML5: Not a parse error.
1618 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1619     } else {
1620    
1621     }
1622     $self->{ca}
1623     = {name => chr ($self->{nc}),
1624     value => '',
1625     line => $self->{line}, column => $self->{column}};
1626     $self->{state} = ATTRIBUTE_NAME_STATE;
1627    
1628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629     $self->{line_prev} = $self->{line};
1630     $self->{column_prev} = $self->{column};
1631     $self->{column}++;
1632     $self->{nc}
1633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1634     } else {
1635     $self->{set_nc}->($self);
1636     }
1637    
1638     redo A;
1639     }
1640     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1641 wakaba 1.11 ## XML5: "Tag attribute value before state".
1642    
1643 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1644    
1645     ## Stay in the state
1646    
1647     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1648     $self->{line_prev} = $self->{line};
1649     $self->{column_prev} = $self->{column};
1650     $self->{column}++;
1651     $self->{nc}
1652     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1653     } else {
1654     $self->{set_nc}->($self);
1655     }
1656    
1657     redo A;
1658     } elsif ($self->{nc} == 0x0022) { # "
1659    
1660     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1661    
1662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1663     $self->{line_prev} = $self->{line};
1664     $self->{column_prev} = $self->{column};
1665     $self->{column}++;
1666     $self->{nc}
1667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1668     } else {
1669     $self->{set_nc}->($self);
1670     }
1671    
1672     redo A;
1673     } elsif ($self->{nc} == 0x0026) { # &
1674    
1675     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1676     ## reconsume
1677     redo A;
1678     } elsif ($self->{nc} == 0x0027) { # '
1679    
1680     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1681    
1682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1683     $self->{line_prev} = $self->{line};
1684     $self->{column_prev} = $self->{column};
1685     $self->{column}++;
1686     $self->{nc}
1687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1688     } else {
1689     $self->{set_nc}->($self);
1690     }
1691    
1692     redo A;
1693     } elsif ($self->{nc} == 0x003E) { # >
1694     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1695     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1696    
1697     $self->{last_stag_name} = $self->{ct}->{tag_name};
1698     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1699     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1700     if ($self->{ct}->{attributes}) {
1701    
1702     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1703     } else {
1704     ## NOTE: This state should never be reached.
1705    
1706     }
1707     } else {
1708     die "$0: $self->{ct}->{type}: Unknown token type";
1709     }
1710     $self->{state} = DATA_STATE;
1711 wakaba 1.5 $self->{s_kwd} = '';
1712 wakaba 1.1
1713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1714     $self->{line_prev} = $self->{line};
1715     $self->{column_prev} = $self->{column};
1716     $self->{column}++;
1717     $self->{nc}
1718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1719     } else {
1720     $self->{set_nc}->($self);
1721     }
1722    
1723    
1724     return ($self->{ct}); # start tag or end tag
1725    
1726     redo A;
1727     } elsif ($self->{nc} == -1) {
1728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1729     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1730    
1731     $self->{last_stag_name} = $self->{ct}->{tag_name};
1732     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1733     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1734     if ($self->{ct}->{attributes}) {
1735    
1736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1737     } else {
1738     ## NOTE: This state should never be reached.
1739    
1740     }
1741     } else {
1742     die "$0: $self->{ct}->{type}: Unknown token type";
1743     }
1744     $self->{state} = DATA_STATE;
1745 wakaba 1.5 $self->{s_kwd} = '';
1746 wakaba 1.1 ## reconsume
1747    
1748     return ($self->{ct}); # start tag or end tag
1749    
1750     redo A;
1751     } else {
1752 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1753 wakaba 1.1
1754 wakaba 1.11 ## XML5: Not a parse error.
1755 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1756 wakaba 1.11 } elsif ($self->{is_xml}) {
1757    
1758     ## XML5: No parse error.
1759     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1760 wakaba 1.1 } else {
1761    
1762     }
1763     $self->{ca}->{value} .= chr ($self->{nc});
1764     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1765    
1766     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1767     $self->{line_prev} = $self->{line};
1768     $self->{column_prev} = $self->{column};
1769     $self->{column}++;
1770     $self->{nc}
1771     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1772     } else {
1773     $self->{set_nc}->($self);
1774     }
1775    
1776     redo A;
1777     }
1778     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1779 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1780     ## ATTLIST attribute value double quoted state".
1781 wakaba 1.11
1782 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1783 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1784    
1785     ## XML5: "DOCTYPE ATTLIST name after state".
1786     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1787     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1788     } else {
1789    
1790     ## XML5: "Tag attribute name before state".
1791     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1792     }
1793 wakaba 1.1
1794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795     $self->{line_prev} = $self->{line};
1796     $self->{column_prev} = $self->{column};
1797     $self->{column}++;
1798     $self->{nc}
1799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800     } else {
1801     $self->{set_nc}->($self);
1802     }
1803    
1804     redo A;
1805     } elsif ($self->{nc} == 0x0026) { # &
1806    
1807 wakaba 1.11 ## XML5: Not defined yet.
1808    
1809 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1810     ## "entity in attribute value state". In this implementation, the
1811     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1812     ## implementation of the "consume a character reference" algorithm.
1813     $self->{prev_state} = $self->{state};
1814     $self->{entity_add} = 0x0022; # "
1815     $self->{state} = ENTITY_STATE;
1816    
1817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1818     $self->{line_prev} = $self->{line};
1819     $self->{column_prev} = $self->{column};
1820     $self->{column}++;
1821     $self->{nc}
1822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1823     } else {
1824     $self->{set_nc}->($self);
1825     }
1826    
1827     redo A;
1828 wakaba 1.25 } elsif ($self->{is_xml} and
1829     $is_space->{$self->{nc}}) {
1830    
1831     $self->{ca}->{value} .= ' ';
1832     ## Stay in the state.
1833    
1834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1835     $self->{line_prev} = $self->{line};
1836     $self->{column_prev} = $self->{column};
1837     $self->{column}++;
1838     $self->{nc}
1839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1840     } else {
1841     $self->{set_nc}->($self);
1842     }
1843    
1844     redo A;
1845 wakaba 1.1 } elsif ($self->{nc} == -1) {
1846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1847     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1848    
1849     $self->{last_stag_name} = $self->{ct}->{tag_name};
1850 wakaba 1.15
1851     $self->{state} = DATA_STATE;
1852     $self->{s_kwd} = '';
1853     ## reconsume
1854     return ($self->{ct}); # start tag
1855     redo A;
1856 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1857     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1858     if ($self->{ct}->{attributes}) {
1859    
1860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1861     } else {
1862     ## NOTE: This state should never be reached.
1863    
1864     }
1865 wakaba 1.15
1866     $self->{state} = DATA_STATE;
1867     $self->{s_kwd} = '';
1868     ## reconsume
1869     return ($self->{ct}); # end tag
1870     redo A;
1871     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1872     ## XML5: No parse error above; not defined yet.
1873     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1874     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1875     ## Reconsume.
1876     return ($self->{ct}); # ATTLIST
1877     redo A;
1878 wakaba 1.1 } else {
1879     die "$0: $self->{ct}->{type}: Unknown token type";
1880     }
1881     } else {
1882 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1883 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1884    
1885     ## XML5: Not a parse error.
1886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1887     } else {
1888    
1889     }
1890 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1891     $self->{read_until}->($self->{ca}->{value},
1892 wakaba 1.25 qq["&<\x09\x0C\x20],
1893 wakaba 1.1 length $self->{ca}->{value});
1894    
1895     ## Stay in the state
1896    
1897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1898     $self->{line_prev} = $self->{line};
1899     $self->{column_prev} = $self->{column};
1900     $self->{column}++;
1901     $self->{nc}
1902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1903     } else {
1904     $self->{set_nc}->($self);
1905     }
1906    
1907     redo A;
1908     }
1909     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1910 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1911     ## ATTLIST attribute value single quoted state".
1912 wakaba 1.11
1913 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1914 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1915    
1916     ## XML5: "DOCTYPE ATTLIST name after state".
1917     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1918     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1919     } else {
1920    
1921     ## XML5: "Before attribute name state" (sic).
1922     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1923     }
1924 wakaba 1.1
1925     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1926     $self->{line_prev} = $self->{line};
1927     $self->{column_prev} = $self->{column};
1928     $self->{column}++;
1929     $self->{nc}
1930     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1931     } else {
1932     $self->{set_nc}->($self);
1933     }
1934    
1935     redo A;
1936     } elsif ($self->{nc} == 0x0026) { # &
1937    
1938 wakaba 1.11 ## XML5: Not defined yet.
1939    
1940 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1941     ## "entity in attribute value state". In this implementation, the
1942     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1943     ## implementation of the "consume a character reference" algorithm.
1944     $self->{entity_add} = 0x0027; # '
1945     $self->{prev_state} = $self->{state};
1946     $self->{state} = ENTITY_STATE;
1947    
1948     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1949     $self->{line_prev} = $self->{line};
1950     $self->{column_prev} = $self->{column};
1951     $self->{column}++;
1952     $self->{nc}
1953     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1954     } else {
1955     $self->{set_nc}->($self);
1956     }
1957    
1958     redo A;
1959 wakaba 1.25 } elsif ($self->{is_xml} and
1960     $is_space->{$self->{nc}}) {
1961    
1962     $self->{ca}->{value} .= ' ';
1963     ## Stay in the state.
1964    
1965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1966     $self->{line_prev} = $self->{line};
1967     $self->{column_prev} = $self->{column};
1968     $self->{column}++;
1969     $self->{nc}
1970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1971     } else {
1972     $self->{set_nc}->($self);
1973     }
1974    
1975     redo A;
1976 wakaba 1.1 } elsif ($self->{nc} == -1) {
1977     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1978     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1979    
1980     $self->{last_stag_name} = $self->{ct}->{tag_name};
1981 wakaba 1.15
1982     $self->{state} = DATA_STATE;
1983     $self->{s_kwd} = '';
1984     ## reconsume
1985     return ($self->{ct}); # start tag
1986     redo A;
1987 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1988     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1989     if ($self->{ct}->{attributes}) {
1990    
1991     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1992     } else {
1993     ## NOTE: This state should never be reached.
1994    
1995     }
1996 wakaba 1.15
1997     $self->{state} = DATA_STATE;
1998     $self->{s_kwd} = '';
1999     ## reconsume
2000     return ($self->{ct}); # end tag
2001     redo A;
2002     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2003     ## XML5: No parse error above; not defined yet.
2004     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2005     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2006     ## Reconsume.
2007     return ($self->{ct}); # ATTLIST
2008     redo A;
2009 wakaba 1.1 } else {
2010     die "$0: $self->{ct}->{type}: Unknown token type";
2011     }
2012     } else {
2013 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2014 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2015    
2016     ## XML5: Not a parse error.
2017     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2018     } else {
2019    
2020     }
2021 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2022     $self->{read_until}->($self->{ca}->{value},
2023 wakaba 1.25 qq['&<\x09\x0C\x20],
2024 wakaba 1.1 length $self->{ca}->{value});
2025    
2026     ## Stay in the state
2027    
2028     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2029     $self->{line_prev} = $self->{line};
2030     $self->{column_prev} = $self->{column};
2031     $self->{column}++;
2032     $self->{nc}
2033     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2034     } else {
2035     $self->{set_nc}->($self);
2036     }
2037    
2038     redo A;
2039     }
2040     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2041 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2042    
2043 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2044 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2045    
2046     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2047     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2048     } else {
2049    
2050     ## XML5: "Tag attribute name before state".
2051     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2052     }
2053 wakaba 1.1
2054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055     $self->{line_prev} = $self->{line};
2056     $self->{column_prev} = $self->{column};
2057     $self->{column}++;
2058     $self->{nc}
2059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060     } else {
2061     $self->{set_nc}->($self);
2062     }
2063    
2064     redo A;
2065     } elsif ($self->{nc} == 0x0026) { # &
2066    
2067 wakaba 1.11
2068     ## XML5: Not defined yet.
2069    
2070 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2071     ## "entity in attribute value state". In this implementation, the
2072     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2073     ## implementation of the "consume a character reference" algorithm.
2074     $self->{entity_add} = -1;
2075     $self->{prev_state} = $self->{state};
2076     $self->{state} = ENTITY_STATE;
2077    
2078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2079     $self->{line_prev} = $self->{line};
2080     $self->{column_prev} = $self->{column};
2081     $self->{column}++;
2082     $self->{nc}
2083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2084     } else {
2085     $self->{set_nc}->($self);
2086     }
2087    
2088     redo A;
2089     } elsif ($self->{nc} == 0x003E) { # >
2090     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2091    
2092     $self->{last_stag_name} = $self->{ct}->{tag_name};
2093 wakaba 1.15
2094     $self->{state} = DATA_STATE;
2095     $self->{s_kwd} = '';
2096    
2097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2098     $self->{line_prev} = $self->{line};
2099     $self->{column_prev} = $self->{column};
2100     $self->{column}++;
2101     $self->{nc}
2102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2103     } else {
2104     $self->{set_nc}->($self);
2105     }
2106    
2107     return ($self->{ct}); # start tag
2108     redo A;
2109 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2110     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2111     if ($self->{ct}->{attributes}) {
2112    
2113     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2114     } else {
2115     ## NOTE: This state should never be reached.
2116    
2117     }
2118 wakaba 1.15
2119     $self->{state} = DATA_STATE;
2120     $self->{s_kwd} = '';
2121    
2122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2123     $self->{line_prev} = $self->{line};
2124     $self->{column_prev} = $self->{column};
2125     $self->{column}++;
2126     $self->{nc}
2127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2128     } else {
2129     $self->{set_nc}->($self);
2130     }
2131    
2132     return ($self->{ct}); # end tag
2133     redo A;
2134     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2135     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2137    
2138 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2139     $self->{line_prev} = $self->{line};
2140     $self->{column_prev} = $self->{column};
2141     $self->{column}++;
2142     $self->{nc}
2143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2144     } else {
2145     $self->{set_nc}->($self);
2146     }
2147    
2148 wakaba 1.15 return ($self->{ct}); # ATTLIST
2149     redo A;
2150     } else {
2151     die "$0: $self->{ct}->{type}: Unknown token type";
2152     }
2153 wakaba 1.1 } elsif ($self->{nc} == -1) {
2154     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2155    
2156 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2158 wakaba 1.15
2159     $self->{state} = DATA_STATE;
2160     $self->{s_kwd} = '';
2161     ## reconsume
2162     return ($self->{ct}); # start tag
2163     redo A;
2164 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2165 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2166 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2167     if ($self->{ct}->{attributes}) {
2168    
2169     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2170     } else {
2171     ## NOTE: This state should never be reached.
2172    
2173     }
2174 wakaba 1.15
2175     $self->{state} = DATA_STATE;
2176     $self->{s_kwd} = '';
2177     ## reconsume
2178     return ($self->{ct}); # end tag
2179     redo A;
2180     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2181     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2182     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2183     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2184     ## Reconsume.
2185     return ($self->{ct}); # ATTLIST
2186     redo A;
2187 wakaba 1.1 } else {
2188     die "$0: $self->{ct}->{type}: Unknown token type";
2189     }
2190     } else {
2191     if ({
2192     0x0022 => 1, # "
2193     0x0027 => 1, # '
2194     0x003D => 1, # =
2195 wakaba 1.26 0x003C => 1, # <
2196 wakaba 1.1 }->{$self->{nc}}) {
2197     ## The character is still appended to the value below; only an error is reported.
2198 wakaba 1.11 ## XML5: Not a parse error.
2199 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2200     } else {
2201    
2202     }
2203     $self->{ca}->{value} .= chr ($self->{nc});
2204     $self->{read_until}->($self->{ca}->{value},
2205 wakaba 1.25 qq["'=&< \x09\x0C>],
2206 wakaba 1.1 length $self->{ca}->{value});
2207     ## NOTE: The list above names every character reported by the check (", ', =, <) so each is handled one by one; presumably |read_until| stops before a listed character -- TODO confirm.
2208     ## Stay in the state
2209    
2210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2211     $self->{line_prev} = $self->{line};
2212     $self->{column_prev} = $self->{column};
2213     $self->{column}++;
2214     $self->{nc}
2215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2216     } else {
2217     $self->{set_nc}->($self);
2218     }
2219    
2220     redo A;
2221     }
2222     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2223     if ($is_space->{$self->{nc}}) {
2224    
2225     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2226    
2227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2228     $self->{line_prev} = $self->{line};
2229     $self->{column_prev} = $self->{column};
2230     $self->{column}++;
2231     $self->{nc}
2232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2233     } else {
2234     $self->{set_nc}->($self);
2235     }
2236    
2237     redo A;
2238     } elsif ($self->{nc} == 0x003E) { # >
2239     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2240    
2241     $self->{last_stag_name} = $self->{ct}->{tag_name};
2242     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2243     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2244     if ($self->{ct}->{attributes}) {
2245    
2246     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2247     } else {
2248     ## NOTE: This state should never be reached.
2249    
2250     }
2251     } else {
2252     die "$0: $self->{ct}->{type}: Unknown token type";
2253     }
2254     $self->{state} = DATA_STATE;
2255 wakaba 1.5 $self->{s_kwd} = '';
2256 wakaba 1.1
2257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258     $self->{line_prev} = $self->{line};
2259     $self->{column_prev} = $self->{column};
2260     $self->{column}++;
2261     $self->{nc}
2262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263     } else {
2264     $self->{set_nc}->($self);
2265     }
2266    
2267    
2268     return ($self->{ct}); # start tag or end tag
2269    
2270     redo A;
2271     } elsif ($self->{nc} == 0x002F) { # /
2272    
2273     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2274    
2275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2276     $self->{line_prev} = $self->{line};
2277     $self->{column_prev} = $self->{column};
2278     $self->{column}++;
2279     $self->{nc}
2280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2281     } else {
2282     $self->{set_nc}->($self);
2283     }
2284    
2285     redo A;
2286     } elsif ($self->{nc} == -1) {
2287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2288     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2289    
2290     $self->{last_stag_name} = $self->{ct}->{tag_name};
2291     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2292     if ($self->{ct}->{attributes}) {
2293    
2294     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2295     } else {
2296     ## NOTE: This state should never be reached.
2297    
2298     }
2299     } else {
2300     die "$0: $self->{ct}->{type}: Unknown token type";
2301     }
2302     $self->{state} = DATA_STATE;
2303 wakaba 1.5 $self->{s_kwd} = '';
2304 wakaba 1.1 ## Reconsume.
2305     return ($self->{ct}); # start tag or end tag
2306     redo A;
2307     } else {
2308    
2309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2310     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2311     ## reconsume
2312     redo A;
2313     }
2314     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2315 wakaba 1.11 ## XML5: "Empty tag state".
2316    
2317 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2318     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2319    
2320     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2321     ## TODO: Different type than slash in start tag
2322     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2323     if ($self->{ct}->{attributes}) {
2324    
2325     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2326     } else {
2327    
2328     }
2329     ## TODO: Test |<title></title/>|
2330     } else {
2331    
2332     $self->{self_closing} = 1;
2333     }
2334    
2335     $self->{state} = DATA_STATE;
2336 wakaba 1.5 $self->{s_kwd} = '';
2337 wakaba 1.1
2338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339     $self->{line_prev} = $self->{line};
2340     $self->{column_prev} = $self->{column};
2341     $self->{column}++;
2342     $self->{nc}
2343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344     } else {
2345     $self->{set_nc}->($self);
2346     }
2347    
2348    
2349     return ($self->{ct}); # start tag or end tag
2350    
2351     redo A;
2352     } elsif ($self->{nc} == -1) {
2353     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2354     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2355    
2356     $self->{last_stag_name} = $self->{ct}->{tag_name};
2357     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2358     if ($self->{ct}->{attributes}) {
2359    
2360     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2361     } else {
2362     ## NOTE: This state should never be reached.
2363    
2364     }
2365     } else {
2366     die "$0: $self->{ct}->{type}: Unknown token type";
2367     }
2368 wakaba 1.11 ## XML5: "Tag attribute name before state".
2369 wakaba 1.1 $self->{state} = DATA_STATE;
2370 wakaba 1.5 $self->{s_kwd} = '';
2371 wakaba 1.1 ## Reconsume.
2372     return ($self->{ct}); # start tag or end tag
2373     redo A;
2374     } else {
2375    
2376     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2377     ## TODO: This error type is wrong.
2378     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2379     ## Reconsume.
2380     redo A;
2381     }
2382     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2383 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2384    
2385 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2386     ## consumes characters one-by-one basis.
2387    
2388     if ($self->{nc} == 0x003E) { # >
2389 wakaba 1.13 if ($self->{in_subset}) {
2390    
2391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2392     } else {
2393    
2394     $self->{state} = DATA_STATE;
2395     $self->{s_kwd} = '';
2396     }
2397 wakaba 1.1
2398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2399     $self->{line_prev} = $self->{line};
2400     $self->{column_prev} = $self->{column};
2401     $self->{column}++;
2402     $self->{nc}
2403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2404     } else {
2405     $self->{set_nc}->($self);
2406     }
2407    
2408    
2409     return ($self->{ct}); # comment
2410     redo A;
2411     } elsif ($self->{nc} == -1) {
2412 wakaba 1.13 if ($self->{in_subset}) {
2413    
2414     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2415     } else {
2416    
2417     $self->{state} = DATA_STATE;
2418     $self->{s_kwd} = '';
2419     }
2420 wakaba 1.1 ## reconsume
2421    
2422     return ($self->{ct}); # comment
2423     redo A;
2424     } else {
2425    
2426     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2427     $self->{read_until}->($self->{ct}->{data},
2428     q[>],
2429     length $self->{ct}->{data});
2430    
2431     ## Stay in the state.
2432    
2433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434     $self->{line_prev} = $self->{line};
2435     $self->{column_prev} = $self->{column};
2436     $self->{column}++;
2437     $self->{nc}
2438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2439     } else {
2440     $self->{set_nc}->($self);
2441     }
2442    
2443     redo A;
2444     }
2445     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2446 wakaba 1.14 ## XML5: "Markup declaration state".
2447 wakaba 1.1
2448     if ($self->{nc} == 0x002D) { # -
2449    
2450     $self->{state} = MD_HYPHEN_STATE;
2451    
2452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2453     $self->{line_prev} = $self->{line};
2454     $self->{column_prev} = $self->{column};
2455     $self->{column}++;
2456     $self->{nc}
2457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2458     } else {
2459     $self->{set_nc}->($self);
2460     }
2461    
2462     redo A;
2463     } elsif ($self->{nc} == 0x0044 or # D
2464     $self->{nc} == 0x0064) { # d
2465     ## ASCII case-insensitive.
2466    
2467     $self->{state} = MD_DOCTYPE_STATE;
2468 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2469 wakaba 1.1
2470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2471     $self->{line_prev} = $self->{line};
2472     $self->{column_prev} = $self->{column};
2473     $self->{column}++;
2474     $self->{nc}
2475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2476     } else {
2477     $self->{set_nc}->($self);
2478     }
2479    
2480     redo A;
2481 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2482     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2483     $self->{is_xml}) and
2484 wakaba 1.1 $self->{nc} == 0x005B) { # [
2485    
2486     $self->{state} = MD_CDATA_STATE;
2487 wakaba 1.12 $self->{kwd} = '[';
2488 wakaba 1.1
2489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2490     $self->{line_prev} = $self->{line};
2491     $self->{column_prev} = $self->{column};
2492     $self->{column}++;
2493     $self->{nc}
2494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2495     } else {
2496     $self->{set_nc}->($self);
2497     }
2498    
2499     redo A;
2500     } else {
2501    
2502     }
2503    
2504     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2505     line => $self->{line_prev},
2506     column => $self->{column_prev} - 1);
2507     ## Reconsume.
2508     $self->{state} = BOGUS_COMMENT_STATE;
2509     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2510     line => $self->{line_prev},
2511     column => $self->{column_prev} - 1,
2512     };
2513     redo A;
2514     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2515     if ($self->{nc} == 0x002D) { # -
2516    
2517     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2518     line => $self->{line_prev},
2519     column => $self->{column_prev} - 2,
2520     };
2521 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2522 wakaba 1.1
2523     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2524     $self->{line_prev} = $self->{line};
2525     $self->{column_prev} = $self->{column};
2526     $self->{column}++;
2527     $self->{nc}
2528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2529     } else {
2530     $self->{set_nc}->($self);
2531     }
2532    
2533     redo A;
2534     } else {
2535    
2536     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2537     line => $self->{line_prev},
2538     column => $self->{column_prev} - 2);
2539     $self->{state} = BOGUS_COMMENT_STATE;
2540     ## Reconsume.
2541     $self->{ct} = {type => COMMENT_TOKEN,
2542     data => '-',
2543     line => $self->{line_prev},
2544     column => $self->{column_prev} - 2,
2545     };
2546     redo A;
2547     }
2548     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2549     ## ASCII case-insensitive.
2550     if ($self->{nc} == [
2551     undef,
2552     0x004F, # O
2553     0x0043, # C
2554     0x0054, # T
2555     0x0059, # Y
2556     0x0050, # P
2557 wakaba 1.12 ]->[length $self->{kwd}] or
2558 wakaba 1.1 $self->{nc} == [
2559     undef,
2560     0x006F, # o
2561     0x0063, # c
2562     0x0074, # t
2563     0x0079, # y
2564     0x0070, # p
2565 wakaba 1.12 ]->[length $self->{kwd}]) {
2566 wakaba 1.1
2567     ## Stay in the state.
2568 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2569 wakaba 1.1
2570     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2571     $self->{line_prev} = $self->{line};
2572     $self->{column_prev} = $self->{column};
2573     $self->{column}++;
2574     $self->{nc}
2575     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2576     } else {
2577     $self->{set_nc}->($self);
2578     }
2579    
2580     redo A;
2581 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2582 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2583     $self->{nc} == 0x0065)) { # e
2584 wakaba 1.12 if ($self->{is_xml} and
2585     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2586 wakaba 1.10
2587     ## XML5: case-sensitive.
2588     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2589     text => 'DOCTYPE',
2590     line => $self->{line_prev},
2591     column => $self->{column_prev} - 5);
2592     } else {
2593    
2594     }
2595 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2596     $self->{ct} = {type => DOCTYPE_TOKEN,
2597     quirks => 1,
2598     line => $self->{line_prev},
2599     column => $self->{column_prev} - 7,
2600     };
2601    
2602     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2603     $self->{line_prev} = $self->{line};
2604     $self->{column_prev} = $self->{column};
2605     $self->{column}++;
2606     $self->{nc}
2607     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2608     } else {
2609     $self->{set_nc}->($self);
2610     }
2611    
2612     redo A;
2613     } else {
2614    
2615     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2616     line => $self->{line_prev},
2617 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2618 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2619     ## Reconsume.
2620     $self->{ct} = {type => COMMENT_TOKEN,
2621 wakaba 1.12 data => $self->{kwd},
2622 wakaba 1.1 line => $self->{line_prev},
2623 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2624 wakaba 1.1 };
2625     redo A;
2626     }
2627     } elsif ($self->{state} == MD_CDATA_STATE) {
2628     if ($self->{nc} == {
2629     '[' => 0x0043, # C
2630     '[C' => 0x0044, # D
2631     '[CD' => 0x0041, # A
2632     '[CDA' => 0x0054, # T
2633     '[CDAT' => 0x0041, # A
2634 wakaba 1.12 }->{$self->{kwd}}) {
2635 wakaba 1.1
2636     ## Stay in the state.
2637 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2638 wakaba 1.1
2639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2640     $self->{line_prev} = $self->{line};
2641     $self->{column_prev} = $self->{column};
2642     $self->{column}++;
2643     $self->{nc}
2644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2645     } else {
2646     $self->{set_nc}->($self);
2647     }
2648    
2649     redo A;
2650 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2651 wakaba 1.1 $self->{nc} == 0x005B) { # [
2652 wakaba 1.6 if ($self->{is_xml} and
2653     not $self->{tainted} and
2654     @{$self->{open_elements} or []} == 0) {
2655 wakaba 1.8
2656 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2657     line => $self->{line_prev},
2658     column => $self->{column_prev} - 7);
2659     $self->{tainted} = 1;
2660 wakaba 1.8 } else {
2661    
2662 wakaba 1.6 }
2663    
2664 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2665     data => '',
2666     line => $self->{line_prev},
2667     column => $self->{column_prev} - 7};
2668     $self->{state} = CDATA_SECTION_STATE;
2669    
2670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2671     $self->{line_prev} = $self->{line};
2672     $self->{column_prev} = $self->{column};
2673     $self->{column}++;
2674     $self->{nc}
2675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2676     } else {
2677     $self->{set_nc}->($self);
2678     }
2679    
2680     redo A;
2681     } else {
2682    
2683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2684     line => $self->{line_prev},
2685 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2686 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2687     ## Reconsume.
2688     $self->{ct} = {type => COMMENT_TOKEN,
2689 wakaba 1.12 data => $self->{kwd},
2690 wakaba 1.1 line => $self->{line_prev},
2691 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2692 wakaba 1.1 };
2693     redo A;
2694     }
2695     } elsif ($self->{state} == COMMENT_START_STATE) {
2696     if ($self->{nc} == 0x002D) { # -
2697    
2698     $self->{state} = COMMENT_START_DASH_STATE;
2699    
2700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2701     $self->{line_prev} = $self->{line};
2702     $self->{column_prev} = $self->{column};
2703     $self->{column}++;
2704     $self->{nc}
2705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2706     } else {
2707     $self->{set_nc}->($self);
2708     }
2709    
2710     redo A;
2711     } elsif ($self->{nc} == 0x003E) { # >
2712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2713 wakaba 1.13 if ($self->{in_subset}) {
2714    
2715     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2716     } else {
2717    
2718     $self->{state} = DATA_STATE;
2719     $self->{s_kwd} = '';
2720     }
2721 wakaba 1.1
2722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2723     $self->{line_prev} = $self->{line};
2724     $self->{column_prev} = $self->{column};
2725     $self->{column}++;
2726     $self->{nc}
2727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2728     } else {
2729     $self->{set_nc}->($self);
2730     }
2731    
2732    
2733     return ($self->{ct}); # comment
2734    
2735     redo A;
2736     } elsif ($self->{nc} == -1) {
2737     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2738 wakaba 1.13 if ($self->{in_subset}) {
2739    
2740     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2741     } else {
2742    
2743     $self->{state} = DATA_STATE;
2744     $self->{s_kwd} = '';
2745     }
2746 wakaba 1.1 ## reconsume
2747    
2748     return ($self->{ct}); # comment
2749    
2750     redo A;
2751     } else {
2752    
2753     $self->{ct}->{data} # comment
2754     .= chr ($self->{nc});
2755     $self->{state} = COMMENT_STATE;
2756    
2757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2758     $self->{line_prev} = $self->{line};
2759     $self->{column_prev} = $self->{column};
2760     $self->{column}++;
2761     $self->{nc}
2762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2763     } else {
2764     $self->{set_nc}->($self);
2765     }
2766    
2767     redo A;
2768     }
2769     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2770     if ($self->{nc} == 0x002D) { # -
2771    
2772     $self->{state} = COMMENT_END_STATE;
2773    
2774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2775     $self->{line_prev} = $self->{line};
2776     $self->{column_prev} = $self->{column};
2777     $self->{column}++;
2778     $self->{nc}
2779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2780     } else {
2781     $self->{set_nc}->($self);
2782     }
2783    
2784     redo A;
2785     } elsif ($self->{nc} == 0x003E) { # >
2786     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2787 wakaba 1.13 if ($self->{in_subset}) {
2788    
2789     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2790     } else {
2791    
2792     $self->{state} = DATA_STATE;
2793     $self->{s_kwd} = '';
2794     }
2795 wakaba 1.1
2796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2797     $self->{line_prev} = $self->{line};
2798     $self->{column_prev} = $self->{column};
2799     $self->{column}++;
2800     $self->{nc}
2801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2802     } else {
2803     $self->{set_nc}->($self);
2804     }
2805    
2806    
2807     return ($self->{ct}); # comment
2808    
2809     redo A;
2810     } elsif ($self->{nc} == -1) {
2811     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2812 wakaba 1.13 if ($self->{in_subset}) {
2813    
2814     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2815     } else {
2816    
2817     $self->{state} = DATA_STATE;
2818     $self->{s_kwd} = '';
2819     }
2820 wakaba 1.1 ## reconsume
2821    
2822     return ($self->{ct}); # comment
2823    
2824     redo A;
2825     } else {
2826    
2827     $self->{ct}->{data} # comment
2828     .= '-' . chr ($self->{nc});
2829     $self->{state} = COMMENT_STATE;
2830    
2831     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832     $self->{line_prev} = $self->{line};
2833     $self->{column_prev} = $self->{column};
2834     $self->{column}++;
2835     $self->{nc}
2836     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2837     } else {
2838     $self->{set_nc}->($self);
2839     }
2840    
2841     redo A;
2842     }
2843     } elsif ($self->{state} == COMMENT_STATE) {
2844 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2845    
2846 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2847    
2848     $self->{state} = COMMENT_END_DASH_STATE;
2849    
2850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2851     $self->{line_prev} = $self->{line};
2852     $self->{column_prev} = $self->{column};
2853     $self->{column}++;
2854     $self->{nc}
2855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2856     } else {
2857     $self->{set_nc}->($self);
2858     }
2859    
2860     redo A;
2861     } elsif ($self->{nc} == -1) {
2862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2863 wakaba 1.13 if ($self->{in_subset}) {
2864    
2865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2866     } else {
2867    
2868     $self->{state} = DATA_STATE;
2869     $self->{s_kwd} = '';
2870     }
2871 wakaba 1.1 ## reconsume
2872    
2873     return ($self->{ct}); # comment
2874    
2875     redo A;
2876     } else {
2877    
2878     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2879     $self->{read_until}->($self->{ct}->{data},
2880     q[-],
2881     length $self->{ct}->{data});
2882    
2883     ## Stay in the state
2884    
2885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2886     $self->{line_prev} = $self->{line};
2887     $self->{column_prev} = $self->{column};
2888     $self->{column}++;
2889     $self->{nc}
2890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2891     } else {
2892     $self->{set_nc}->($self);
2893     }
2894    
2895     redo A;
2896     }
2897     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2898 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2899 wakaba 1.10
2900 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2901    
2902     $self->{state} = COMMENT_END_STATE;
2903    
2904     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2905     $self->{line_prev} = $self->{line};
2906     $self->{column_prev} = $self->{column};
2907     $self->{column}++;
2908     $self->{nc}
2909     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2910     } else {
2911     $self->{set_nc}->($self);
2912     }
2913    
2914     redo A;
2915     } elsif ($self->{nc} == -1) {
2916     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2917 wakaba 1.13 if ($self->{in_subset}) {
2918    
2919     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920     } else {
2921    
2922     $self->{state} = DATA_STATE;
2923     $self->{s_kwd} = '';
2924     }
2925 wakaba 1.1 ## reconsume
2926    
2927     return ($self->{ct}); # comment
2928    
2929     redo A;
2930     } else {
2931    
2932     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2933     $self->{state} = COMMENT_STATE;
2934    
2935     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2936     $self->{line_prev} = $self->{line};
2937     $self->{column_prev} = $self->{column};
2938     $self->{column}++;
2939     $self->{nc}
2940     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2941     } else {
2942     $self->{set_nc}->($self);
2943     }
2944    
2945     redo A;
2946     }
2947 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2948     $self->{state} == COMMENT_END_BANG_STATE) {
2949 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2950 wakaba 1.31 ## (No comment end bang state.)
2951 wakaba 1.14
2952 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2953 wakaba 1.13 if ($self->{in_subset}) {
2954    
2955     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956     } else {
2957    
2958     $self->{state} = DATA_STATE;
2959     $self->{s_kwd} = '';
2960     }
2961 wakaba 1.1
2962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2963     $self->{line_prev} = $self->{line};
2964     $self->{column_prev} = $self->{column};
2965     $self->{column}++;
2966     $self->{nc}
2967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2968     } else {
2969     $self->{set_nc}->($self);
2970     }
2971    
2972    
2973     return ($self->{ct}); # comment
2974    
2975     redo A;
2976     } elsif ($self->{nc} == 0x002D) { # -
2977 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2978    
2979     $self->{ct}->{data} .= '--!'; # comment
2980     $self->{state} = COMMENT_END_DASH_STATE;
2981     } else {
2982    
2983     ## XML5: Not a parse error.
2984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2985     line => $self->{line_prev},
2986     column => $self->{column_prev});
2987     $self->{ct}->{data} .= '-'; # comment
2988     ## Stay in the state
2989     }
2990 wakaba 1.1
2991 wakaba 1.31 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992     $self->{line_prev} = $self->{line};
2993     $self->{column_prev} = $self->{column};
2994     $self->{column}++;
2995     $self->{nc}
2996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997     } else {
2998     $self->{set_nc}->($self);
2999     }
3000    
3001     redo A;
3002 wakaba 1.32 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3003     $is_space->{$self->{nc}}) {
3004    
3005     $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3006     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3007     $self->{state} = COMMENT_END_SPACE_STATE;
3008    
3009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010     $self->{line_prev} = $self->{line};
3011     $self->{column_prev} = $self->{column};
3012     $self->{column}++;
3013     $self->{nc}
3014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3015     } else {
3016     $self->{set_nc}->($self);
3017     }
3018    
3019     redo A;
3020     } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3021     $self->{nc} == 0x0021) { # !
3022    
3023 wakaba 1.31 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3024     $self->{state} = COMMENT_END_BANG_STATE;
3025 wakaba 1.1
3026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027     $self->{line_prev} = $self->{line};
3028     $self->{column_prev} = $self->{column};
3029     $self->{column}++;
3030     $self->{nc}
3031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032     } else {
3033     $self->{set_nc}->($self);
3034     }
3035    
3036     redo A;
3037     } elsif ($self->{nc} == -1) {
3038     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3039 wakaba 1.13 if ($self->{in_subset}) {
3040    
3041     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3042     } else {
3043    
3044     $self->{state} = DATA_STATE;
3045     $self->{s_kwd} = '';
3046     }
3047 wakaba 1.31 ## Reconsume.
3048 wakaba 1.1
3049     return ($self->{ct}); # comment
3050    
3051     redo A;
3052     } else {
3053    
3054 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
3055     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3056     } else {
3057     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3058     }
3059 wakaba 1.1 $self->{state} = COMMENT_STATE;
3060    
3061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062     $self->{line_prev} = $self->{line};
3063     $self->{column_prev} = $self->{column};
3064     $self->{column}++;
3065     $self->{nc}
3066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067     } else {
3068     $self->{set_nc}->($self);
3069     }
3070    
3071     redo A;
3072     }
3073 wakaba 1.32 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3074     ## XML5: This state does not exist in XML5.
3075    
3076     if ($self->{nc} == 0x003E) { # >
3077     if ($self->{in_subset}) {
3078    
3079     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3080     } else {
3081    
3082     $self->{state} = DATA_STATE;
3083     $self->{s_kwd} = '';
3084     }
3085    
3086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3087     $self->{line_prev} = $self->{line};
3088     $self->{column_prev} = $self->{column};
3089     $self->{column}++;
3090     $self->{nc}
3091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3092     } else {
3093     $self->{set_nc}->($self);
3094     }
3095    
3096    
3097     return ($self->{ct}); # comment
3098    
3099     redo A;
3100     } elsif ($is_space->{$self->{nc}}) {
3101    
3102     $self->{ct}->{data} .= chr ($self->{nc}); # comment
3103     ## Stay in the state.
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == -1) {
3117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3118     if ($self->{in_subset}) {
3119    
3120     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3121     } else {
3122    
3123     $self->{state} = DATA_STATE;
3124     $self->{s_kwd} = '';
3125     }
3126     ## Reconsume.
3127    
3128     return ($self->{ct}); # comment
3129    
3130     redo A;
3131     } else {
3132    
3133     $self->{ct}->{data} .= chr ($self->{nc}); # comment
3134     $self->{state} = COMMENT_STATE;
3135    
3136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137     $self->{line_prev} = $self->{line};
3138     $self->{column_prev} = $self->{column};
3139     $self->{column}++;
3140     $self->{nc}
3141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142     } else {
3143     $self->{set_nc}->($self);
3144     }
3145    
3146     redo A;
3147     }
3148 wakaba 1.1 } elsif ($self->{state} == DOCTYPE_STATE) {
3149     if ($is_space->{$self->{nc}}) {
3150    
3151     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3152    
3153     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3154     $self->{line_prev} = $self->{line};
3155     $self->{column_prev} = $self->{column};
3156     $self->{column}++;
3157     $self->{nc}
3158     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3159     } else {
3160     $self->{set_nc}->($self);
3161     }
3162    
3163     redo A;
3164 wakaba 1.28 } elsif ($self->{nc} == -1) {
3165    
3166     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3167     $self->{ct}->{quirks} = 1;
3168    
3169     $self->{state} = DATA_STATE;
3170     ## Reconsume.
3171     return ($self->{ct}); # DOCTYPE (quirks)
3172    
3173     redo A;
3174 wakaba 1.1 } else {
3175    
3176 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3177 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3178     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3179     ## reconsume
3180     redo A;
3181     }
3182     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3183 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3184    
3185 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3186    
3187     ## Stay in the state
3188    
3189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3190     $self->{line_prev} = $self->{line};
3191     $self->{column_prev} = $self->{column};
3192     $self->{column}++;
3193     $self->{nc}
3194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3195     } else {
3196     $self->{set_nc}->($self);
3197     }
3198    
3199     redo A;
3200     } elsif ($self->{nc} == 0x003E) { # >
3201    
3202 wakaba 1.12 ## XML5: No parse error.
3203 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3204     $self->{state} = DATA_STATE;
3205 wakaba 1.5 $self->{s_kwd} = '';
3206 wakaba 1.1
3207     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3208     $self->{line_prev} = $self->{line};
3209     $self->{column_prev} = $self->{column};
3210     $self->{column}++;
3211     $self->{nc}
3212     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3213     } else {
3214     $self->{set_nc}->($self);
3215     }
3216    
3217    
3218     return ($self->{ct}); # DOCTYPE (quirks)
3219    
3220     redo A;
3221 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3222    
3223     $self->{ct}->{name} # DOCTYPE
3224     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3225     delete $self->{ct}->{quirks};
3226     $self->{state} = DOCTYPE_NAME_STATE;
3227    
3228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3229     $self->{line_prev} = $self->{line};
3230     $self->{column_prev} = $self->{column};
3231     $self->{column}++;
3232     $self->{nc}
3233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3234     } else {
3235     $self->{set_nc}->($self);
3236     }
3237    
3238     redo A;
3239 wakaba 1.1 } elsif ($self->{nc} == -1) {
3240    
3241     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3242     $self->{state} = DATA_STATE;
3243 wakaba 1.5 $self->{s_kwd} = '';
3244 wakaba 1.1 ## reconsume
3245    
3246     return ($self->{ct}); # DOCTYPE (quirks)
3247    
3248     redo A;
3249 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3250    
3251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3253 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3254     $self->{in_subset} = 1;
3255 wakaba 1.12
3256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3257     $self->{line_prev} = $self->{line};
3258     $self->{column_prev} = $self->{column};
3259     $self->{column}++;
3260     $self->{nc}
3261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3262     } else {
3263     $self->{set_nc}->($self);
3264     }
3265    
3266 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3267 wakaba 1.12 redo A;
3268 wakaba 1.1 } else {
3269    
3270     $self->{ct}->{name} = chr $self->{nc};
3271     delete $self->{ct}->{quirks};
3272     $self->{state} = DOCTYPE_NAME_STATE;
3273    
3274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275     $self->{line_prev} = $self->{line};
3276     $self->{column_prev} = $self->{column};
3277     $self->{column}++;
3278     $self->{nc}
3279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280     } else {
3281     $self->{set_nc}->($self);
3282     }
3283    
3284     redo A;
3285     }
3286     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3287 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3288    
3289     ## ISSUE: Redundant "First," in the spec.
3290    
3291 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3292    
3293     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3294    
3295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3296     $self->{line_prev} = $self->{line};
3297     $self->{column_prev} = $self->{column};
3298     $self->{column}++;
3299     $self->{nc}
3300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3301     } else {
3302     $self->{set_nc}->($self);
3303     }
3304    
3305     redo A;
3306     } elsif ($self->{nc} == 0x003E) { # >
3307    
3308     $self->{state} = DATA_STATE;
3309 wakaba 1.5 $self->{s_kwd} = '';
3310 wakaba 1.1
3311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3312     $self->{line_prev} = $self->{line};
3313     $self->{column_prev} = $self->{column};
3314     $self->{column}++;
3315     $self->{nc}
3316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3317     } else {
3318     $self->{set_nc}->($self);
3319     }
3320    
3321    
3322     return ($self->{ct}); # DOCTYPE
3323    
3324     redo A;
3325 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3326    
3327     $self->{ct}->{name} # DOCTYPE
3328     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3329     delete $self->{ct}->{quirks};
3330     ## Stay in the state.
3331    
3332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3333     $self->{line_prev} = $self->{line};
3334     $self->{column_prev} = $self->{column};
3335     $self->{column}++;
3336     $self->{nc}
3337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3338     } else {
3339     $self->{set_nc}->($self);
3340     }
3341    
3342     redo A;
3343 wakaba 1.1 } elsif ($self->{nc} == -1) {
3344    
3345     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3346     $self->{state} = DATA_STATE;
3347 wakaba 1.5 $self->{s_kwd} = '';
3348 wakaba 1.1 ## reconsume
3349    
3350     $self->{ct}->{quirks} = 1;
3351     return ($self->{ct}); # DOCTYPE
3352    
3353     redo A;
3354 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3355    
3356     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3357 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3358     $self->{in_subset} = 1;
3359 wakaba 1.12
3360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3361     $self->{line_prev} = $self->{line};
3362     $self->{column_prev} = $self->{column};
3363     $self->{column}++;
3364     $self->{nc}
3365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3366     } else {
3367     $self->{set_nc}->($self);
3368     }
3369    
3370 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3371 wakaba 1.12 redo A;
3372 wakaba 1.1 } else {
3373    
3374 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3375     ## Stay in the state.
3376 wakaba 1.1
3377     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3378     $self->{line_prev} = $self->{line};
3379     $self->{column_prev} = $self->{column};
3380     $self->{column}++;
3381     $self->{nc}
3382     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3383     } else {
3384     $self->{set_nc}->($self);
3385     }
3386    
3387     redo A;
3388     }
3389     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3390 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3391     ## state", but implemented differently.
3392    
3393 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3394    
3395     ## Stay in the state
3396    
3397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3398     $self->{line_prev} = $self->{line};
3399     $self->{column_prev} = $self->{column};
3400     $self->{column}++;
3401     $self->{nc}
3402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3403     } else {
3404     $self->{set_nc}->($self);
3405     }
3406    
3407     redo A;
3408     } elsif ($self->{nc} == 0x003E) { # >
3409 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3410    
3411     $self->{state} = DATA_STATE;
3412     $self->{s_kwd} = '';
3413     } else {
3414    
3415     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3416     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3417     }
3418 wakaba 1.1
3419    
3420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3421     $self->{line_prev} = $self->{line};
3422     $self->{column_prev} = $self->{column};
3423     $self->{column}++;
3424     $self->{nc}
3425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3426     } else {
3427     $self->{set_nc}->($self);
3428     }
3429    
3430 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3431 wakaba 1.1 redo A;
3432     } elsif ($self->{nc} == -1) {
3433 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3434    
3435     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3436     $self->{state} = DATA_STATE;
3437     $self->{s_kwd} = '';
3438     $self->{ct}->{quirks} = 1;
3439     } else {
3440    
3441     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3442     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3443     }
3444 wakaba 1.1
3445 wakaba 1.16 ## Reconsume.
3446     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3447 wakaba 1.1 redo A;
3448     } elsif ($self->{nc} == 0x0050 or # P
3449     $self->{nc} == 0x0070) { # p
3450 wakaba 1.12
3451 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3452 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3453 wakaba 1.1
3454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3455     $self->{line_prev} = $self->{line};
3456     $self->{column_prev} = $self->{column};
3457     $self->{column}++;
3458     $self->{nc}
3459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3460     } else {
3461     $self->{set_nc}->($self);
3462     }
3463    
3464     redo A;
3465     } elsif ($self->{nc} == 0x0053 or # S
3466     $self->{nc} == 0x0073) { # s
3467 wakaba 1.12
3468 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3469 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3470    
3471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3472     $self->{line_prev} = $self->{line};
3473     $self->{column_prev} = $self->{column};
3474     $self->{column}++;
3475     $self->{nc}
3476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3477     } else {
3478     $self->{set_nc}->($self);
3479     }
3480    
3481     redo A;
3482 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3483     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3484     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3485    
3486     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3487     $self->{ct}->{value} = ''; # ENTITY
3488    
3489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490     $self->{line_prev} = $self->{line};
3491     $self->{column_prev} = $self->{column};
3492     $self->{column}++;
3493     $self->{nc}
3494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495     } else {
3496     $self->{set_nc}->($self);
3497     }
3498    
3499     redo A;
3500     } elsif ($self->{nc} == 0x0027 and # '
3501     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3502     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3503    
3504     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3505     $self->{ct}->{value} = ''; # ENTITY
3506    
3507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3508     $self->{line_prev} = $self->{line};
3509     $self->{column_prev} = $self->{column};
3510     $self->{column}++;
3511     $self->{nc}
3512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3513     } else {
3514     $self->{set_nc}->($self);
3515     }
3516    
3517     redo A;
3518 wakaba 1.16 } elsif ($self->{is_xml} and
3519     $self->{ct}->{type} == DOCTYPE_TOKEN and
3520     $self->{nc} == 0x005B) { # [
3521 wakaba 1.12
3522     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3523     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3524 wakaba 1.13 $self->{in_subset} = 1;
3525 wakaba 1.1
3526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3527     $self->{line_prev} = $self->{line};
3528     $self->{column_prev} = $self->{column};
3529     $self->{column}++;
3530     $self->{nc}
3531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3532     } else {
3533     $self->{set_nc}->($self);
3534     }
3535    
3536 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3537 wakaba 1.1 redo A;
3538     } else {
3539 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3540    
3541     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3542    
3543     $self->{ct}->{quirks} = 1;
3544     $self->{state} = BOGUS_DOCTYPE_STATE;
3545     } else {
3546    
3547     $self->{state} = BOGUS_MD_STATE;
3548     }
3549 wakaba 1.1
3550    
3551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3552     $self->{line_prev} = $self->{line};
3553     $self->{column_prev} = $self->{column};
3554     $self->{column}++;
3555     $self->{nc}
3556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3557     } else {
3558     $self->{set_nc}->($self);
3559     }
3560    
3561     redo A;
3562     }
3563     } elsif ($self->{state} == PUBLIC_STATE) {
3564     ## ASCII case-insensitive
3565     if ($self->{nc} == [
3566     undef,
3567     0x0055, # U
3568     0x0042, # B
3569     0x004C, # L
3570     0x0049, # I
3571 wakaba 1.12 ]->[length $self->{kwd}] or
3572 wakaba 1.1 $self->{nc} == [
3573     undef,
3574     0x0075, # u
3575     0x0062, # b
3576     0x006C, # l
3577     0x0069, # i
3578 wakaba 1.12 ]->[length $self->{kwd}]) {
3579 wakaba 1.1
3580     ## Stay in the state.
3581 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3582 wakaba 1.1
3583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3584     $self->{line_prev} = $self->{line};
3585     $self->{column_prev} = $self->{column};
3586     $self->{column}++;
3587     $self->{nc}
3588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3589     } else {
3590     $self->{set_nc}->($self);
3591     }
3592    
3593     redo A;
3594 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3595 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3596     $self->{nc} == 0x0063)) { # c
3597 wakaba 1.12 if ($self->{is_xml} and
3598     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3599    
3600     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3601     text => 'PUBLIC',
3602     line => $self->{line_prev},
3603     column => $self->{column_prev} - 4);
3604     } else {
3605    
3606     }
3607 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3608    
3609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3610     $self->{line_prev} = $self->{line};
3611     $self->{column_prev} = $self->{column};
3612     $self->{column}++;
3613     $self->{nc}
3614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3615     } else {
3616     $self->{set_nc}->($self);
3617     }
3618    
3619     redo A;
3620     } else {
3621 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3622 wakaba 1.1 line => $self->{line_prev},
3623 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3624 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3625    
3626     $self->{ct}->{quirks} = 1;
3627     $self->{state} = BOGUS_DOCTYPE_STATE;
3628     } else {
3629    
3630     $self->{state} = BOGUS_MD_STATE;
3631     }
3632 wakaba 1.1 ## Reconsume.
3633     redo A;
3634     }
3635     } elsif ($self->{state} == SYSTEM_STATE) {
3636     ## ASCII case-insensitive
3637     if ($self->{nc} == [
3638     undef,
3639     0x0059, # Y
3640     0x0053, # S
3641     0x0054, # T
3642     0x0045, # E
3643 wakaba 1.12 ]->[length $self->{kwd}] or
3644 wakaba 1.1 $self->{nc} == [
3645     undef,
3646     0x0079, # y
3647     0x0073, # s
3648     0x0074, # t
3649     0x0065, # e
3650 wakaba 1.12 ]->[length $self->{kwd}]) {
3651 wakaba 1.1
3652     ## Stay in the state.
3653 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3654 wakaba 1.1
3655     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3656     $self->{line_prev} = $self->{line};
3657     $self->{column_prev} = $self->{column};
3658     $self->{column}++;
3659     $self->{nc}
3660     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3661     } else {
3662     $self->{set_nc}->($self);
3663     }
3664    
3665     redo A;
3666 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3667 wakaba 1.1 ($self->{nc} == 0x004D or # M
3668     $self->{nc} == 0x006D)) { # m
3669 wakaba 1.12 if ($self->{is_xml} and
3670     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3671    
3672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3673     text => 'SYSTEM',
3674     line => $self->{line_prev},
3675     column => $self->{column_prev} - 4);
3676     } else {
3677    
3678     }
3679 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3680    
3681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3682     $self->{line_prev} = $self->{line};
3683     $self->{column_prev} = $self->{column};
3684     $self->{column}++;
3685     $self->{nc}
3686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3687     } else {
3688     $self->{set_nc}->($self);
3689     }
3690    
3691     redo A;
3692     } else {
3693 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3694 wakaba 1.1 line => $self->{line_prev},
3695 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3696 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3697    
3698     $self->{ct}->{quirks} = 1;
3699     $self->{state} = BOGUS_DOCTYPE_STATE;
3700     } else {
3701    
3702     $self->{state} = BOGUS_MD_STATE;
3703     }
3704 wakaba 1.1 ## Reconsume.
3705     redo A;
3706     }
3707     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3708     if ($is_space->{$self->{nc}}) {
3709    
3710     ## Stay in the state
3711    
3712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3713     $self->{line_prev} = $self->{line};
3714     $self->{column_prev} = $self->{column};
3715     $self->{column}++;
3716     $self->{nc}
3717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3718     } else {
3719     $self->{set_nc}->($self);
3720     }
3721    
3722     redo A;
3723     } elsif ($self->{nc} eq 0x0022) { # "
3724    
3725     $self->{ct}->{pubid} = ''; # DOCTYPE
3726     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3727    
3728     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3729     $self->{line_prev} = $self->{line};
3730     $self->{column_prev} = $self->{column};
3731     $self->{column}++;
3732     $self->{nc}
3733     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3734     } else {
3735     $self->{set_nc}->($self);
3736     }
3737    
3738     redo A;
3739     } elsif ($self->{nc} eq 0x0027) { # '
3740    
3741     $self->{ct}->{pubid} = ''; # DOCTYPE
3742     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3743    
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754     redo A;
3755     } elsif ($self->{nc} eq 0x003E) { # >
3756 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3757    
3758     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3759    
3760     $self->{state} = DATA_STATE;
3761     $self->{s_kwd} = '';
3762     $self->{ct}->{quirks} = 1;
3763     } else {
3764    
3765     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3766     }
3767 wakaba 1.1
3768    
3769     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3770     $self->{line_prev} = $self->{line};
3771     $self->{column_prev} = $self->{column};
3772     $self->{column}++;
3773     $self->{nc}
3774     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3775     } else {
3776     $self->{set_nc}->($self);
3777     }
3778    
3779 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3780 wakaba 1.1 redo A;
3781     } elsif ($self->{nc} == -1) {
3782 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3783    
3784     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3785     $self->{state} = DATA_STATE;
3786     $self->{s_kwd} = '';
3787     $self->{ct}->{quirks} = 1;
3788     } else {
3789    
3790     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3791     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3792     }
3793 wakaba 1.1
3794     ## reconsume
3795     return ($self->{ct}); # DOCTYPE
3796     redo A;
3797 wakaba 1.16 } elsif ($self->{is_xml} and
3798     $self->{ct}->{type} == DOCTYPE_TOKEN and
3799     $self->{nc} == 0x005B) { # [
3800 wakaba 1.12
3801     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3802     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3803     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3804 wakaba 1.13 $self->{in_subset} = 1;
3805 wakaba 1.12
3806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3807     $self->{line_prev} = $self->{line};
3808     $self->{column_prev} = $self->{column};
3809     $self->{column}++;
3810     $self->{nc}
3811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3812     } else {
3813     $self->{set_nc}->($self);
3814     }
3815    
3816 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3817 wakaba 1.12 redo A;
3818 wakaba 1.1 } else {
3819     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3820    
3821 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3822    
3823     $self->{ct}->{quirks} = 1;
3824     $self->{state} = BOGUS_DOCTYPE_STATE;
3825     } else {
3826    
3827     $self->{state} = BOGUS_MD_STATE;
3828     }
3829    
3830 wakaba 1.1
3831     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3832     $self->{line_prev} = $self->{line};
3833     $self->{column_prev} = $self->{column};
3834     $self->{column}++;
3835     $self->{nc}
3836     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3837     } else {
3838     $self->{set_nc}->($self);
3839     }
3840    
3841     redo A;
3842     }
3843     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3844     if ($self->{nc} == 0x0022) { # "
3845    
3846     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3847    
3848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849     $self->{line_prev} = $self->{line};
3850     $self->{column_prev} = $self->{column};
3851     $self->{column}++;
3852     $self->{nc}
3853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854     } else {
3855     $self->{set_nc}->($self);
3856     }
3857    
3858     redo A;
3859     } elsif ($self->{nc} == 0x003E) { # >
3860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3861    
3862 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863    
3864     $self->{state} = DATA_STATE;
3865     $self->{s_kwd} = '';
3866     $self->{ct}->{quirks} = 1;
3867     } else {
3868    
3869     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3870     }
3871    
3872 wakaba 1.1
3873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3874     $self->{line_prev} = $self->{line};
3875     $self->{column_prev} = $self->{column};
3876     $self->{column}++;
3877     $self->{nc}
3878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3879     } else {
3880     $self->{set_nc}->($self);
3881     }
3882    
3883 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3884 wakaba 1.1 redo A;
3885     } elsif ($self->{nc} == -1) {
3886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3887    
3888 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3889    
3890     $self->{state} = DATA_STATE;
3891     $self->{s_kwd} = '';
3892     $self->{ct}->{quirks} = 1;
3893     } else {
3894    
3895     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3896     }
3897    
3898     ## Reconsume.
3899 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3900     redo A;
3901     } else {
3902    
3903 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3904 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3905     length $self->{ct}->{pubid});
3906    
3907     ## Stay in the state
3908    
3909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910     $self->{line_prev} = $self->{line};
3911     $self->{column_prev} = $self->{column};
3912     $self->{column}++;
3913     $self->{nc}
3914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3915     } else {
3916     $self->{set_nc}->($self);
3917     }
3918    
3919     redo A;
3920     }
3921     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3922     if ($self->{nc} == 0x0027) { # '
3923    
3924     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3925    
3926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3927     $self->{line_prev} = $self->{line};
3928     $self->{column_prev} = $self->{column};
3929     $self->{column}++;
3930     $self->{nc}
3931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3932     } else {
3933     $self->{set_nc}->($self);
3934     }
3935    
3936     redo A;
3937     } elsif ($self->{nc} == 0x003E) { # >
3938     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3939    
3940 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3941    
3942     $self->{state} = DATA_STATE;
3943     $self->{s_kwd} = '';
3944     $self->{ct}->{quirks} = 1;
3945     } else {
3946    
3947     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3948     }
3949    
3950 wakaba 1.1
3951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3952     $self->{line_prev} = $self->{line};
3953     $self->{column_prev} = $self->{column};
3954     $self->{column}++;
3955     $self->{nc}
3956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3957     } else {
3958     $self->{set_nc}->($self);
3959     }
3960    
3961 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3962 wakaba 1.1 redo A;
3963     } elsif ($self->{nc} == -1) {
3964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3965    
3966 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3967    
3968     $self->{state} = DATA_STATE;
3969     $self->{s_kwd} = '';
3970     $self->{ct}->{quirks} = 1;
3971     } else {
3972    
3973     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974     }
3975    
3976 wakaba 1.1 ## reconsume
3977 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3978 wakaba 1.1 redo A;
3979     } else {
3980    
3981 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3982 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3983     length $self->{ct}->{pubid});
3984    
3985     ## Stay in the state
3986    
3987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3988     $self->{line_prev} = $self->{line};
3989     $self->{column_prev} = $self->{column};
3990     $self->{column}++;
3991     $self->{nc}
3992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3993     } else {
3994     $self->{set_nc}->($self);
3995     }
3996    
3997     redo A;
3998     }
3999     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
4000     if ($is_space->{$self->{nc}}) {
4001    
4002     ## Stay in the state
4003    
4004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4005     $self->{line_prev} = $self->{line};
4006     $self->{column_prev} = $self->{column};
4007     $self->{column}++;
4008     $self->{nc}
4009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4010     } else {
4011     $self->{set_nc}->($self);
4012     }
4013    
4014     redo A;
4015     } elsif ($self->{nc} == 0x0022) { # "
4016    
4017 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4018 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4019    
4020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4021     $self->{line_prev} = $self->{line};
4022     $self->{column_prev} = $self->{column};
4023     $self->{column}++;
4024     $self->{nc}
4025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4026     } else {
4027     $self->{set_nc}->($self);
4028     }
4029    
4030     redo A;
4031     } elsif ($self->{nc} == 0x0027) { # '
4032    
4033 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4034 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4035    
4036     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4037     $self->{line_prev} = $self->{line};
4038     $self->{column_prev} = $self->{column};
4039     $self->{column}++;
4040     $self->{nc}
4041     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4042     } else {
4043     $self->{set_nc}->($self);
4044     }
4045    
4046     redo A;
4047     } elsif ($self->{nc} == 0x003E) { # >
4048 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4049     if ($self->{is_xml}) {
4050    
4051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4052     } else {
4053    
4054     }
4055     $self->{state} = DATA_STATE;
4056     $self->{s_kwd} = '';
4057 wakaba 1.12 } else {
4058 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
4059    
4060     } else {
4061    
4062     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4063     }
4064     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4065 wakaba 1.12 }
4066 wakaba 1.16
4067 wakaba 1.1
4068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069     $self->{line_prev} = $self->{line};
4070     $self->{column_prev} = $self->{column};
4071     $self->{column}++;
4072     $self->{nc}
4073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4074     } else {
4075     $self->{set_nc}->($self);
4076     }
4077    
4078 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4079 wakaba 1.1 redo A;
4080     } elsif ($self->{nc} == -1) {
4081 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4082    
4083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4084    
4085     $self->{state} = DATA_STATE;
4086     $self->{s_kwd} = '';
4087     $self->{ct}->{quirks} = 1;
4088     } else {
4089     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4090     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4091     }
4092 wakaba 1.1
4093     ## reconsume
4094 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4095 wakaba 1.1 redo A;
4096 wakaba 1.16 } elsif ($self->{is_xml} and
4097     $self->{ct}->{type} == DOCTYPE_TOKEN and
4098     $self->{nc} == 0x005B) { # [
4099 wakaba 1.12
4100     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4101     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4102     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4103 wakaba 1.13 $self->{in_subset} = 1;
4104 wakaba 1.12
4105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4106     $self->{line_prev} = $self->{line};
4107     $self->{column_prev} = $self->{column};
4108     $self->{column}++;
4109     $self->{nc}
4110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4111     } else {
4112     $self->{set_nc}->($self);
4113     }
4114    
4115 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4116 wakaba 1.12 redo A;
4117 wakaba 1.1 } else {
4118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4119    
4120 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4121    
4122     $self->{ct}->{quirks} = 1;
4123     $self->{state} = BOGUS_DOCTYPE_STATE;
4124     } else {
4125    
4126     $self->{state} = BOGUS_MD_STATE;
4127     }
4128    
4129 wakaba 1.1
4130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4131     $self->{line_prev} = $self->{line};
4132     $self->{column_prev} = $self->{column};
4133     $self->{column}++;
4134     $self->{nc}
4135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4136     } else {
4137     $self->{set_nc}->($self);
4138     }
4139    
4140     redo A;
4141     }
4142     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4143     if ($is_space->{$self->{nc}}) {
4144    
4145     ## Stay in the state
4146    
4147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4148     $self->{line_prev} = $self->{line};
4149     $self->{column_prev} = $self->{column};
4150     $self->{column}++;
4151     $self->{nc}
4152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4153     } else {
4154     $self->{set_nc}->($self);
4155     }
4156    
4157     redo A;
4158     } elsif ($self->{nc} == 0x0022) { # "
4159    
4160     $self->{ct}->{sysid} = ''; # DOCTYPE
4161     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4162    
4163     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4164     $self->{line_prev} = $self->{line};
4165     $self->{column_prev} = $self->{column};
4166     $self->{column}++;
4167     $self->{nc}
4168     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4169     } else {
4170     $self->{set_nc}->($self);
4171     }
4172    
4173     redo A;
4174     } elsif ($self->{nc} == 0x0027) { # '
4175    
4176     $self->{ct}->{sysid} = ''; # DOCTYPE
4177     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4178    
4179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4180     $self->{line_prev} = $self->{line};
4181     $self->{column_prev} = $self->{column};
4182     $self->{column}++;
4183     $self->{nc}
4184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4185     } else {
4186     $self->{set_nc}->($self);
4187     }
4188    
4189     redo A;
4190     } elsif ($self->{nc} == 0x003E) { # >
4191     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4192    
4193     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4194     $self->{line_prev} = $self->{line};
4195     $self->{column_prev} = $self->{column};
4196     $self->{column}++;
4197     $self->{nc}
4198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4199     } else {
4200     $self->{set_nc}->($self);
4201     }
4202    
4203    
4204 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4205    
4206     $self->{state} = DATA_STATE;
4207     $self->{s_kwd} = '';
4208     $self->{ct}->{quirks} = 1;
4209     } else {
4210    
4211     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4212     }
4213 wakaba 1.1
4214 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4215 wakaba 1.1 redo A;
4216     } elsif ($self->{nc} == -1) {
4217 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4218    
4219     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4220     $self->{state} = DATA_STATE;
4221     $self->{s_kwd} = '';
4222     $self->{ct}->{quirks} = 1;
4223     } else {
4224    
4225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4226     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4227     }
4228 wakaba 1.1
4229     ## reconsume
4230 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4231 wakaba 1.1 redo A;
4232 wakaba 1.16 } elsif ($self->{is_xml} and
4233     $self->{ct}->{type} == DOCTYPE_TOKEN and
4234     $self->{nc} == 0x005B) { # [
4235 wakaba 1.12
4236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4237    
4238     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4239     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4240 wakaba 1.13 $self->{in_subset} = 1;
4241 wakaba 1.12
4242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4243     $self->{line_prev} = $self->{line};
4244     $self->{column_prev} = $self->{column};
4245     $self->{column}++;
4246     $self->{nc}
4247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4248     } else {
4249     $self->{set_nc}->($self);
4250     }
4251    
4252 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4253 wakaba 1.12 redo A;
4254 wakaba 1.1 } else {
4255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4256    
4257 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4258    
4259     $self->{ct}->{quirks} = 1;
4260     $self->{state} = BOGUS_DOCTYPE_STATE;
4261     } else {
4262    
4263     $self->{state} = BOGUS_MD_STATE;
4264     }
4265    
4266 wakaba 1.1
4267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268     $self->{line_prev} = $self->{line};
4269     $self->{column_prev} = $self->{column};
4270     $self->{column}++;
4271     $self->{nc}
4272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4273     } else {
4274     $self->{set_nc}->($self);
4275     }
4276    
4277     redo A;
4278     }
4279     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4280     if ($self->{nc} == 0x0022) { # "
4281    
4282     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4283    
4284     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4285     $self->{line_prev} = $self->{line};
4286     $self->{column_prev} = $self->{column};
4287     $self->{column}++;
4288     $self->{nc}
4289     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4290     } else {
4291     $self->{set_nc}->($self);
4292     }
4293    
4294     redo A;
4295 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4296 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4297    
4298 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4299    
4300     $self->{state} = DATA_STATE;
4301     $self->{s_kwd} = '';
4302     $self->{ct}->{quirks} = 1;
4303     } else {
4304    
4305     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4306     }
4307    
4308 wakaba 1.1
4309     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4310     $self->{line_prev} = $self->{line};
4311     $self->{column_prev} = $self->{column};
4312     $self->{column}++;
4313     $self->{nc}
4314     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4315     } else {
4316     $self->{set_nc}->($self);
4317     }
4318    
4319 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4320 wakaba 1.1 redo A;
4321     } elsif ($self->{nc} == -1) {
4322     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4323    
4324 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4325    
4326     $self->{state} = DATA_STATE;
4327     $self->{s_kwd} = '';
4328     $self->{ct}->{quirks} = 1;
4329     } else {
4330    
4331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4332     }
4333    
4334 wakaba 1.1 ## reconsume
4335 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4336 wakaba 1.1 redo A;
4337     } else {
4338    
4339 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4340 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4341     length $self->{ct}->{sysid});
4342    
4343     ## Stay in the state
4344    
4345     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4346     $self->{line_prev} = $self->{line};
4347     $self->{column_prev} = $self->{column};
4348     $self->{column}++;
4349     $self->{nc}
4350     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4351     } else {
4352     $self->{set_nc}->($self);
4353     }
4354    
4355     redo A;
4356     }
4357     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4358     if ($self->{nc} == 0x0027) { # '
4359    
4360     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4361    
4362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4363     $self->{line_prev} = $self->{line};
4364     $self->{column_prev} = $self->{column};
4365     $self->{column}++;
4366     $self->{nc}
4367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4368     } else {
4369     $self->{set_nc}->($self);
4370     }
4371    
4372     redo A;
4373 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4374 wakaba 1.1
4375     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4376    
4377     $self->{state} = DATA_STATE;
4378 wakaba 1.5 $self->{s_kwd} = '';
4379 wakaba 1.1
4380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4381     $self->{line_prev} = $self->{line};
4382     $self->{column_prev} = $self->{column};
4383     $self->{column}++;
4384     $self->{nc}
4385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4386     } else {
4387     $self->{set_nc}->($self);
4388     }
4389    
4390    
4391     $self->{ct}->{quirks} = 1;
4392     return ($self->{ct}); # DOCTYPE
4393    
4394     redo A;
4395     } elsif ($self->{nc} == -1) {
4396     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4397    
4398 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4399    
4400     $self->{state} = DATA_STATE;
4401     $self->{s_kwd} = '';
4402     $self->{ct}->{quirks} = 1;
4403     } else {
4404    
4405     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4406     }
4407    
4408 wakaba 1.1 ## reconsume
4409 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4410 wakaba 1.1 redo A;
4411     } else {
4412    
4413 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4414 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4415     length $self->{ct}->{sysid});
4416    
4417     ## Stay in the state
4418    
4419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4420     $self->{line_prev} = $self->{line};
4421     $self->{column_prev} = $self->{column};
4422     $self->{column}++;
4423     $self->{nc}
4424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4425     } else {
4426     $self->{set_nc}->($self);
4427     }
4428    
4429     redo A;
4430     }
4431     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4432     if ($is_space->{$self->{nc}}) {
4433 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4434    
4435     $self->{state} = BEFORE_NDATA_STATE;
4436     } else {
4437    
4438     ## Stay in the state
4439     }
4440 wakaba 1.1
4441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4442     $self->{line_prev} = $self->{line};
4443     $self->{column_prev} = $self->{column};
4444     $self->{column}++;
4445     $self->{nc}
4446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4447     } else {
4448     $self->{set_nc}->($self);
4449     }
4450    
4451     redo A;
4452     } elsif ($self->{nc} == 0x003E) { # >
4453 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4454    
4455     $self->{state} = DATA_STATE;
4456     $self->{s_kwd} = '';
4457     } else {
4458    
4459     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4460     }
4461    
4462 wakaba 1.1
4463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4464     $self->{line_prev} = $self->{line};
4465     $self->{column_prev} = $self->{column};
4466     $self->{column}++;
4467     $self->{nc}
4468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4469     } else {
4470     $self->{set_nc}->($self);
4471     }
4472    
4473 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4474 wakaba 1.1 redo A;
4475 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4476     ($self->{nc} == 0x004E or # N
4477     $self->{nc} == 0x006E)) { # n
4478    
4479     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4480     $self->{state} = NDATA_STATE;
4481     $self->{kwd} = chr $self->{nc};
4482    
4483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4484     $self->{line_prev} = $self->{line};
4485     $self->{column_prev} = $self->{column};
4486     $self->{column}++;
4487     $self->{nc}
4488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4489     } else {
4490     $self->{set_nc}->($self);
4491     }
4492    
4493     redo A;
4494 wakaba 1.1 } elsif ($self->{nc} == -1) {
4495 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4496    
4497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4498     $self->{state} = DATA_STATE;
4499     $self->{s_kwd} = '';
4500     $self->{ct}->{quirks} = 1;
4501     } else {
4502    
4503     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4504     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505     }
4506    
4507 wakaba 1.1 ## reconsume
4508 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4509 wakaba 1.1 redo A;
4510 wakaba 1.16 } elsif ($self->{is_xml} and
4511     $self->{ct}->{type} == DOCTYPE_TOKEN and
4512     $self->{nc} == 0x005B) { # [
4513 wakaba 1.12
4514     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4515     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4516 wakaba 1.13 $self->{in_subset} = 1;
4517 wakaba 1.12
4518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4519     $self->{line_prev} = $self->{line};
4520     $self->{column_prev} = $self->{column};
4521     $self->{column}++;
4522     $self->{nc}
4523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4524     } else {
4525     $self->{set_nc}->($self);
4526     }
4527    
4528 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4529 wakaba 1.12 redo A;
4530 wakaba 1.1 } else {
4531     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4532    
4533 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4534    
4535     #$self->{ct}->{quirks} = 1;
4536     $self->{state} = BOGUS_DOCTYPE_STATE;
4537     } else {
4538    
4539     $self->{state} = BOGUS_MD_STATE;
4540     }
4541    
4542 wakaba 1.1
4543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4544     $self->{line_prev} = $self->{line};
4545     $self->{column_prev} = $self->{column};
4546     $self->{column}++;
4547     $self->{nc}
4548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4549     } else {
4550     $self->{set_nc}->($self);
4551     }
4552    
4553     redo A;
4554     }
4555 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4556     if ($is_space->{$self->{nc}}) {
4557    
4558     ## Stay in the state.
4559    
4560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4561     $self->{line_prev} = $self->{line};
4562     $self->{column_prev} = $self->{column};
4563     $self->{column}++;
4564     $self->{nc}
4565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4566     } else {
4567     $self->{set_nc}->($self);
4568     }
4569    
4570     redo A;
4571     } elsif ($self->{nc} == 0x003E) { # >
4572    
4573     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4574    
4575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4576     $self->{line_prev} = $self->{line};
4577     $self->{column_prev} = $self->{column};
4578     $self->{column}++;
4579     $self->{nc}
4580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4581     } else {
4582     $self->{set_nc}->($self);
4583     }
4584    
4585     return ($self->{ct}); # ENTITY
4586     redo A;
4587     } elsif ($self->{nc} == 0x004E or # N
4588     $self->{nc} == 0x006E) { # n
4589    
4590     $self->{state} = NDATA_STATE;
4591     $self->{kwd} = chr $self->{nc};
4592    
4593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4594     $self->{line_prev} = $self->{line};
4595     $self->{column_prev} = $self->{column};
4596     $self->{column}++;
4597     $self->{nc}
4598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4599     } else {
4600     $self->{set_nc}->($self);
4601     }
4602    
4603     redo A;
4604     } elsif ($self->{nc} == -1) {
4605    
4606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4607     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4608     ## reconsume
4609     return ($self->{ct}); # ENTITY
4610     redo A;
4611     } else {
4612    
4613     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4614     $self->{state} = BOGUS_MD_STATE;
4615    
4616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4617     $self->{line_prev} = $self->{line};
4618     $self->{column_prev} = $self->{column};
4619     $self->{column}++;
4620     $self->{nc}
4621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4622     } else {
4623     $self->{set_nc}->($self);
4624     }
4625    
4626     redo A;
4627     }
4628 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4629     if ($self->{nc} == 0x003E) { # >
4630    
4631     $self->{state} = DATA_STATE;
4632 wakaba 1.5 $self->{s_kwd} = '';
4633 wakaba 1.1
4634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4635     $self->{line_prev} = $self->{line};
4636     $self->{column_prev} = $self->{column};
4637     $self->{column}++;
4638     $self->{nc}
4639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4640     } else {
4641     $self->{set_nc}->($self);
4642     }
4643    
4644    
4645     return ($self->{ct}); # DOCTYPE
4646    
4647     redo A;
4648 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4649 wakaba 1.13
4650     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4651     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4652     $self->{in_subset} = 1;
4653    
4654 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4655     $self->{line_prev} = $self->{line};
4656     $self->{column_prev} = $self->{column};
4657     $self->{column}++;
4658     $self->{nc}
4659     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4660     } else {
4661     $self->{set_nc}->($self);
4662     }
4663    
4664 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4665     redo A;
4666 wakaba 1.1 } elsif ($self->{nc} == -1) {
4667    
4668     $self->{state} = DATA_STATE;
4669 wakaba 1.5 $self->{s_kwd} = '';
4670 wakaba 1.1 ## reconsume
4671    
4672     return ($self->{ct}); # DOCTYPE
4673    
4674     redo A;
4675     } else {
4676    
4677     my $s = '';
4678 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4679 wakaba 1.1
4680     ## Stay in the state
4681    
4682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4683     $self->{line_prev} = $self->{line};
4684     $self->{column_prev} = $self->{column};
4685     $self->{column}++;
4686     $self->{nc}
4687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4688     } else {
4689     $self->{set_nc}->($self);
4690     }
4691    
4692     redo A;
4693     }
4694     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4695     ## NOTE: "CDATA section state" in the spec is jointly implemented
4696     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4697     ## and |CDATA_SECTION_MSE2_STATE|.
4698 wakaba 1.10
4699     ## XML5: "CDATA state".
4700 wakaba 1.1
4701     if ($self->{nc} == 0x005D) { # ]
4702    
4703     $self->{state} = CDATA_SECTION_MSE1_STATE;
4704    
4705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4706     $self->{line_prev} = $self->{line};
4707     $self->{column_prev} = $self->{column};
4708     $self->{column}++;
4709     $self->{nc}
4710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4711     } else {
4712     $self->{set_nc}->($self);
4713     }
4714    
4715     redo A;
4716     } elsif ($self->{nc} == -1) {
4717 wakaba 1.6 if ($self->{is_xml}) {
4718 wakaba 1.8
4719 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4720 wakaba 1.8 } else {
4721    
4722 wakaba 1.6 }
4723    
4724 wakaba 1.1 $self->{state} = DATA_STATE;
4725 wakaba 1.5 $self->{s_kwd} = '';
4726 wakaba 1.10 ## Reconsume.
4727 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4728    
4729     return ($self->{ct}); # character
4730     } else {
4731    
4732     ## No token to emit. $self->{ct} is discarded.
4733     }
4734     redo A;
4735     } else {
4736    
4737     $self->{ct}->{data} .= chr $self->{nc};
4738     $self->{read_until}->($self->{ct}->{data},
4739     q<]>,
4740     length $self->{ct}->{data});
4741    
4742     ## Stay in the state.
4743    
4744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4745     $self->{line_prev} = $self->{line};
4746     $self->{column_prev} = $self->{column};
4747     $self->{column}++;
4748     $self->{nc}
4749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4750     } else {
4751     $self->{set_nc}->($self);
4752     }
4753    
4754     redo A;
4755     }
4756    
4757     ## ISSUE: "text tokens" in spec.
4758     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4759 wakaba 1.10 ## XML5: "CDATA bracket state".
4760    
4761 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4762    
4763     $self->{state} = CDATA_SECTION_MSE2_STATE;
4764    
4765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4766     $self->{line_prev} = $self->{line};
4767     $self->{column_prev} = $self->{column};
4768     $self->{column}++;
4769     $self->{nc}
4770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4771     } else {
4772     $self->{set_nc}->($self);
4773     }
4774    
4775     redo A;
4776     } else {
4777    
4778 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4779 wakaba 1.1 $self->{ct}->{data} .= ']';
4780 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4781 wakaba 1.1 ## Reconsume.
4782     redo A;
4783     }
4784     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4785 wakaba 1.10 ## XML5: "CDATA end state".
4786    
4787 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4788     $self->{state} = DATA_STATE;
4789 wakaba 1.5 $self->{s_kwd} = '';
4790 wakaba 1.1
4791     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4792     $self->{line_prev} = $self->{line};
4793     $self->{column_prev} = $self->{column};
4794     $self->{column}++;
4795     $self->{nc}
4796     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4797     } else {
4798     $self->{set_nc}->($self);
4799     }
4800    
4801     if (length $self->{ct}->{data}) { # character
4802    
4803     return ($self->{ct}); # character
4804     } else {
4805    
4806     ## No token to emit. $self->{ct} is discarded.
4807     }
4808     redo A;
4809     } elsif ($self->{nc} == 0x005D) { # ]
4810     # character
4811     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4812     ## Stay in the state.
4813    
4814     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4815     $self->{line_prev} = $self->{line};
4816     $self->{column_prev} = $self->{column};
4817     $self->{column}++;
4818     $self->{nc}
4819     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4820     } else {
4821     $self->{set_nc}->($self);
4822     }
4823    
4824     redo A;
4825     } else {
4826    
4827     $self->{ct}->{data} .= ']]'; # character
4828     $self->{state} = CDATA_SECTION_STATE;
4829 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4830 wakaba 1.1 redo A;
4831     }
4832     } elsif ($self->{state} == ENTITY_STATE) {
4833     if ($is_space->{$self->{nc}} or
4834     {
4835     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4836     $self->{entity_add} => 1,
4837     }->{$self->{nc}}) {
4838 wakaba 1.22 if ($self->{is_xml}) {
4839    
4840     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4841     line => $self->{line_prev},
4842     column => $self->{column_prev}
4843     + ($self->{nc} == -1 ? 1 : 0));
4844     } else {
4845    
4846     ## No error
4847     }
4848 wakaba 1.1 ## Don't consume
4849     ## Return nothing.
4850     #
4851     } elsif ($self->{nc} == 0x0023) { # #
4852    
4853     $self->{state} = ENTITY_HASH_STATE;
4854 wakaba 1.12 $self->{kwd} = '#';
4855 wakaba 1.1
4856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4857     $self->{line_prev} = $self->{line};
4858     $self->{column_prev} = $self->{column};
4859     $self->{column}++;
4860     $self->{nc}
4861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4862     } else {
4863     $self->{set_nc}->($self);
4864     }
4865    
4866     redo A;
4867 wakaba 1.22 } elsif ($self->{is_xml} or
4868     (0x0041 <= $self->{nc} and
4869 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4870     (0x0061 <= $self->{nc} and
4871     $self->{nc} <= 0x007A)) { # a..z
4872    
4873     require Whatpm::_NamedEntityList;
4874     $self->{state} = ENTITY_NAME_STATE;
4875 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4876     $self->{entity__value} = $self->{kwd};
4877 wakaba 1.1 $self->{entity__match} = 0;
4878    
4879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4880     $self->{line_prev} = $self->{line};
4881     $self->{column_prev} = $self->{column};
4882     $self->{column}++;
4883     $self->{nc}
4884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4885     } else {
4886     $self->{set_nc}->($self);
4887     }
4888    
4889     redo A;
4890     } else {
4891    
4892     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4893     ## Return nothing.
4894     #
4895     }
4896    
4897     ## NOTE: No character is consumed by the "consume a character
4898     ## reference" algorithm. In other words, there is an "&" character
4899     ## that does not introduce a character reference, which would be
4900     ## appended to the parent element or the attribute value in later
4901     ## process of the tokenizer.
4902    
4903     if ($self->{prev_state} == DATA_STATE) {
4904    
4905     $self->{state} = $self->{prev_state};
4906 wakaba 1.5 $self->{s_kwd} = '';
4907 wakaba 1.1 ## Reconsume.
4908     return ({type => CHARACTER_TOKEN, data => '&',
4909     line => $self->{line_prev},
4910     column => $self->{column_prev},
4911     });
4912     redo A;
4913     } else {
4914    
4915     $self->{ca}->{value} .= '&';
4916     $self->{state} = $self->{prev_state};
4917 wakaba 1.5 $self->{s_kwd} = '';
4918 wakaba 1.1 ## Reconsume.
4919     redo A;
4920     }
4921     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4922 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4923 wakaba 1.1
4924     $self->{state} = HEXREF_X_STATE;
4925 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4926 wakaba 1.1
4927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4928     $self->{line_prev} = $self->{line};
4929     $self->{column_prev} = $self->{column};
4930     $self->{column}++;
4931     $self->{nc}
4932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4933     } else {
4934     $self->{set_nc}->($self);
4935     }
4936    
4937     redo A;
4938 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4939    
4940     if ($self->{is_xml}) {
4941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4942     }
4943     $self->{state} = HEXREF_X_STATE;
4944     $self->{kwd} .= chr $self->{nc};
4945    
4946     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4947     $self->{line_prev} = $self->{line};
4948     $self->{column_prev} = $self->{column};
4949     $self->{column}++;
4950     $self->{nc}
4951     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4952     } else {
4953     $self->{set_nc}->($self);
4954     }
4955    
4956     redo A;
4957 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4958     $self->{nc} <= 0x0039) { # 0..9
4959    
4960     $self->{state} = NCR_NUM_STATE;
4961 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4962 wakaba 1.1
4963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964     $self->{line_prev} = $self->{line};
4965     $self->{column_prev} = $self->{column};
4966     $self->{column}++;
4967     $self->{nc}
4968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969     } else {
4970     $self->{set_nc}->($self);
4971     }
4972    
4973     redo A;
4974     } else {
4975     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4976     line => $self->{line_prev},
4977     column => $self->{column_prev} - 1);
4978    
4979     ## NOTE: According to the spec algorithm, nothing is returned,
4980     ## and then "&#" is appended to the parent element or the attribute
4981     ## value in the later processing.
4982    
4983     if ($self->{prev_state} == DATA_STATE) {
4984    
4985     $self->{state} = $self->{prev_state};
4986 wakaba 1.5 $self->{s_kwd} = '';
4987 wakaba 1.1 ## Reconsume.
4988     return ({type => CHARACTER_TOKEN,
4989     data => '&#',
4990     line => $self->{line_prev},
4991     column => $self->{column_prev} - 1,
4992     });
4993     redo A;
4994     } else {
4995    
4996     $self->{ca}->{value} .= '&#';
4997     $self->{state} = $self->{prev_state};
4998 wakaba 1.5 $self->{s_kwd} = '';
4999 wakaba 1.1 ## Reconsume.
5000     redo A;
5001     }
5002     }
5003     } elsif ($self->{state} == NCR_NUM_STATE) {
5004     if (0x0030 <= $self->{nc} and
5005     $self->{nc} <= 0x0039) { # 0..9
5006    
5007 wakaba 1.12 $self->{kwd} *= 10;
5008     $self->{kwd} += $self->{nc} - 0x0030;
5009 wakaba 1.1
5010     ## Stay in the state.
5011    
5012     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5013     $self->{line_prev} = $self->{line};
5014     $self->{column_prev} = $self->{column};
5015     $self->{column}++;
5016     $self->{nc}
5017     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5018     } else {
5019     $self->{set_nc}->($self);
5020     }
5021    
5022     redo A;
5023     } elsif ($self->{nc} == 0x003B) { # ;
5024    
5025    
5026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5027     $self->{line_prev} = $self->{line};
5028     $self->{column_prev} = $self->{column};
5029     $self->{column}++;
5030     $self->{nc}
5031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5032     } else {
5033     $self->{set_nc}->($self);
5034     }
5035    
5036     #
5037     } else {
5038    
5039     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5040     ## Reconsume.
5041     #
5042     }
5043    
5044 wakaba 1.12 my $code = $self->{kwd};
5045 wakaba 1.1 my $l = $self->{line_prev};
5046     my $c = $self->{column_prev};
5047 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5048     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5049     ($self->{is_xml} and $code == 0x0000)) {
5050 wakaba 1.1
5051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5052     text => (sprintf 'U+%04X', $code),
5053     line => $l, column => $c);
5054     $code = $charref_map->{$code};
5055     } elsif ($code > 0x10FFFF) {
5056    
5057     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5058     text => (sprintf 'U-%08X', $code),
5059     line => $l, column => $c);
5060     $code = 0xFFFD;
5061     }
5062    
5063     if ($self->{prev_state} == DATA_STATE) {
5064    
5065     $self->{state} = $self->{prev_state};
5066 wakaba 1.5 $self->{s_kwd} = '';
5067 wakaba 1.1 ## Reconsume.
5068     return ({type => CHARACTER_TOKEN, data => chr $code,
5069 wakaba 1.7 has_reference => 1,
5070 wakaba 1.1 line => $l, column => $c,
5071     });
5072     redo A;
5073     } else {
5074    
5075     $self->{ca}->{value} .= chr $code;
5076     $self->{ca}->{has_reference} = 1;
5077     $self->{state} = $self->{prev_state};
5078 wakaba 1.5 $self->{s_kwd} = '';
5079 wakaba 1.1 ## Reconsume.
5080     redo A;
5081     }
5082     } elsif ($self->{state} == HEXREF_X_STATE) {
5083     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
5084     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
5085     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
5086     # 0..9, A..F, a..f
5087    
5088     $self->{state} = HEXREF_HEX_STATE;
5089 wakaba 1.12 $self->{kwd} = 0;
5090 wakaba 1.1 ## Reconsume.
5091     redo A;
5092     } else {
5093     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
5094     line => $self->{line_prev},
5095     column => $self->{column_prev} - 2);
5096    
5097     ## NOTE: According to the spec algorithm, nothing is returned,
5098     ## and then "&#" followed by "X" or "x" is appended to the parent
5099     ## element or the attribute value in the later processing.
5100    
5101     if ($self->{prev_state} == DATA_STATE) {
5102    
5103     $self->{state} = $self->{prev_state};
5104 wakaba 1.5 $self->{s_kwd} = '';
5105 wakaba 1.1 ## Reconsume.
5106     return ({type => CHARACTER_TOKEN,
5107 wakaba 1.12 data => '&' . $self->{kwd},
5108 wakaba 1.1 line => $self->{line_prev},
5109 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
5110 wakaba 1.1 });
5111     redo A;
5112     } else {
5113    
5114 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
5115 wakaba 1.1 $self->{state} = $self->{prev_state};
5116 wakaba 1.5 $self->{s_kwd} = '';
5117 wakaba 1.1 ## Reconsume.
5118     redo A;
5119     }
5120     }
5121     } elsif ($self->{state} == HEXREF_HEX_STATE) {
5122     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5123     # 0..9
5124    
5125 wakaba 1.12 $self->{kwd} *= 0x10;
5126     $self->{kwd} += $self->{nc} - 0x0030;
5127 wakaba 1.1 ## Stay in the state.
5128    
5129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5130     $self->{line_prev} = $self->{line};
5131     $self->{column_prev} = $self->{column};
5132     $self->{column}++;
5133     $self->{nc}
5134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5135     } else {
5136     $self->{set_nc}->($self);
5137     }
5138    
5139     redo A;
5140     } elsif (0x0061 <= $self->{nc} and
5141     $self->{nc} <= 0x0066) { # a..f
5142    
5143 wakaba 1.12 $self->{kwd} *= 0x10;
5144     $self->{kwd} += $self->{nc} - 0x0060 + 9;
5145 wakaba 1.1 ## Stay in the state.
5146    
5147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5148     $self->{line_prev} = $self->{line};
5149     $self->{column_prev} = $self->{column};
5150     $self->{column}++;
5151     $self->{nc}
5152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5153     } else {
5154     $self->{set_nc}->($self);
5155     }
5156    
5157     redo A;
5158     } elsif (0x0041 <= $self->{nc} and
5159     $self->{nc} <= 0x0046) { # A..F
5160    
5161 wakaba 1.12 $self->{kwd} *= 0x10;
5162     $self->{kwd} += $self->{nc} - 0x0040 + 9;
5163 wakaba 1.1 ## Stay in the state.
5164    
5165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5166     $self->{line_prev} = $self->{line};
5167     $self->{column_prev} = $self->{column};
5168     $self->{column}++;
5169     $self->{nc}
5170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5171     } else {
5172     $self->{set_nc}->($self);
5173     }
5174    
5175     redo A;
5176     } elsif ($self->{nc} == 0x003B) { # ;
5177    
5178    
5179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5180     $self->{line_prev} = $self->{line};
5181     $self->{column_prev} = $self->{column};
5182     $self->{column}++;
5183     $self->{nc}
5184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5185     } else {
5186     $self->{set_nc}->($self);
5187     }
5188    
5189     #
5190     } else {
5191    
5192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5193     line => $self->{line},
5194     column => $self->{column});
5195     ## Reconsume.
5196     #
5197     }
5198    
5199 wakaba 1.12 my $code = $self->{kwd};
5200 wakaba 1.1 my $l = $self->{line_prev};
5201     my $c = $self->{column_prev};
5202 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5203     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5204     ($self->{is_xml} and $code == 0x0000)) {
5205 wakaba 1.1
5206     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5207     text => (sprintf 'U+%04X', $code),
5208     line => $l, column => $c);
5209     $code = $charref_map->{$code};
5210     } elsif ($code > 0x10FFFF) {
5211    
5212     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5213     text => (sprintf 'U-%08X', $code),
5214     line => $l, column => $c);
5215     $code = 0xFFFD;
5216     }
5217    
5218     if ($self->{prev_state} == DATA_STATE) {
5219    
5220     $self->{state} = $self->{prev_state};
5221 wakaba 1.5 $self->{s_kwd} = '';
5222 wakaba 1.1 ## Reconsume.
5223     return ({type => CHARACTER_TOKEN, data => chr $code,
5224 wakaba 1.7 has_reference => 1,
5225 wakaba 1.1 line => $l, column => $c,
5226     });
5227     redo A;
5228     } else {
5229    
5230     $self->{ca}->{value} .= chr $code;
5231     $self->{ca}->{has_reference} = 1;
5232     $self->{state} = $self->{prev_state};
5233 wakaba 1.5 $self->{s_kwd} = '';
5234 wakaba 1.1 ## Reconsume.
5235     redo A;
5236     }
5237     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5238 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5239     $self->{nc} <= 0x005A) or # Z
5240     (0x0061 <= $self->{nc} and # a
5241     $self->{nc} <= 0x007A) or # z
5242     (0x0030 <= $self->{nc} and # 0
5243     $self->{nc} <= 0x0039) or # 9
5244 wakaba 1.22 $self->{nc} == 0x003B or # ;
5245     ($self->{is_xml} and
5246     not ($is_space->{$self->{nc}} or
5247     {
5248     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5249     $self->{entity_add} => 1,
5250     }->{$self->{nc}}))) {
5251 wakaba 1.1 our $EntityChar;
5252 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5253 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5254     $self->{ge}->{$self->{kwd}}) {
5255 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5256 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5257     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5258    
5259     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5260     } else {
5261     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5262    
5263     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5264     value => $self->{kwd});
5265     } else {
5266    
5267     }
5268     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5269     }
5270     } else {
5271     if ($self->{is_xml}) {
5272    
5273     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5274     value => $self->{kwd},
5275     level => {
5276     'amp;' => $self->{level}->{warn},
5277     'quot;' => $self->{level}->{warn},
5278     'lt;' => $self->{level}->{warn},
5279     'gt;' => $self->{level}->{warn},
5280     'apos;' => $self->{level}->{warn},
5281     }->{$self->{kwd}} ||
5282     $self->{level}->{must});
5283     } else {
5284    
5285     }
5286     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5287     }
5288 wakaba 1.1 $self->{entity__match} = 1;
5289    
5290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5291     $self->{line_prev} = $self->{line};
5292     $self->{column_prev} = $self->{column};
5293     $self->{column}++;
5294     $self->{nc}
5295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5296     } else {
5297     $self->{set_nc}->($self);
5298     }
5299    
5300     #
5301     } else {
5302    
5303 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5304 wakaba 1.1 $self->{entity__match} = -1;
5305     ## Stay in the state.
5306    
5307     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5308     $self->{line_prev} = $self->{line};
5309     $self->{column_prev} = $self->{column};
5310     $self->{column}++;
5311     $self->{nc}
5312     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5313     } else {
5314     $self->{set_nc}->($self);
5315     }
5316    
5317     redo A;
5318     }
5319     } else {
5320    
5321     $self->{entity__value} .= chr $self->{nc};
5322     $self->{entity__match} *= 2;
5323     ## Stay in the state.
5324    
5325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326     $self->{line_prev} = $self->{line};
5327     $self->{column_prev} = $self->{column};
5328     $self->{column}++;
5329     $self->{nc}
5330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331     } else {
5332     $self->{set_nc}->($self);
5333     }
5334    
5335     redo A;
5336     }
5337     }
5338    
5339     my $data;
5340     my $has_ref;
5341     if ($self->{entity__match} > 0) {
5342    
5343     $data = $self->{entity__value};
5344     $has_ref = 1;
5345     #
5346     } elsif ($self->{entity__match} < 0) {
5347     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5348     if ($self->{prev_state} != DATA_STATE and # in attribute
5349     $self->{entity__match} < -1) {
5350    
5351 wakaba 1.12 $data = '&' . $self->{kwd};
5352 wakaba 1.1 #
5353     } else {
5354    
5355     $data = $self->{entity__value};
5356     $has_ref = 1;
5357     #
5358     }
5359     } else {
5360    
5361     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5362     line => $self->{line_prev},
5363 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5364     $data = '&' . $self->{kwd};
5365 wakaba 1.1 #
5366     }
5367    
5368     ## NOTE: In these cases, when a character reference is found,
5369     ## it is consumed and a character token is returned, or, otherwise,
5370     ## nothing is consumed and returned, according to the spec algorithm.
5371     ## In this implementation, anything that has been examined by the
5372     ## tokenizer is appended to the parent element or the attribute value
5373     ## as string, either literal string when no character reference or
5374     ## entity-replaced string otherwise, in this stage, since any characters
5375     ## that would not be consumed are appended in the data state or in an
5376     ## appropriate attribute value state anyway.
5377    
5378     if ($self->{prev_state} == DATA_STATE) {
5379    
5380     $self->{state} = $self->{prev_state};
5381 wakaba 1.5 $self->{s_kwd} = '';
5382 wakaba 1.1 ## Reconsume.
5383     return ({type => CHARACTER_TOKEN,
5384     data => $data,
5385 wakaba 1.7 has_reference => $has_ref,
5386 wakaba 1.1 line => $self->{line_prev},
5387 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5388 wakaba 1.1 });
5389     redo A;
5390     } else {
5391    
5392     $self->{ca}->{value} .= $data;
5393     $self->{ca}->{has_reference} = 1 if $has_ref;
5394     $self->{state} = $self->{prev_state};
5395 wakaba 1.5 $self->{s_kwd} = '';
5396 wakaba 1.1 ## Reconsume.
5397     redo A;
5398     }
5399 wakaba 1.8
5400     ## XML-only states
5401    
5402     } elsif ($self->{state} == PI_STATE) {
5403 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5404    
5405 wakaba 1.8 if ($is_space->{$self->{nc}} or
5406 wakaba 1.14 $self->{nc} == 0x003F or # ?
5407 wakaba 1.8 $self->{nc} == -1) {
5408 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5409     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5410     ## "DOCTYPE pi state": Parse error, switch to the "data
5411     ## state".
5412 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5413     line => $self->{line_prev},
5414     column => $self->{column_prev}
5415     - 1 * ($self->{nc} != -1));
5416     $self->{state} = BOGUS_COMMENT_STATE;
5417     ## Reconsume.
5418     $self->{ct} = {type => COMMENT_TOKEN,
5419     data => '?',
5420     line => $self->{line_prev},
5421     column => $self->{column_prev}
5422     - 1 * ($self->{nc} != -1),
5423     };
5424     redo A;
5425     } else {
5426 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5427 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5428     target => chr $self->{nc},
5429     data => '',
5430     line => $self->{line_prev},
5431     column => $self->{column_prev} - 1,
5432     };
5433     $self->{state} = PI_TARGET_STATE;
5434    
5435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436     $self->{line_prev} = $self->{line};
5437     $self->{column_prev} = $self->{column};
5438     $self->{column}++;
5439     $self->{nc}
5440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441     } else {
5442     $self->{set_nc}->($self);
5443     }
5444    
5445     redo A;
5446     }
5447     } elsif ($self->{state} == PI_TARGET_STATE) {
5448     if ($is_space->{$self->{nc}}) {
5449     $self->{state} = PI_TARGET_AFTER_STATE;
5450    
5451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5452     $self->{line_prev} = $self->{line};
5453     $self->{column_prev} = $self->{column};
5454     $self->{column}++;
5455     $self->{nc}
5456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5457     } else {
5458     $self->{set_nc}->($self);
5459     }
5460    
5461     redo A;
5462     } elsif ($self->{nc} == -1) {
5463     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5464 wakaba 1.13 if ($self->{in_subset}) {
5465     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5466     } else {
5467     $self->{state} = DATA_STATE;
5468     $self->{s_kwd} = '';
5469     }
5470 wakaba 1.8 ## Reconsume.
5471     return ($self->{ct}); # pi
5472     redo A;
5473     } elsif ($self->{nc} == 0x003F) { # ?
5474     $self->{state} = PI_AFTER_STATE;
5475    
5476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5477     $self->{line_prev} = $self->{line};
5478     $self->{column_prev} = $self->{column};
5479     $self->{column}++;
5480     $self->{nc}
5481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5482     } else {
5483     $self->{set_nc}->($self);
5484     }
5485    
5486     redo A;
5487     } else {
5488     ## XML5: typo ("tag name" -> "target")
5489     $self->{ct}->{target} .= chr $self->{nc}; # pi
5490    
5491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5492     $self->{line_prev} = $self->{line};
5493     $self->{column_prev} = $self->{column};
5494     $self->{column}++;
5495     $self->{nc}
5496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5497     } else {
5498     $self->{set_nc}->($self);
5499     }
5500    
5501     redo A;
5502     }
5503     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5504     if ($is_space->{$self->{nc}}) {
5505     ## Stay in the state.
5506    
5507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5508     $self->{line_prev} = $self->{line};
5509     $self->{column_prev} = $self->{column};
5510     $self->{column}++;
5511     $self->{nc}
5512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5513     } else {
5514     $self->{set_nc}->($self);
5515     }
5516    
5517     redo A;
5518     } else {
5519     $self->{state} = PI_DATA_STATE;
5520     ## Reprocess.
5521     redo A;
5522     }
5523     } elsif ($self->{state} == PI_DATA_STATE) {
5524     if ($self->{nc} == 0x003F) { # ?
5525     $self->{state} = PI_DATA_AFTER_STATE;
5526    
5527     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5528     $self->{line_prev} = $self->{line};
5529     $self->{column_prev} = $self->{column};
5530     $self->{column}++;
5531     $self->{nc}
5532     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5533     } else {
5534     $self->{set_nc}->($self);
5535     }
5536    
5537     redo A;
5538     } elsif ($self->{nc} == -1) {
5539     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5540 wakaba 1.13 if ($self->{in_subset}) {
5541 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5542 wakaba 1.13 } else {
5543     $self->{state} = DATA_STATE;
5544     $self->{s_kwd} = '';
5545     }
5546 wakaba 1.8 ## Reprocess.
5547     return ($self->{ct}); # pi
5548     redo A;
5549     } else {
5550     $self->{ct}->{data} .= chr $self->{nc}; # pi
5551     $self->{read_until}->($self->{ct}->{data}, q[?],
5552     length $self->{ct}->{data});
5553     ## Stay in the state.
5554    
5555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5556     $self->{line_prev} = $self->{line};
5557     $self->{column_prev} = $self->{column};
5558     $self->{column}++;
5559     $self->{nc}
5560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5561     } else {
5562     $self->{set_nc}->($self);
5563     }
5564    
5565     ## Reprocess.
5566     redo A;
5567     }
5568     } elsif ($self->{state} == PI_AFTER_STATE) {
5569 wakaba 1.14 ## XML5: Part of "Pi after state".
5570    
5571 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5572 wakaba 1.13 if ($self->{in_subset}) {
5573     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5574     } else {
5575     $self->{state} = DATA_STATE;
5576     $self->{s_kwd} = '';
5577     }
5578 wakaba 1.8
5579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5580     $self->{line_prev} = $self->{line};
5581     $self->{column_prev} = $self->{column};
5582     $self->{column}++;
5583     $self->{nc}
5584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5585     } else {
5586     $self->{set_nc}->($self);
5587     }
5588    
5589     return ($self->{ct}); # pi
5590     redo A;
5591     } elsif ($self->{nc} == 0x003F) { # ?
5592     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5593     line => $self->{line_prev},
5594     column => $self->{column_prev}); ## XML5: no error
5595     $self->{ct}->{data} .= '?';
5596     $self->{state} = PI_DATA_AFTER_STATE;
5597    
5598     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5599     $self->{line_prev} = $self->{line};
5600     $self->{column_prev} = $self->{column};
5601     $self->{column}++;
5602     $self->{nc}
5603     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5604     } else {
5605     $self->{set_nc}->($self);
5606     }
5607    
5608     redo A;
5609     } else {
5610     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5611     line => $self->{line_prev},
5612     column => $self->{column_prev}
5613     + 1 * ($self->{nc} == -1)); ## XML5: no error
5614     $self->{ct}->{data} .= '?'; ## XML5: not appended
5615     $self->{state} = PI_DATA_STATE;
5616     ## Reprocess.
5617     redo A;
5618     }
5619     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5620 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5621    
5622 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5623 wakaba 1.13 if ($self->{in_subset}) {
5624     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5625     } else {
5626     $self->{state} = DATA_STATE;
5627     $self->{s_kwd} = '';
5628     }
5629 wakaba 1.8
5630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5631     $self->{line_prev} = $self->{line};
5632     $self->{column_prev} = $self->{column};
5633     $self->{column}++;
5634     $self->{nc}
5635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5636     } else {
5637     $self->{set_nc}->($self);
5638     }
5639    
5640     return ($self->{ct}); # pi
5641     redo A;
5642     } elsif ($self->{nc} == 0x003F) { # ?
5643     $self->{ct}->{data} .= '?';
5644     ## Stay in the state.
5645    
5646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5647     $self->{line_prev} = $self->{line};
5648     $self->{column_prev} = $self->{column};
5649     $self->{column}++;
5650     $self->{nc}
5651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5652     } else {
5653     $self->{set_nc}->($self);
5654     }
5655    
5656     redo A;
5657     } else {
5658     $self->{ct}->{data} .= '?'; ## XML5: not appended
5659     $self->{state} = PI_DATA_STATE;
5660     ## Reprocess.
5661     redo A;
5662     }
5663 wakaba 1.12
5664     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5665     if ($self->{nc} == 0x003C) { # <
5666 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5667 wakaba 1.12
5668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5669     $self->{line_prev} = $self->{line};
5670     $self->{column_prev} = $self->{column};
5671     $self->{column}++;
5672     $self->{nc}
5673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5674     } else {
5675     $self->{set_nc}->($self);
5676     }
5677    
5678     redo A;
5679     } elsif ($self->{nc} == 0x0025) { # %
5680     ## XML5: Not defined yet.
5681    
5682     ## TODO:
5683 wakaba 1.24
5684     if (not $self->{stop_processing} and
5685     not $self->{document}->xml_standalone) {
5686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5687     level => $self->{level}->{info});
5688     $self->{stop_processing} = 1;
5689     }
5690    
5691 wakaba 1.12
5692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5693     $self->{line_prev} = $self->{line};
5694     $self->{column_prev} = $self->{column};
5695     $self->{column}++;
5696     $self->{nc}
5697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5698     } else {
5699     $self->{set_nc}->($self);
5700     }
5701    
5702     redo A;
5703     } elsif ($self->{nc} == 0x005D) { # ]
5704 wakaba 1.13 delete $self->{in_subset};
5705 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5706    
5707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5708     $self->{line_prev} = $self->{line};
5709     $self->{column_prev} = $self->{column};
5710     $self->{column}++;
5711     $self->{nc}
5712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5713     } else {
5714     $self->{set_nc}->($self);
5715     }
5716    
5717     redo A;
5718     } elsif ($is_space->{$self->{nc}}) {
5719     ## Stay in the state.
5720    
5721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5722     $self->{line_prev} = $self->{line};
5723     $self->{column_prev} = $self->{column};
5724     $self->{column}++;
5725     $self->{nc}
5726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5727     } else {
5728     $self->{set_nc}->($self);
5729     }
5730    
5731     redo A;
5732     } elsif ($self->{nc} == -1) {
5733     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5734 wakaba 1.13 delete $self->{in_subset};
5735 wakaba 1.12 $self->{state} = DATA_STATE;
5736     $self->{s_kwd} = '';
5737     ## Reconsume.
5738 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5739 wakaba 1.12 redo A;
5740     } else {
5741     unless ($self->{internal_subset_tainted}) {
5742     ## XML5: No parse error.
5743     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5744     $self->{internal_subset_tainted} = 1;
5745     }
5746     ## Stay in the state.
5747    
5748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749     $self->{line_prev} = $self->{line};
5750     $self->{column_prev} = $self->{column};
5751     $self->{column}++;
5752     $self->{nc}
5753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754     } else {
5755     $self->{set_nc}->($self);
5756     }
5757    
5758     redo A;
5759     }
5760     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5761     if ($self->{nc} == 0x003E) { # >
5762     $self->{state} = DATA_STATE;
5763     $self->{s_kwd} = '';
5764    
5765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5766     $self->{line_prev} = $self->{line};
5767     $self->{column_prev} = $self->{column};
5768     $self->{column}++;
5769     $self->{nc}
5770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5771     } else {
5772     $self->{set_nc}->($self);
5773     }
5774    
5775 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5776 wakaba 1.12 redo A;
5777     } elsif ($self->{nc} == -1) {
5778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5779     $self->{state} = DATA_STATE;
5780     $self->{s_kwd} = '';
5781     ## Reconsume.
5782 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5783 wakaba 1.12 redo A;
5784     } else {
5785     ## XML5: No parse error and stay in the state.
5786     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5787    
5788 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5789    
5790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5791     $self->{line_prev} = $self->{line};
5792     $self->{column_prev} = $self->{column};
5793     $self->{column}++;
5794     $self->{nc}
5795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5796     } else {
5797     $self->{set_nc}->($self);
5798     }
5799    
5800     redo A;
5801     }
5802     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5803     if ($self->{nc} == 0x003E) { # >
5804     $self->{state} = DATA_STATE;
5805     $self->{s_kwd} = '';
5806    
5807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808     $self->{line_prev} = $self->{line};
5809     $self->{column_prev} = $self->{column};
5810     $self->{column}++;
5811     $self->{nc}
5812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813     } else {
5814     $self->{set_nc}->($self);
5815     }
5816    
5817     return ({type => END_OF_DOCTYPE_TOKEN});
5818     redo A;
5819     } elsif ($self->{nc} == -1) {
5820     $self->{state} = DATA_STATE;
5821     $self->{s_kwd} = '';
5822     ## Reconsume.
5823     return ({type => END_OF_DOCTYPE_TOKEN});
5824     redo A;
5825     } else {
5826     ## Stay in the state.
5827    
5828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5829     $self->{line_prev} = $self->{line};
5830     $self->{column_prev} = $self->{column};
5831     $self->{column}++;
5832     $self->{nc}
5833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5834     } else {
5835     $self->{set_nc}->($self);
5836     }
5837    
5838     redo A;
5839     }
5840     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5841     if ($self->{nc} == 0x0021) { # !
5842 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5843 wakaba 1.13
5844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5845     $self->{line_prev} = $self->{line};
5846     $self->{column_prev} = $self->{column};
5847     $self->{column}++;
5848     $self->{nc}
5849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5850     } else {
5851     $self->{set_nc}->($self);
5852     }
5853    
5854     redo A;
5855     } elsif ($self->{nc} == 0x003F) { # ?
5856     $self->{state} = PI_STATE;
5857    
5858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5859     $self->{line_prev} = $self->{line};
5860     $self->{column_prev} = $self->{column};
5861     $self->{column}++;
5862     $self->{nc}
5863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5864     } else {
5865     $self->{set_nc}->($self);
5866     }
5867    
5868     redo A;
5869     } elsif ($self->{nc} == -1) {
5870     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5871     $self->{state} = DATA_STATE;
5872     $self->{s_kwd} = '';
5873     ## Reconsume.
5874     redo A;
5875     } else {
5876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5877     line => $self->{line_prev},
5878     column => $self->{column_prev});
5879     $self->{state} = BOGUS_COMMENT_STATE;
5880     $self->{ct} = {type => COMMENT_TOKEN,
5881     data => '',
5882     }; ## NOTE: Will be discarded.
5883 wakaba 1.12
5884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885     $self->{line_prev} = $self->{line};
5886     $self->{column_prev} = $self->{column};
5887     $self->{column}++;
5888     $self->{nc}
5889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890     } else {
5891     $self->{set_nc}->($self);
5892     }
5893    
5894     redo A;
5895     }
5896 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5897     ## XML5: "DOCTYPE markup declaration state".
5898    
5899     if ($self->{nc} == 0x002D) { # -
5900     $self->{state} = MD_HYPHEN_STATE;
5901    
5902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5903     $self->{line_prev} = $self->{line};
5904     $self->{column_prev} = $self->{column};
5905     $self->{column}++;
5906     $self->{nc}
5907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5908     } else {
5909     $self->{set_nc}->($self);
5910     }
5911    
5912     redo A;
5913 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5914     $self->{nc} == 0x0065) { # e
5915 wakaba 1.14 $self->{state} = MD_E_STATE;
5916     $self->{kwd} = chr $self->{nc};
5917    
5918     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5919     $self->{line_prev} = $self->{line};
5920     $self->{column_prev} = $self->{column};
5921     $self->{column}++;
5922     $self->{nc}
5923     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5924     } else {
5925     $self->{set_nc}->($self);
5926     }
5927    
5928     redo A;
5929 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5930     $self->{nc} == 0x0061) { # a
5931 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5932     $self->{kwd} = chr $self->{nc};
5933    
5934     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5935     $self->{line_prev} = $self->{line};
5936     $self->{column_prev} = $self->{column};
5937     $self->{column}++;
5938     $self->{nc}
5939     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5940     } else {
5941     $self->{set_nc}->($self);
5942     }
5943    
5944     redo A;
5945 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5946     $self->{nc} == 0x006E) { # n
5947 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5948     $self->{kwd} = chr $self->{nc};
5949    
5950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5951     $self->{line_prev} = $self->{line};
5952     $self->{column_prev} = $self->{column};
5953     $self->{column}++;
5954     $self->{nc}
5955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5956     } else {
5957     $self->{set_nc}->($self);
5958     }
5959    
5960     redo A;
5961     } else {
5962     #
5963     }
5964    
5965     ## XML5: No parse error.
5966     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5967     line => $self->{line_prev},
5968     column => $self->{column_prev} - 1);
5969     ## Reconsume.
5970     $self->{state} = BOGUS_COMMENT_STATE;
5971     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5972     redo A;
5973     } elsif ($self->{state} == MD_E_STATE) {
5974 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5975     $self->{nc} == 0x006E) { # n
5976 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5977     $self->{kwd} .= chr $self->{nc};
5978    
5979     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5980     $self->{line_prev} = $self->{line};
5981     $self->{column_prev} = $self->{column};
5982     $self->{column}++;
5983     $self->{nc}
5984     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5985     } else {
5986     $self->{set_nc}->($self);
5987     }
5988    
5989     redo A;
5990 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5991     $self->{nc} == 0x006C) { # l
5992 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5993     $self->{state} = MD_ELEMENT_STATE;
5994     $self->{kwd} .= chr $self->{nc};
5995    
5996     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5997     $self->{line_prev} = $self->{line};
5998     $self->{column_prev} = $self->{column};
5999     $self->{column}++;
6000     $self->{nc}
6001     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6002     } else {
6003     $self->{set_nc}->($self);
6004     }
6005    
6006     redo A;
6007     } else {
6008     ## XML5: No parse error.
6009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6010     line => $self->{line_prev},
6011     column => $self->{column_prev} - 2
6012     + 1 * ($self->{nc} == -1));
6013     ## Reconsume.
6014     $self->{state} = BOGUS_COMMENT_STATE;
6015     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6016     redo A;
6017     }
6018     } elsif ($self->{state} == MD_ENTITY_STATE) {
6019 wakaba 1.17 if ($self->{nc} == [
6020     undef,
6021     undef,
6022     0x0054, # T
6023     0x0049, # I
6024     0x0054, # T
6025     ]->[length $self->{kwd}] or
6026     $self->{nc} == [
6027     undef,
6028     undef,
6029     0x0074, # t
6030     0x0069, # i
6031     0x0074, # t
6032     ]->[length $self->{kwd}]) {
6033 wakaba 1.14 ## Stay in the state.
6034     $self->{kwd} .= chr $self->{nc};
6035    
6036     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6037     $self->{line_prev} = $self->{line};
6038     $self->{column_prev} = $self->{column};
6039     $self->{column}++;
6040     $self->{nc}
6041     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6042     } else {
6043     $self->{set_nc}->($self);
6044     }
6045    
6046     redo A;
6047 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
6048     ($self->{nc} == 0x0059 or # Y
6049     $self->{nc} == 0x0079)) { # y
6050     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6052     text => 'ENTITY',
6053     line => $self->{line_prev},
6054     column => $self->{column_prev} - 4);
6055     }
6056     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6057 wakaba 1.14 line => $self->{line_prev},
6058     column => $self->{column_prev} - 6};
6059     $self->{state} = DOCTYPE_MD_STATE;
6060    
6061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6062     $self->{line_prev} = $self->{line};
6063     $self->{column_prev} = $self->{column};
6064     $self->{column}++;
6065     $self->{nc}
6066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6067     } else {
6068     $self->{set_nc}->($self);
6069     }
6070    
6071     redo A;
6072     } else {
6073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6074     line => $self->{line_prev},
6075     column => $self->{column_prev} - 1
6076     - (length $self->{kwd})
6077     + 1 * ($self->{nc} == -1));
6078     $self->{state} = BOGUS_COMMENT_STATE;
6079     ## Reconsume.
6080     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6081     redo A;
6082     }
6083     } elsif ($self->{state} == MD_ELEMENT_STATE) {
6084 wakaba 1.17 if ($self->{nc} == [
6085     undef,
6086     undef,
6087     0x0045, # E
6088     0x004D, # M
6089     0x0045, # E
6090     0x004E, # N
6091     ]->[length $self->{kwd}] or
6092     $self->{nc} == [
6093     undef,
6094     undef,
6095     0x0065, # e
6096     0x006D, # m
6097     0x0065, # e
6098     0x006E, # n
6099     ]->[length $self->{kwd}]) {
6100 wakaba 1.14 ## Stay in the state.
6101     $self->{kwd} .= chr $self->{nc};
6102    
6103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6104     $self->{line_prev} = $self->{line};
6105     $self->{column_prev} = $self->{column};
6106     $self->{column}++;
6107     $self->{nc}
6108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6109     } else {
6110     $self->{set_nc}->($self);
6111     }
6112    
6113     redo A;
6114 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6115     ($self->{nc} == 0x0054 or # T
6116     $self->{nc} == 0x0074)) { # t
6117     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6119     text => 'ELEMENT',
6120     line => $self->{line_prev},
6121     column => $self->{column_prev} - 5);
6122     }
6123 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6124     line => $self->{line_prev},
6125 wakaba 1.23 column => $self->{column_prev} - 7};
6126 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6127    
6128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6129     $self->{line_prev} = $self->{line};
6130     $self->{column_prev} = $self->{column};
6131     $self->{column}++;
6132     $self->{nc}
6133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6134     } else {
6135     $self->{set_nc}->($self);
6136     }
6137    
6138     redo A;
6139     } else {
6140     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6141     line => $self->{line_prev},
6142     column => $self->{column_prev} - 1
6143     - (length $self->{kwd})
6144     + 1 * ($self->{nc} == -1));
6145     $self->{state} = BOGUS_COMMENT_STATE;
6146     ## Reconsume.
6147     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6148     redo A;
6149     }
6150     } elsif ($self->{state} == MD_ATTLIST_STATE) {
6151 wakaba 1.17 if ($self->{nc} == [
6152     undef,
6153     0x0054, # T
6154     0x0054, # T
6155     0x004C, # L
6156     0x0049, # I
6157     0x0053, # S
6158     ]->[length $self->{kwd}] or
6159     $self->{nc} == [
6160     undef,
6161     0x0074, # t
6162     0x0074, # t
6163     0x006C, # l
6164     0x0069, # i
6165     0x0073, # s
6166     ]->[length $self->{kwd}]) {
6167 wakaba 1.14 ## Stay in the state.
6168     $self->{kwd} .= chr $self->{nc};
6169    
6170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6171     $self->{line_prev} = $self->{line};
6172     $self->{column_prev} = $self->{column};
6173     $self->{column}++;
6174     $self->{nc}
6175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6176     } else {
6177     $self->{set_nc}->($self);
6178     }
6179    
6180     redo A;
6181 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6182     ($self->{nc} == 0x0054 or # T
6183     $self->{nc} == 0x0074)) { # t
6184     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6185     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6186     text => 'ATTLIST',
6187     line => $self->{line_prev},
6188     column => $self->{column_prev} - 5);
6189     }
6190 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6191 wakaba 1.15 attrdefs => [],
6192 wakaba 1.14 line => $self->{line_prev},
6193 wakaba 1.23 column => $self->{column_prev} - 7};
6194 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6195    
6196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6197     $self->{line_prev} = $self->{line};
6198     $self->{column_prev} = $self->{column};
6199     $self->{column}++;
6200     $self->{nc}
6201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6202     } else {
6203     $self->{set_nc}->($self);
6204     }
6205    
6206     redo A;
6207     } else {
6208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6209     line => $self->{line_prev},
6210     column => $self->{column_prev} - 1
6211     - (length $self->{kwd})
6212     + 1 * ($self->{nc} == -1));
6213     $self->{state} = BOGUS_COMMENT_STATE;
6214     ## Reconsume.
6215     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6216     redo A;
6217     }
6218     } elsif ($self->{state} == MD_NOTATION_STATE) {
6219 wakaba 1.17 if ($self->{nc} == [
6220     undef,
6221     0x004F, # O
6222     0x0054, # T
6223     0x0041, # A
6224     0x0054, # T
6225     0x0049, # I
6226     0x004F, # O
6227     ]->[length $self->{kwd}] or
6228     $self->{nc} == [
6229     undef,
6230     0x006F, # o
6231     0x0074, # t
6232     0x0061, # a
6233     0x0074, # t
6234     0x0069, # i
6235     0x006F, # o
6236     ]->[length $self->{kwd}]) {
6237 wakaba 1.14 ## Stay in the state.
6238     $self->{kwd} .= chr $self->{nc};
6239    
6240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6241     $self->{line_prev} = $self->{line};
6242     $self->{column_prev} = $self->{column};
6243     $self->{column}++;
6244     $self->{nc}
6245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6246     } else {
6247     $self->{set_nc}->($self);
6248     }
6249    
6250     redo A;
6251 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6252     ($self->{nc} == 0x004E or # N
6253     $self->{nc} == 0x006E)) { # n
6254     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6256     text => 'NOTATION',
6257     line => $self->{line_prev},
6258     column => $self->{column_prev} - 6);
6259     }
6260 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6261     line => $self->{line_prev},
6262 wakaba 1.23 column => $self->{column_prev} - 8};
6263 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6264    
6265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6266     $self->{line_prev} = $self->{line};
6267     $self->{column_prev} = $self->{column};
6268     $self->{column}++;
6269     $self->{nc}
6270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6271     } else {
6272     $self->{set_nc}->($self);
6273     }
6274    
6275     redo A;
6276     } else {
6277     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6278     line => $self->{line_prev},
6279     column => $self->{column_prev} - 1
6280     - (length $self->{kwd})
6281     + 1 * ($self->{nc} == -1));
6282     $self->{state} = BOGUS_COMMENT_STATE;
6283     ## Reconsume.
6284     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6285     redo A;
6286     }
6287     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6288     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6289     ## "DOCTYPE NOTATION state".
6290    
6291     if ($is_space->{$self->{nc}}) {
6292     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6293     $self->{state} = BEFORE_MD_NAME_STATE;
6294    
6295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6296     $self->{line_prev} = $self->{line};
6297     $self->{column_prev} = $self->{column};
6298     $self->{column}++;
6299     $self->{nc}
6300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6301     } else {
6302     $self->{set_nc}->($self);
6303     }
6304    
6305     redo A;
6306     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6307     $self->{nc} == 0x0025) { # %
6308     ## XML5: Switch to the "DOCTYPE bogus comment state".
6309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6310     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6311    
6312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6313     $self->{line_prev} = $self->{line};
6314     $self->{column_prev} = $self->{column};
6315     $self->{column}++;
6316     $self->{nc}
6317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6318     } else {
6319     $self->{set_nc}->($self);
6320     }
6321    
6322     redo A;
6323     } elsif ($self->{nc} == -1) {
6324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6325     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6326     ## Reconsume.
6327     redo A;
6328     } elsif ($self->{nc} == 0x003E) { # >
6329     ## XML5: Switch to the "DOCTYPE bogus comment state".
6330     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6332    
6333     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6334     $self->{line_prev} = $self->{line};
6335     $self->{column_prev} = $self->{column};
6336     $self->{column}++;
6337     $self->{nc}
6338     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6339     } else {
6340     $self->{set_nc}->($self);
6341     }
6342    
6343     redo A;
6344     } else {
6345     ## XML5: Switch to the "DOCTYPE bogus comment state".
6346     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6347     $self->{state} = BEFORE_MD_NAME_STATE;
6348     redo A;
6349     }
6350     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
         ## Handler for BEFORE_MD_NAME_STATE: skips white space and then
         ## starts collecting a declaration name in |$self->{ct}->{name}|.
         ## |$self->{nc}| is the code point under examination; -1 is
         ## presumably the end-of-input marker (it is reported as
         ## 'unclosed md').  Every branch finishes with |redo A|, which
         ## re-runs the block labelled |A| (defined outside this excerpt)
         ## with the updated |$self->{state}|.
6351     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6352     ## before state", "DOCTYPE ATTLIST name before state".
6353    
6354     if ($is_space->{$self->{nc}}) {
6355     ## Stay in the state.
6356    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.  The same
         ## idiom is repeated in the branches below.
6357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6358     $self->{line_prev} = $self->{line};
6359     $self->{column_prev} = $self->{column};
6360     $self->{column}++;
6361     $self->{nc}
6362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6363     } else {
6364     $self->{set_nc}->($self);
6365     }
6366    
6367     redo A;
6368     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6369     $self->{nc} == 0x0025) { # %
         ## "%" is only special while the current token is a general
         ## entity token; DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE then
         ## decides whether the token becomes a parameter entity.
6370     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6371    
6372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6373     $self->{line_prev} = $self->{line};
6374     $self->{column_prev} = $self->{column};
6375     $self->{column}++;
6376     $self->{nc}
6377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6378     } else {
6379     $self->{set_nc}->($self);
6380     }
6381    
6382     redo A;
6383     } elsif ($self->{nc} == 0x003E) { # >
         ## ">" before any name character: report the error and go back
         ## to the internal subset state.  No token is returned here.
6384     ## XML5: Same as "Anything else".
6385     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6386     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6387    
6388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6389     $self->{line_prev} = $self->{line};
6390     $self->{column_prev} = $self->{column};
6391     $self->{column}++;
6392     $self->{nc}
6393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6394     } else {
6395     $self->{set_nc}->($self);
6396     }
6397    
6398     redo A;
6399     } elsif ($self->{nc} == -1) {
         ## |$self->{nc}| is left untouched, so the internal subset
         ## state examines the same -1 again.
6400     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6401     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6402     ## Reconsume.
6403     redo A;
6404     } else {
         ## Any other character is taken as the first character of the
         ## name; MD_NAME_STATE collects the rest.
6405     ## XML5: [ATTLIST] Not defined yet.
6406     $self->{ct}->{name} .= chr $self->{nc};
6407     $self->{state} = MD_NAME_STATE;
6408    
6409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6410     $self->{line_prev} = $self->{line};
6411     $self->{column_prev} = $self->{column};
6412     $self->{column}++;
6413     $self->{nc}
6414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6415     } else {
6416     $self->{set_nc}->($self);
6417     }
6418    
6419     redo A;
6420     }
6421     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
         ## Handler for DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE, which is
         ## entered after a "%" has been consumed for a general entity
         ## token.  White space confirms a parameter entity; any other
         ## ordinary character abandons the declaration as a bogus
         ## comment.  |$self->{nc}| is the code point under examination
         ## (-1 presumably marks end of input) and |redo A| re-runs the
         ## block labelled |A|, which is outside this excerpt.
6422     if ($is_space->{$self->{nc}}) {
6423     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6424     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6425     $self->{state} = BEFORE_MD_NAME_STATE;
6426 wakaba 1.8
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6427 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6428     $self->{line_prev} = $self->{line};
6429     $self->{column_prev} = $self->{column};
6430     $self->{column}++;
6431     $self->{nc}
6432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6433     } else {
6434     $self->{set_nc}->($self);
6435     }
6436    
6437     redo A;
6438     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends before any name was seen.  The error is
         ## reported and no token is returned.
6439     ## XML5: Same as "Anything else".
6440     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6441     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6442    
6443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6444     $self->{line_prev} = $self->{line};
6445     $self->{column_prev} = $self->{column};
6446     $self->{column}++;
6447     $self->{nc}
6448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6449     } else {
6450     $self->{set_nc}->($self);
6451     }
6452    
6453     redo A;
6454     } elsif ($self->{nc} == -1) {
         ## |$self->{nc}| is not advanced, so the internal subset state
         ## sees the same -1.
6455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6456     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6457     ## Reconsume.
6458     redo A;
6459     } else {
         ## "%" was followed directly by some other character.  The
         ## current token is replaced by an empty comment token and the
         ## character is handed to BOGUS_COMMENT_STATE unchanged.
6460     ## XML5: No parse error.
6461     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6462     $self->{state} = BOGUS_COMMENT_STATE;
6463     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6464     ## Reconsume.
6465     redo A;
6466     }
6467     } elsif ($self->{state} == MD_NAME_STATE) {
         ## Handler for MD_NAME_STATE: appends characters to the
         ## declaration name in |$self->{ct}->{name}| until white space,
         ## ">" or -1 (presumably end of input) is seen.  |$self->{nc}| is
         ## the code point under examination; |redo A| re-runs the block
         ## labelled |A|, which is outside this excerpt.
6468     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6469    
6470     if ($is_space->{$self->{nc}}) {
         ## The name is complete; the follow-up state depends on the
         ## kind of declaration token being built.
6471 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6472     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6473     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6474 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6475 wakaba 1.16 } else { # ENTITY/NOTATION
6476     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6477     }
6478 wakaba 1.14
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6480     $self->{line_prev} = $self->{line};
6481     $self->{column_prev} = $self->{column};
6482     $self->{column}++;
6483     $self->{nc}
6484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6485     } else {
6486     $self->{set_nc}->($self);
6487     }
6488    
6489     redo A;
6490     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration closes right after its name.  Only an
         ## ATTLIST token may end here without an error report.
6491     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6492     #
6493     } else {
6494 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6495 wakaba 1.14 }
6496     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6497    
6498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6499     $self->{line_prev} = $self->{line};
6500     $self->{column_prev} = $self->{column};
6501     $self->{column}++;
6502     $self->{nc}
6503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6504     } else {
6505     $self->{set_nc}->($self);
6506     }
6507    
         ## The token is handed to the caller here; the |redo A| that
         ## follows the |return| is never reached.
6508     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6509     redo A;
6510     } elsif ($self->{nc} == -1) {
         ## The token collected so far is still returned.  |$self->{nc}|
         ## is not advanced, so the next call sees the same -1 in the
         ## internal subset state.
6511     ## XML5: [ATTLIST] No parse error.
6512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6513     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6514     ## Reconsume.
6515     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6516     redo A;
6517     } else {
6518     ## XML5: [ATTLIST] Not defined yet.
6519     $self->{ct}->{name} .= chr $self->{nc};
6520     ## Stay in the state.
6521    
6522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523     $self->{line_prev} = $self->{line};
6524     $self->{column_prev} = $self->{column};
6525     $self->{column}++;
6526     $self->{nc}
6527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528     } else {
6529     $self->{set_nc}->($self);
6530     }
6531    
6532     redo A;
6533     }
6534     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
         ## Handler for DOCTYPE_ATTLIST_NAME_AFTER_STATE (entered from
         ## MD_NAME_STATE for ATTLIST tokens): skips white space and then
         ## either ends the declaration or opens a new attribute
         ## definition in |$self->{ca}|.  |$self->{nc}| is the code point
         ## under examination (-1 presumably marks end of input);
         ## |redo A| re-runs the block labelled |A|, which is outside this
         ## excerpt.
6535     if ($is_space->{$self->{nc}}) {
6536     ## Stay in the state.
6537    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6538     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539     $self->{line_prev} = $self->{line};
6540     $self->{column_prev} = $self->{column};
6541     $self->{column}++;
6542     $self->{nc}
6543     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544     } else {
6545     $self->{set_nc}->($self);
6546     }
6547    
6548     redo A;
6549     } elsif ($self->{nc} == 0x003E) { # >
         ## End of the declaration.  No error is reported here.
6550     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6551    
6552     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6553     $self->{line_prev} = $self->{line};
6554     $self->{column_prev} = $self->{column};
6555     $self->{column}++;
6556     $self->{nc}
6557     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6558     } else {
6559     $self->{set_nc}->($self);
6560     }
6561    
         ## The |redo A| after the |return| is never reached.
6562     return ($self->{ct}); # ATTLIST
6563     redo A;
6564     } elsif ($self->{nc} == -1) {
         ## The token is returned without reading further input, so
         ## |$self->{nc}| stays -1 for the next call.
6565     ## XML5: No parse error.
6566     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6567     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6568 wakaba 1.15 return ($self->{ct});
6569 wakaba 1.14 redo A;
6570     } else {
         ## Start a fresh attribute definition: the first character of
         ## its name, an empty list of allowed tokens, and the line and
         ## column current at this point.
6571     ## XML5: Not defined yet.
6572 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6573     tokens => [],
6574     line => $self->{line}, column => $self->{column}};
6575     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6576    
6577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6578     $self->{line_prev} = $self->{line};
6579     $self->{column_prev} = $self->{column};
6580     $self->{column}++;
6581     $self->{nc}
6582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6583     } else {
6584     $self->{set_nc}->($self);
6585     }
6586    
6587     redo A;
6588     }
6589     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
         ## Handler for DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE: appends
         ## characters to the attribute definition name in
         ## |$self->{ca}->{name}| until white space, ">", "(" or -1
         ## (presumably end of input) is seen.  |$self->{nc}| is the code
         ## point under examination; |redo A| re-runs the block labelled
         ## |A|, which is outside this excerpt.
6590     if ($is_space->{$self->{nc}}) {
6591     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6592    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6594     $self->{line_prev} = $self->{line};
6595     $self->{column_prev} = $self->{column};
6596     $self->{column}++;
6597     $self->{nc}
6598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6599     } else {
6600     $self->{set_nc}->($self);
6601     }
6602    
6603     redo A;
6604     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends while the definition has only a name.
         ## NOTE(review): nothing in this branch attaches |$self->{ca}|
         ## to the returned token -- confirm that dropping the
         ## half-built definition is intended.
6605     ## XML5: Same as "anything else".
6606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6607     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6608    
6609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6610     $self->{line_prev} = $self->{line};
6611     $self->{column_prev} = $self->{column};
6612     $self->{column}++;
6613     $self->{nc}
6614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6615     } else {
6616     $self->{set_nc}->($self);
6617     }
6618    
         ## The |redo A| after the |return| is never reached.
6619     return ($self->{ct}); # ATTLIST
6620     redo A;
6621     } elsif ($self->{nc} == 0x0028) { # (
         ## "(" immediately after the name: report the missing white
         ## space, then start reading the allowed-token list.
6622     ## XML5: Same as "anything else".
6623     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6624     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6625    
6626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6627     $self->{line_prev} = $self->{line};
6628     $self->{column_prev} = $self->{column};
6629     $self->{column}++;
6630     $self->{nc}
6631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6632     } else {
6633     $self->{set_nc}->($self);
6634     }
6635    
6636     redo A;
6637     } elsif ($self->{nc} == -1) {
         ## NOTE(review): unlike the -1 branch of
         ## DOCTYPE_ATTLIST_NAME_AFTER_STATE, this branch requests
         ## another character after -1 was seen before returning the
         ## token -- confirm this is intended.
6638     ## XML5: No parse error.
6639     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6640     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6641    
6642     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6643     $self->{line_prev} = $self->{line};
6644     $self->{column_prev} = $self->{column};
6645     $self->{column}++;
6646     $self->{nc}
6647     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6648     } else {
6649     $self->{set_nc}->($self);
6650     }
6651    
6652     return ($self->{ct}); # ATTLIST
6653     redo A;
6654     } else {
6655     ## XML5: Not defined yet.
6656     $self->{ca}->{name} .= chr $self->{nc};
6657     ## Stay in the state.
6658    
6659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6660     $self->{line_prev} = $self->{line};
6661     $self->{column_prev} = $self->{column};
6662     $self->{column}++;
6663     $self->{nc}
6664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6665     } else {
6666     $self->{set_nc}->($self);
6667     }
6668    
6669 wakaba 1.14 redo A;
6670     }
6671 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
         ## Handler for DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE: skips
         ## white space after an attribute definition name, then starts
         ## either the allowed-token list ("(") or the attribute type in
         ## |$self->{ca}->{type}|.  |$self->{nc}| is the code point under
         ## examination (-1 presumably marks end of input); |redo A|
         ## re-runs the block labelled |A|, which is outside this excerpt.
6672     if ($is_space->{$self->{nc}}) {
6673     ## Stay in the state.
6674    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6675     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6676     $self->{line_prev} = $self->{line};
6677     $self->{column_prev} = $self->{column};
6678     $self->{column}++;
6679     $self->{nc}
6680     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6681     } else {
6682     $self->{set_nc}->($self);
6683     }
6684    
6685     redo A;
6686     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends before any attribute type was given.
6687     ## XML5: Same as "anything else".
6688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6689     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6690    
6691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6692     $self->{line_prev} = $self->{line};
6693     $self->{column_prev} = $self->{column};
6694     $self->{column}++;
6695     $self->{nc}
6696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6697     } else {
6698     $self->{set_nc}->($self);
6699     }
6700    
         ## The |redo A| after the |return| is never reached.
6701     return ($self->{ct}); # ATTLIST
6702     redo A;
6703     } elsif ($self->{nc} == 0x0028) { # (
         ## Start of an allowed-token list.  White space preceded it, so
         ## no error is reported in this state.
6704     ## XML5: Same as "anything else".
6705     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6706    
6707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6708     $self->{line_prev} = $self->{line};
6709     $self->{column_prev} = $self->{column};
6710     $self->{column}++;
6711     $self->{nc}
6712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6713     } else {
6714     $self->{set_nc}->($self);
6715     }
6716    
6717     redo A;
6718     } elsif ($self->{nc} == -1) {
         ## NOTE(review): another character is requested even though -1
         ## was just seen -- confirm this is intended.
6719     ## XML5: No parse error.
6720     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6721     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6722    
6723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6724     $self->{line_prev} = $self->{line};
6725     $self->{column_prev} = $self->{column};
6726     $self->{column}++;
6727     $self->{nc}
6728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6729     } else {
6730     $self->{set_nc}->($self);
6731     }
6732    
6733     return ($self->{ct});
6734     redo A;
6735     } else {
         ## First character of the attribute type.  This assigns rather
         ## than appends, so |type| starts fresh for this definition.
6736     ## XML5: Not defined yet.
6737     $self->{ca}->{type} = chr $self->{nc};
6738     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6739    
6740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6741     $self->{line_prev} = $self->{line};
6742     $self->{column_prev} = $self->{column};
6743     $self->{column}++;
6744     $self->{nc}
6745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6746     } else {
6747     $self->{set_nc}->($self);
6748     }
6749    
6750     redo A;
6751     }
6752     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
         ## Handler for DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE: appends
         ## characters to |$self->{ca}->{type}| until a delimiter is seen.
         ## The "#", quote and "(" branches go to the same states as in
         ## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE but first report
         ## that the separating white space is missing.  |$self->{nc}| is
         ## the code point under examination (-1 presumably marks end of
         ## input); |redo A| re-runs the block labelled |A|, which is
         ## outside this excerpt.
6753     if ($is_space->{$self->{nc}}) {
6754     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6755    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6757     $self->{line_prev} = $self->{line};
6758     $self->{column_prev} = $self->{column};
6759     $self->{column}++;
6760     $self->{nc}
6761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6762     } else {
6763     $self->{set_nc}->($self);
6764     }
6765    
6766     redo A;
6767     } elsif ($self->{nc} == 0x0023) { # #
6768     ## XML5: Same as "anything else".
6769     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6770     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6771    
6772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6773     $self->{line_prev} = $self->{line};
6774     $self->{column_prev} = $self->{column};
6775     $self->{column}++;
6776     $self->{nc}
6777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6778     } else {
6779     $self->{set_nc}->($self);
6780     }
6781    
6782     redo A;
6783     } elsif ($self->{nc} == 0x0022) { # "
         ## A quoted value starts directly after the type.  |value| is
         ## reset to the empty string before the attribute value state
         ## takes over.
6784     ## XML5: Same as "anything else".
6785     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6786     $self->{ca}->{value} = '';
6787     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6788    
6789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790     $self->{line_prev} = $self->{line};
6791     $self->{column_prev} = $self->{column};
6792     $self->{column}++;
6793     $self->{nc}
6794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795     } else {
6796     $self->{set_nc}->($self);
6797     }
6798    
6799     redo A;
6800     } elsif ($self->{nc} == 0x0027) { # '
6801     ## XML5: Same as "anything else".
6802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6803     $self->{ca}->{value} = '';
6804     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6805    
6806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6807     $self->{line_prev} = $self->{line};
6808     $self->{column_prev} = $self->{column};
6809     $self->{column}++;
6810     $self->{nc}
6811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6812     } else {
6813     $self->{set_nc}->($self);
6814     }
6815    
6816     redo A;
6817     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends after the type with no default given.
6818     ## XML5: Same as "anything else".
6819     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6820     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6821    
6822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6823     $self->{line_prev} = $self->{line};
6824     $self->{column_prev} = $self->{column};
6825     $self->{column}++;
6826     $self->{nc}
6827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6828     } else {
6829     $self->{set_nc}->($self);
6830     }
6831    
         ## The |redo A| after the |return| is never reached.
6832     return ($self->{ct}); # ATTLIST
6833     redo A;
6834     } elsif ($self->{nc} == 0x0028) { # (
6835     ## XML5: Same as "anything else".
6836     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6837     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6838    
6839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6840     $self->{line_prev} = $self->{line};
6841     $self->{column_prev} = $self->{column};
6842     $self->{column}++;
6843     $self->{nc}
6844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6845     } else {
6846     $self->{set_nc}->($self);
6847     }
6848    
6849     redo A;
6850     } elsif ($self->{nc} == -1) {
         ## NOTE(review): another character is requested even though -1
         ## was just seen -- confirm this is intended.
6851     ## XML5: No parse error.
6852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6853     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6854    
6855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6856     $self->{line_prev} = $self->{line};
6857     $self->{column_prev} = $self->{column};
6858     $self->{column}++;
6859     $self->{nc}
6860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6861     } else {
6862     $self->{set_nc}->($self);
6863     }
6864    
6865     return ($self->{ct});
6866     redo A;
6867     } else {
6868     ## XML5: Not defined yet.
6869     $self->{ca}->{type} .= chr $self->{nc};
6870     ## Stay in the state.
6871    
6872     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6873     $self->{line_prev} = $self->{line};
6874     $self->{column_prev} = $self->{column};
6875     $self->{column}++;
6876     $self->{nc}
6877     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6878     } else {
6879     $self->{set_nc}->($self);
6880     }
6881    
6882     redo A;
6883     }
6884     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
         ## Handler for DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE: skips
         ## white space after the attribute type and dispatches on what
         ## follows: "(" opens an allowed-token list, "#" leads to the
         ## declaration-before state, and a quote or any other character
         ## starts a value in |$self->{ca}->{value}|.  |$self->{nc}| is
         ## the code point under examination (-1 presumably marks end of
         ## input); |redo A| re-runs the block labelled |A|, which is
         ## outside this excerpt.
6885     if ($is_space->{$self->{nc}}) {
6886     ## Stay in the state.
6887    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
6888     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6889     $self->{line_prev} = $self->{line};
6890     $self->{column_prev} = $self->{column};
6891     $self->{column}++;
6892     $self->{nc}
6893     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6894     } else {
6895     $self->{set_nc}->($self);
6896     }
6897    
6898     redo A;
6899     } elsif ($self->{nc} == 0x0028) { # (
6900     ## XML5: Same as "anything else".
6901     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6902    
6903     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6904     $self->{line_prev} = $self->{line};
6905     $self->{column_prev} = $self->{column};
6906     $self->{column}++;
6907     $self->{nc}
6908     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6909     } else {
6910     $self->{set_nc}->($self);
6911     }
6912    
6913     redo A;
6914     } elsif ($self->{nc} == 0x0023) { # #
6915     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6916    
6917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6918     $self->{line_prev} = $self->{line};
6919     $self->{column_prev} = $self->{column};
6920     $self->{column}++;
6921     $self->{nc}
6922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6923     } else {
6924     $self->{set_nc}->($self);
6925     }
6926    
6927     redo A;
6928     } elsif ($self->{nc} == 0x0022) { # "
         ## |value| is reset to the empty string before the quoted
         ## attribute value state starts filling it.
6929     ## XML5: Same as "anything else".
6930     $self->{ca}->{value} = '';
6931     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6932    
6933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6934     $self->{line_prev} = $self->{line};
6935     $self->{column_prev} = $self->{column};
6936     $self->{column}++;
6937     $self->{nc}
6938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6939     } else {
6940     $self->{set_nc}->($self);
6941     }
6942    
6943     redo A;
6944     } elsif ($self->{nc} == 0x0027) { # '
6945     ## XML5: Same as "anything else".
6946     $self->{ca}->{value} = '';
6947     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6948    
6949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6950     $self->{line_prev} = $self->{line};
6951     $self->{column_prev} = $self->{column};
6952     $self->{column}++;
6953     $self->{nc}
6954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6955     } else {
6956     $self->{set_nc}->($self);
6957     }
6958    
6959     redo A;
6960     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends after the type with no default given.
6961     ## XML5: Same as "anything else".
6962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6963     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6964    
6965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966     $self->{line_prev} = $self->{line};
6967     $self->{column_prev} = $self->{column};
6968     $self->{column}++;
6969     $self->{nc}
6970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971     } else {
6972     $self->{set_nc}->($self);
6973     }
6974    
         ## The |redo A| after the |return| is never reached.
6975     return ($self->{ct}); # ATTLIST
6976     redo A;
6977     } elsif ($self->{nc} == -1) {
         ## NOTE(review): another character is requested even though -1
         ## was just seen -- confirm this is intended.
6978     ## XML5: No parse error.
6979     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6981    
6982     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6983     $self->{line_prev} = $self->{line};
6984     $self->{column_prev} = $self->{column};
6985     $self->{column}++;
6986     $self->{nc}
6987     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6988     } else {
6989     $self->{set_nc}->($self);
6990     }
6991    
6992     return ($self->{ct});
6993     redo A;
6994     } else {
         ## Any other character is treated as the start of an unquoted
         ## value.  It is not consumed here; the unquoted attribute
         ## value state examines it again.
6995     ## XML5: Switch to the "DOCTYPE bogus comment state".
6996     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6997     $self->{ca}->{value} = '';
6998     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6999     ## Reconsume.
7000     redo A;
7001     }
7002     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
         ## Handler for BEFORE_ALLOWED_TOKEN_STATE, entered after "(" or
         ## "|" in an allowed-token list: skips white space and starts
         ## the next entry of |$self->{ca}->{tokens}|.  A "|" or ")" seen
         ## here means the entry is empty and is reported as an error.
         ## |$self->{nc}| is the code point under examination (-1
         ## presumably marks end of input); |redo A| re-runs the block
         ## labelled |A|, which is outside this excerpt.
7003     if ($is_space->{$self->{nc}}) {
7004     ## Stay in the state.
7005    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
7006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7007     $self->{line_prev} = $self->{line};
7008     $self->{column_prev} = $self->{column};
7009     $self->{column}++;
7010     $self->{nc}
7011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7012     } else {
7013     $self->{set_nc}->($self);
7014     }
7015    
7016     redo A;
7017     } elsif ($self->{nc} == 0x007C) { # |
7018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7019     ## Stay in the state.
7020    
7021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7022     $self->{line_prev} = $self->{line};
7023     $self->{column_prev} = $self->{column};
7024     $self->{column}++;
7025     $self->{nc}
7026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7027     } else {
7028     $self->{set_nc}->($self);
7029     }
7030    
7031     redo A;
7032     } elsif ($self->{nc} == 0x0029) { # )
7033     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7034     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7035    
7036     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7037     $self->{line_prev} = $self->{line};
7038     $self->{column_prev} = $self->{column};
7039     $self->{column}++;
7040     $self->{nc}
7041     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7042     } else {
7043     $self->{set_nc}->($self);
7044     }
7045    
7046     redo A;
7047     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends while the list is still open.
7048     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7049     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7050    
7051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7052     $self->{line_prev} = $self->{line};
7053     $self->{column_prev} = $self->{column};
7054     $self->{column}++;
7055     $self->{nc}
7056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7057     } else {
7058     $self->{set_nc}->($self);
7059     }
7060    
         ## The |redo A| after the |return| is never reached.
7061     return ($self->{ct}); # ATTLIST
7062     redo A;
7063     } elsif ($self->{nc} == -1) {
         ## NOTE(review): another character is requested even though -1
         ## was just seen -- confirm this is intended.
7064     ## XML5: No parse error.
7065     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7066     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7067    
7068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069     $self->{line_prev} = $self->{line};
7070     $self->{column_prev} = $self->{column};
7071     $self->{column}++;
7072     $self->{nc}
7073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074     } else {
7075     $self->{set_nc}->($self);
7076     }
7077    
7078     return ($self->{ct});
7079     redo A;
7080     } else {
         ## First character of a new allowed token; ALLOWED_TOKEN_STATE
         ## appends the remaining characters to this list entry.
7081     push @{$self->{ca}->{tokens}}, chr $self->{nc};
7082     $self->{state} = ALLOWED_TOKEN_STATE;
7083    
7084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085     $self->{line_prev} = $self->{line};
7086     $self->{column_prev} = $self->{column};
7087     $self->{column}++;
7088     $self->{nc}
7089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090     } else {
7091     $self->{set_nc}->($self);
7092     }
7093    
7094     redo A;
7095     }
7096     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
         ## Handler for ALLOWED_TOKEN_STATE: appends characters to the
         ## last entry of |$self->{ca}->{tokens}| until white space, "|",
         ## ")", ">" or -1 (presumably end of input) is seen.
         ## |$self->{nc}| is the code point under examination; |redo A|
         ## re-runs the block labelled |A|, which is outside this excerpt.
7097     if ($is_space->{$self->{nc}}) {
7098     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7099    
         ## Consume the next input character: read it from |char_buffer|
         ## while unread characters remain (saving the old position in
         ## |line_prev|/|column_prev| and incrementing |column|);
         ## otherwise let the |set_nc| callback provide it.
7100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101     $self->{line_prev} = $self->{line};
7102     $self->{column_prev} = $self->{column};
7103     $self->{column}++;
7104     $self->{nc}
7105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106     } else {
7107     $self->{set_nc}->($self);
7108     }
7109    
7110     redo A;
7111     } elsif ($self->{nc} == 0x007C) { # |
         ## Separator: another token is expected next.
7112     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7113    
7114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115     $self->{line_prev} = $self->{line};
7116     $self->{column_prev} = $self->{column};
7117     $self->{column}++;
7118     $self->{nc}
7119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120     } else {
7121     $self->{set_nc}->($self);
7122     }
7123    
7124     redo A;
7125     } elsif ($self->{nc} == 0x0029) { # )
         ## End of the allowed-token list.
7126     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7127    
7128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7129     $self->{line_prev} = $self->{line};
7130     $self->{column_prev} = $self->{column};
7131     $self->{column}++;
7132     $self->{nc}
7133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7134     } else {
7135     $self->{set_nc}->($self);
7136     }
7137    
7138     redo A;
7139     } elsif ($self->{nc} == 0x003E) { # >
         ## The declaration ends while the list is still open.
7140     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7142    
7143     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7144     $self->{line_prev} = $self->{line};
7145     $self->{column_prev} = $self->{column};
7146     $self->{column}++;
7147     $self->{nc}
7148     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7149     } else {
7150     $self->{set_nc}->($self);
7151     }
7152    
         ## The |redo A| after the |return| is never reached.
7153     return ($self->{ct}); # ATTLIST
7154     redo A;
7155     } elsif ($self->{nc} == -1) {
         ## NOTE(review): another character is requested even though -1
         ## was just seen -- confirm this is intended.
7156     ## XML5: No parse error.
7157     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7158     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7159    
7160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7161     $self->{line_prev} = $self->{line};
7162     $self->{column_prev} = $self->{column};
7163     $self->{column}++;
7164     $self->{nc}
7165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7166     } else {
7167     $self->{set_nc}->($self);
7168     }
7169    
7170     return ($self->{ct});
7171     redo A;
7172     } else {
         ## Extend the token most recently pushed onto the list.
7173     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7174     ## Stay in the state.
7175    
7176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177     $self->{line_prev} = $self->{line};
7178     $self->{column_prev} = $self->{column};
7179     $self->{column}++;
7180     $self->{nc}
7181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182     } else {
7183     $self->{set_nc}->($self);
7184     }
7185    
7186     redo A;
7187     }
7188     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7189     if ($is_space->{$self->{nc}}) {
7190     ## Stay in the state.
7191    
7192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193     $self->{line_prev} = $self->{line};
7194     $self->{column_prev} = $self->{column};
7195     $self->{column}++;
7196     $self->{nc}
7197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198     } else {
7199     $self->{set_nc}->($self);
7200     }
7201    
7202     redo A;
7203     } elsif ($self->{nc} == 0x007C) { # |
7204     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7205    
7206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7207     $self->{line_prev} = $self->{line};
7208     $self->{column_prev} = $self->{column};
7209     $self->{column}++;
7210     $self->{nc}
7211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7212     } else {
7213     $self->{set_nc}->($self);
7214     }
7215    
7216     redo A;
7217     } elsif ($self->{nc} == 0x0029) { # )
7218     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7219    
7220     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7221     $self->{line_prev} = $self->{line};
7222     $self->{column_prev} = $self->{column};
7223     $self->{column}++;
7224     $self->{nc}
7225     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7226     } else {
7227     $self->{set_nc}->($self);
7228     }
7229    
7230     redo A;
7231     } elsif ($self->{nc} == 0x003E) { # >
7232     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7233     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7234    
7235     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7236     $self->{line_prev} = $self->{line};
7237     $self->{column_prev} = $self->{column};
7238     $self->{column}++;
7239     $self->{nc}
7240     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7241     } else {
7242     $self->{set_nc}->($self);
7243     }
7244    
7245     return ($self->{ct}); # ATTLIST
7246     redo A;
7247     } elsif ($self->{nc} == -1) {
7248     ## XML5: No parse error.
7249     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7250     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7251    
7252     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7253     $self->{line_prev} = $self->{line};
7254     $self->{column_prev} = $self->{column};
7255     $self->{column}++;
7256     $self->{nc}
7257     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7258     } else {
7259     $self->{set_nc}->($self);
7260     }
7261    
7262     return ($self->{ct});
7263     redo A;
7264     } else {
7265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7266     line => $self->{line_prev},
7267     column => $self->{column_prev});
7268     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7269     $self->{state} = ALLOWED_TOKEN_STATE;
7270    
7271     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7272     $self->{line_prev} = $self->{line};
7273     $self->{column_prev} = $self->{column};
7274     $self->{column}++;
7275     $self->{nc}
7276     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7277     } else {
7278     $self->{set_nc}->($self);
7279     }
7280    
7281     redo A;
7282     }
7283     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7284     if ($is_space->{$self->{nc}}) {
7285     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7286    
7287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7288     $self->{line_prev} = $self->{line};
7289     $self->{column_prev} = $self->{column};
7290     $self->{column}++;
7291     $self->{nc}
7292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7293     } else {
7294     $self->{set_nc}->($self);
7295     }
7296    
7297     redo A;
7298     } elsif ($self->{nc} == 0x0023) { # #
7299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7300     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7301    
7302     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7303     $self->{line_prev} = $self->{line};
7304     $self->{column_prev} = $self->{column};
7305     $self->{column}++;
7306     $self->{nc}
7307     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7308     } else {
7309     $self->{set_nc}->($self);
7310     }
7311    
7312     redo A;
7313     } elsif ($self->{nc} == 0x0022) { # "
7314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7315     $self->{ca}->{value} = '';
7316     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7317    
7318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7319     $self->{line_prev} = $self->{line};
7320     $self->{column_prev} = $self->{column};
7321     $self->{column}++;
7322     $self->{nc}
7323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7324     } else {
7325     $self->{set_nc}->($self);
7326     }
7327    
7328     redo A;
7329     } elsif ($self->{nc} == 0x0027) { # '
7330     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7331     $self->{ca}->{value} = '';
7332     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7333    
7334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7335     $self->{line_prev} = $self->{line};
7336     $self->{column_prev} = $self->{column};
7337     $self->{column}++;
7338     $self->{nc}
7339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7340     } else {
7341     $self->{set_nc}->($self);
7342     }
7343    
7344     redo A;
7345     } elsif ($self->{nc} == 0x003E) { # >
7346     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7348    
7349     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7350     $self->{line_prev} = $self->{line};
7351     $self->{column_prev} = $self->{column};
7352     $self->{column}++;
7353     $self->{nc}
7354     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7355     } else {
7356     $self->{set_nc}->($self);
7357     }
7358    
7359     return ($self->{ct}); # ATTLIST
7360     redo A;
7361     } elsif ($self->{nc} == -1) {
7362     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7364    
7365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7366     $self->{line_prev} = $self->{line};
7367     $self->{column_prev} = $self->{column};
7368     $self->{column}++;
7369     $self->{nc}
7370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7371     } else {
7372     $self->{set_nc}->($self);
7373     }
7374    
7375     return ($self->{ct});
7376     redo A;
7377     } else {
7378     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7379     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7380     ## Reconsume.
7381     redo A;
7382     }
7383     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7384     if ($is_space->{$self->{nc}}) {
7385     ## Stay in the state.
7386    
7387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7388     $self->{line_prev} = $self->{line};
7389     $self->{column_prev} = $self->{column};
7390     $self->{column}++;
7391     $self->{nc}
7392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7393     } else {
7394     $self->{set_nc}->($self);
7395     }
7396    
7397     redo A;
7398     } elsif ($self->{nc} == 0x0023) { # #
7399     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7400    
7401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402     $self->{line_prev} = $self->{line};
7403     $self->{column_prev} = $self->{column};
7404     $self->{column}++;
7405     $self->{nc}
7406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407     } else {
7408     $self->{set_nc}->($self);
7409     }
7410    
7411     redo A;
7412     } elsif ($self->{nc} == 0x0022) { # "
7413     $self->{ca}->{value} = '';
7414     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7415    
7416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7417     $self->{line_prev} = $self->{line};
7418     $self->{column_prev} = $self->{column};
7419     $self->{column}++;
7420     $self->{nc}
7421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7422     } else {
7423     $self->{set_nc}->($self);
7424     }
7425    
7426     redo A;
7427     } elsif ($self->{nc} == 0x0027) { # '
7428     $self->{ca}->{value} = '';
7429     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7430    
7431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7432     $self->{line_prev} = $self->{line};
7433     $self->{column_prev} = $self->{column};
7434     $self->{column}++;
7435     $self->{nc}
7436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7437     } else {
7438     $self->{set_nc}->($self);
7439     }
7440    
7441     redo A;
7442     } elsif ($self->{nc} == 0x003E) { # >
7443     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7444     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7445    
7446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447     $self->{line_prev} = $self->{line};
7448     $self->{column_prev} = $self->{column};
7449     $self->{column}++;
7450     $self->{nc}
7451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452     } else {
7453     $self->{set_nc}->($self);
7454     }
7455    
7456     return ($self->{ct}); # ATTLIST
7457     redo A;
7458     } elsif ($self->{nc} == -1) {
7459     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7460     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7461    
7462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463     $self->{line_prev} = $self->{line};
7464     $self->{column_prev} = $self->{column};
7465     $self->{column}++;
7466     $self->{nc}
7467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468     } else {
7469     $self->{set_nc}->($self);
7470     }
7471    
7472     return ($self->{ct});
7473     redo A;
7474     } else {
7475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7476     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7477     ## Reconsume.
7478     redo A;
7479     }
7480     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7481     if ($is_space->{$self->{nc}}) {
7482     ## XML5: No parse error.
7483     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7484 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7485 wakaba 1.15 ## Reconsume.
7486     redo A;
7487     } elsif ($self->{nc} == 0x0022) { # "
7488     ## XML5: Same as "anything else".
7489     $self->{ca}->{value} = '';
7490     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7491    
7492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493     $self->{line_prev} = $self->{line};
7494     $self->{column_prev} = $self->{column};
7495     $self->{column}++;
7496     $self->{nc}
7497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498     } else {
7499     $self->{set_nc}->($self);
7500     }
7501    
7502     redo A;
7503     } elsif ($self->{nc} == 0x0027) { # '
7504     ## XML5: Same as "anything else".
7505     $self->{ca}->{value} = '';
7506     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7507    
7508     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7509     $self->{line_prev} = $self->{line};
7510     $self->{column_prev} = $self->{column};
7511     $self->{column}++;
7512     $self->{nc}
7513     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7514     } else {
7515     $self->{set_nc}->($self);
7516     }
7517    
7518     redo A;
7519     } elsif ($self->{nc} == 0x003E) { # >
7520     ## XML5: Same as "anything else".
7521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7522     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7523    
7524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7525     $self->{line_prev} = $self->{line};
7526     $self->{column_prev} = $self->{column};
7527     $self->{column}++;
7528     $self->{nc}
7529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7530     } else {
7531     $self->{set_nc}->($self);
7532     }
7533    
7534     return ($self->{ct}); # ATTLIST
7535     redo A;
7536     } elsif ($self->{nc} == -1) {
7537     ## XML5: No parse error.
7538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7540    
7541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7542     $self->{line_prev} = $self->{line};
7543     $self->{column_prev} = $self->{column};
7544     $self->{column}++;
7545     $self->{nc}
7546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7547     } else {
7548     $self->{set_nc}->($self);
7549     }
7550    
7551     return ($self->{ct});
7552     redo A;
7553     } else {
7554     $self->{ca}->{default} = chr $self->{nc};
7555     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7556    
7557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7558     $self->{line_prev} = $self->{line};
7559     $self->{column_prev} = $self->{column};
7560     $self->{column}++;
7561     $self->{nc}
7562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7563     } else {
7564     $self->{set_nc}->($self);
7565     }
7566    
7567     redo A;
7568     }
7569     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7570     if ($is_space->{$self->{nc}}) {
7571     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7572    
7573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7574     $self->{line_prev} = $self->{line};
7575     $self->{column_prev} = $self->{column};
7576     $self->{column}++;
7577     $self->{nc}
7578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7579     } else {
7580     $self->{set_nc}->($self);
7581     }
7582    
7583     redo A;
7584     } elsif ($self->{nc} == 0x0022) { # "
7585     ## XML5: Same as "anything else".
7586     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7587     $self->{ca}->{value} = '';
7588     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7589    
7590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7591     $self->{line_prev} = $self->{line};
7592     $self->{column_prev} = $self->{column};
7593     $self->{column}++;
7594     $self->{nc}
7595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7596     } else {
7597     $self->{set_nc}->($self);
7598     }
7599    
7600     redo A;
7601     } elsif ($self->{nc} == 0x0027) { # '
7602     ## XML5: Same as "anything else".
7603     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7604     $self->{ca}->{value} = '';
7605     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7606    
7607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7608     $self->{line_prev} = $self->{line};
7609     $self->{column_prev} = $self->{column};
7610     $self->{column}++;
7611     $self->{nc}
7612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7613     } else {
7614     $self->{set_nc}->($self);
7615     }
7616    
7617     redo A;
7618     } elsif ($self->{nc} == 0x003E) { # >
7619     ## XML5: Same as "anything else".
7620     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7621     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7622    
7623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7624     $self->{line_prev} = $self->{line};
7625     $self->{column_prev} = $self->{column};
7626     $self->{column}++;
7627     $self->{nc}
7628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7629     } else {
7630     $self->{set_nc}->($self);
7631     }
7632    
7633     return ($self->{ct}); # ATTLIST
7634     redo A;
7635     } elsif ($self->{nc} == -1) {
7636     ## XML5: No parse error.
7637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7639     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7640    
7641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7642     $self->{line_prev} = $self->{line};
7643     $self->{column_prev} = $self->{column};
7644     $self->{column}++;
7645     $self->{nc}
7646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7647     } else {
7648     $self->{set_nc}->($self);
7649     }
7650    
7651     return ($self->{ct});
7652     redo A;
7653     } else {
7654     $self->{ca}->{default} .= chr $self->{nc};
7655     ## Stay in the state.
7656    
7657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7658     $self->{line_prev} = $self->{line};
7659     $self->{column_prev} = $self->{column};
7660     $self->{column}++;
7661     $self->{nc}
7662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7663     } else {
7664     $self->{set_nc}->($self);
7665     }
7666    
7667     redo A;
7668     }
7669     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7670     if ($is_space->{$self->{nc}}) {
7671     ## Stay in the state.
7672    
7673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7674     $self->{line_prev} = $self->{line};
7675     $self->{column_prev} = $self->{column};
7676     $self->{column}++;
7677     $self->{nc}
7678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7679     } else {
7680     $self->{set_nc}->($self);
7681     }
7682    
7683     redo A;
7684     } elsif ($self->{nc} == 0x0022) { # "
7685     $self->{ca}->{value} = '';
7686     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7687    
7688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7689     $self->{line_prev} = $self->{line};
7690     $self->{column_prev} = $self->{column};
7691     $self->{column}++;
7692     $self->{nc}
7693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7694     } else {
7695     $self->{set_nc}->($self);
7696     }
7697    
7698     redo A;
7699     } elsif ($self->{nc} == 0x0027) { # '
7700     $self->{ca}->{value} = '';
7701     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7702    
7703     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7704     $self->{line_prev} = $self->{line};
7705     $self->{column_prev} = $self->{column};
7706     $self->{column}++;
7707     $self->{nc}
7708     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7709     } else {
7710     $self->{set_nc}->($self);
7711     }
7712    
7713     redo A;
7714     } elsif ($self->{nc} == 0x003E) { # >
7715     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7716     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7717    
7718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7719     $self->{line_prev} = $self->{line};
7720     $self->{column_prev} = $self->{column};
7721     $self->{column}++;
7722     $self->{nc}
7723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7724     } else {
7725     $self->{set_nc}->($self);
7726     }
7727    
7728     return ($self->{ct}); # ATTLIST
7729     redo A;
7730     } elsif ($self->{nc} == -1) {
7731     ## XML5: No parse error.
7732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7733     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7734     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7735    
7736     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7737     $self->{line_prev} = $self->{line};
7738     $self->{column_prev} = $self->{column};
7739     $self->{column}++;
7740     $self->{nc}
7741     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7742     } else {
7743     $self->{set_nc}->($self);
7744     }
7745    
7746     return ($self->{ct});
7747     redo A;
7748     } else {
7749     ## XML5: Not defined yet.
7750     if ($self->{ca}->{default} eq 'FIXED') {
7751     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7752     } else {
7753     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7754     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7755     }
7756     ## Reconsume.
7757     redo A;
7758     }
7759     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7760     if ($is_space->{$self->{nc}} or
7761     $self->{nc} == -1 or
7762     $self->{nc} == 0x003E) { # >
7763     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7764     ## Reconsume.
7765     redo A;
7766     } else {
7767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7768     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7769     ## Reconsume.
7770     redo A;
7771 wakaba 1.16 }
7772 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7773     ## ASCII case-insensitive
7774     if ($self->{nc} == [
7775     undef,
7776     0x0044, # D
7777     0x0041, # A
7778     0x0054, # T
7779     ]->[length $self->{kwd}] or
7780     $self->{nc} == [
7781     undef,
7782     0x0064, # d
7783     0x0061, # a
7784     0x0074, # t
7785     ]->[length $self->{kwd}]) {
7786    
7787     ## Stay in the state.
7788     $self->{kwd} .= chr $self->{nc};
7789    
7790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7791     $self->{line_prev} = $self->{line};
7792     $self->{column_prev} = $self->{column};
7793     $self->{column}++;
7794     $self->{nc}
7795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7796     } else {
7797     $self->{set_nc}->($self);
7798     }
7799    
7800     redo A;
7801     } elsif ((length $self->{kwd}) == 4 and
7802     ($self->{nc} == 0x0041 or # A
7803     $self->{nc} == 0x0061)) { # a
7804     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7805    
7806     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7807     text => 'NDATA',
7808     line => $self->{line_prev},
7809     column => $self->{column_prev} - 4);
7810     } else {
7811    
7812     }
7813     $self->{state} = AFTER_NDATA_STATE;
7814    
7815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7816     $self->{line_prev} = $self->{line};
7817     $self->{column_prev} = $self->{column};
7818     $self->{column}++;
7819     $self->{nc}
7820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7821     } else {
7822     $self->{set_nc}->($self);
7823     }
7824    
7825     redo A;
7826     } else {
7827     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7828     line => $self->{line_prev},
7829     column => $self->{column_prev} + 1
7830     - length $self->{kwd});
7831    
7832     $self->{state} = BOGUS_MD_STATE;
7833     ## Reconsume.
7834     redo A;
7835     }
7836     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7837     if ($is_space->{$self->{nc}}) {
7838     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7839    
7840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7841     $self->{line_prev} = $self->{line};
7842     $self->{column_prev} = $self->{column};
7843     $self->{column}++;
7844     $self->{nc}
7845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7846     } else {
7847     $self->{set_nc}->($self);
7848     }
7849    
7850     redo A;
7851     } elsif ($self->{nc} == 0x003E) { # >
7852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7853     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7854    
7855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7856     $self->{line_prev} = $self->{line};
7857     $self->{column_prev} = $self->{column};
7858     $self->{column}++;
7859     $self->{nc}
7860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7861     } else {
7862     $self->{set_nc}->($self);
7863     }
7864    
7865     return ($self->{ct}); # ENTITY
7866     redo A;
7867     } elsif ($self->{nc} == -1) {
7868     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7869     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7870    
7871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7872     $self->{line_prev} = $self->{line};
7873     $self->{column_prev} = $self->{column};
7874     $self->{column}++;
7875     $self->{nc}
7876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7877     } else {
7878     $self->{set_nc}->($self);
7879     }
7880    
7881     return ($self->{ct}); # ENTITY
7882     redo A;
7883     } else {
7884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7885     line => $self->{line_prev},
7886     column => $self->{column_prev} + 1
7887     - length $self->{kwd});
7888     $self->{state} = BOGUS_MD_STATE;
7889     ## Reconsume.
7890     redo A;
7891     }
7892     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7893     if ($is_space->{$self->{nc}}) {
7894     ## Stay in the state.
7895    
7896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7897     $self->{line_prev} = $self->{line};
7898     $self->{column_prev} = $self->{column};
7899     $self->{column}++;
7900     $self->{nc}
7901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7902     } else {
7903     $self->{set_nc}->($self);
7904     }
7905    
7906     redo A;
7907     } elsif ($self->{nc} == 0x003E) { # >
7908     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7910    
7911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7912     $self->{line_prev} = $self->{line};
7913     $self->{column_prev} = $self->{column};
7914     $self->{column}++;
7915     $self->{nc}
7916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7917     } else {
7918     $self->{set_nc}->($self);
7919     }
7920    
7921     return ($self->{ct}); # ENTITY
7922     redo A;
7923     } elsif ($self->{nc} == -1) {
7924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7925     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7926    
7927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7928     $self->{line_prev} = $self->{line};
7929     $self->{column_prev} = $self->{column};
7930     $self->{column}++;
7931     $self->{nc}
7932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7933     } else {
7934     $self->{set_nc}->($self);
7935     }
7936    
7937     return ($self->{ct}); # ENTITY
7938     redo A;
7939     } else {
7940     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7941     $self->{state} = NOTATION_NAME_STATE;
7942    
7943     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7944     $self->{line_prev} = $self->{line};
7945     $self->{column_prev} = $self->{column};
7946     $self->{column}++;
7947     $self->{nc}
7948     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7949     } else {
7950     $self->{set_nc}->($self);
7951     }
7952    
7953     redo A;
7954     }
7955     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7956     if ($is_space->{$self->{nc}}) {
7957 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7958 wakaba 1.18
7959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7960     $self->{line_prev} = $self->{line};
7961     $self->{column_prev} = $self->{column};
7962     $self->{column}++;
7963     $self->{nc}
7964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7965     } else {
7966     $self->{set_nc}->($self);
7967     }
7968    
7969     redo A;
7970     } elsif ($self->{nc} == 0x003E) { # >
7971     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7972    
7973     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7974     $self->{line_prev} = $self->{line};
7975     $self->{column_prev} = $self->{column};
7976     $self->{column}++;
7977     $self->{nc}
7978     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7979     } else {
7980     $self->{set_nc}->($self);
7981     }
7982    
7983     return ($self->{ct}); # ENTITY
7984     redo A;
7985     } elsif ($self->{nc} == -1) {
7986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7987     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7988    
7989     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7990     $self->{line_prev} = $self->{line};
7991     $self->{column_prev} = $self->{column};
7992     $self->{column}++;
7993     $self->{nc}
7994     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7995     } else {
7996     $self->{set_nc}->($self);
7997     }
7998    
7999     return ($self->{ct}); # ENTITY
8000     redo A;
8001     } else {
8002     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8003     ## Stay in the state.
8004    
8005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8006     $self->{line_prev} = $self->{line};
8007     $self->{column_prev} = $self->{column};
8008     $self->{column}++;
8009     $self->{nc}
8010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8011     } else {
8012     $self->{set_nc}->($self);
8013     }
8014    
8015     redo A;
8016     }
8017 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8018     if ($self->{nc} == 0x0022) { # "
8019 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
8020 wakaba 1.19
8021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8022     $self->{line_prev} = $self->{line};
8023     $self->{column_prev} = $self->{column};
8024     $self->{column}++;
8025     $self->{nc}
8026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8027     } else {
8028     $self->{set_nc}->($self);
8029     }
8030    
8031     redo A;
8032     } elsif ($self->{nc} == 0x0026) { # &
8033     $self->{prev_state} = $self->{state};
8034     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8035     $self->{entity_add} = 0x0022; # "
8036    
8037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8038     $self->{line_prev} = $self->{line};
8039     $self->{column_prev} = $self->{column};
8040     $self->{column}++;
8041     $self->{nc}
8042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8043     } else {
8044     $self->{set_nc}->($self);
8045     }
8046    
8047     redo A;
8048     ## TODO: %
8049     } elsif ($self->{nc} == -1) {
8050     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8051     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8052     ## Reconsume.
8053     return ($self->{ct}); # ENTITY
8054     redo A;
8055     } else {
8056     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8057    
8058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8059     $self->{line_prev} = $self->{line};
8060     $self->{column_prev} = $self->{column};
8061     $self->{column}++;
8062     $self->{nc}
8063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8064     } else {
8065     $self->{set_nc}->($self);
8066     }
8067    
8068     redo A;
8069     }
8070     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8071     if ($self->{nc} == 0x0027) { # '
8072 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
8073 wakaba 1.19
8074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8075     $self->{line_prev} = $self->{line};
8076     $self->{column_prev} = $self->{column};
8077     $self->{column}++;
8078     $self->{nc}
8079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8080     } else {
8081     $self->{set_nc}->($self);
8082     }
8083    
8084     redo A;
8085     } elsif ($self->{nc} == 0x0026) { # &
8086     $self->{prev_state} = $self->{state};
8087     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8088     $self->{entity_add} = 0x0027; # '
8089    
8090     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8091     $self->{line_prev} = $self->{line};
8092     $self->{column_prev} = $self->{column};
8093     $self->{column}++;
8094     $self->{nc}
8095     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8096     } else {
8097     $self->{set_nc}->($self);
8098     }
8099    
8100     redo A;
8101     ## TODO: %
8102     } elsif ($self->{nc} == -1) {
8103     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8104     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8105     ## Reconsume.
8106     return ($self->{ct}); # ENTITY
8107     redo A;
8108     } else {
8109     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8110    
8111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8112     $self->{line_prev} = $self->{line};
8113     $self->{column_prev} = $self->{column};
8114     $self->{column}++;
8115     $self->{nc}
8116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8117     } else {
8118     $self->{set_nc}->($self);
8119     }
8120    
8121     redo A;
8122     }
8123     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8124     if ($is_space->{$self->{nc}} or
8125     {
8126     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8127     $self->{entity_add} => 1,
8128     }->{$self->{nc}}) {
8129 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8130     line => $self->{line_prev},
8131     column => $self->{column_prev}
8132     + ($self->{nc} == -1 ? 1 : 0));
8133 wakaba 1.19 ## Don't consume
8134     ## Return nothing.
8135     #
8136     } elsif ($self->{nc} == 0x0023) { # #
8137     $self->{ca} = $self->{ct};
8138     $self->{state} = ENTITY_HASH_STATE;
8139     $self->{kwd} = '#';
8140    
8141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8142     $self->{line_prev} = $self->{line};
8143     $self->{column_prev} = $self->{column};
8144     $self->{column}++;
8145     $self->{nc}
8146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8147     } else {
8148     $self->{set_nc}->($self);
8149     }
8150    
8151     redo A;
8152     } else {
8153     #
8154     }
8155    
8156     $self->{ct}->{value} .= '&';
8157     $self->{state} = $self->{prev_state};
8158     ## Reconsume.
8159     redo A;
8160 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8161     if ($is_space->{$self->{nc}}) {
8162     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8163    
8164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8165     $self->{line_prev} = $self->{line};
8166     $self->{column_prev} = $self->{column};
8167     $self->{column}++;
8168     $self->{nc}
8169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8170     } else {
8171     $self->{set_nc}->($self);
8172     }
8173    
8174     redo A;
8175     } elsif ($self->{nc} == 0x0028) { # (
8176     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8177     $self->{ct}->{content} = ['('];
8178     $self->{group_depth} = 1;
8179    
8180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8181     $self->{line_prev} = $self->{line};
8182     $self->{column_prev} = $self->{column};
8183     $self->{column}++;
8184     $self->{nc}
8185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8186     } else {
8187     $self->{set_nc}->($self);
8188     }
8189    
8190     redo A;
8191     } elsif ($self->{nc} == 0x003E) { # >
8192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8193     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8194    
8195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8196     $self->{line_prev} = $self->{line};
8197     $self->{column_prev} = $self->{column};
8198     $self->{column}++;
8199     $self->{nc}
8200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8201     } else {
8202     $self->{set_nc}->($self);
8203     }
8204    
8205     return ($self->{ct}); # ELEMENT
8206     redo A;
8207     } elsif ($self->{nc} == -1) {
8208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8210    
8211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8212     $self->{line_prev} = $self->{line};
8213     $self->{column_prev} = $self->{column};
8214     $self->{column}++;
8215     $self->{nc}
8216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8217     } else {
8218     $self->{set_nc}->($self);
8219     }
8220    
8221     return ($self->{ct}); # ELEMENT
8222     redo A;
8223     } else {
8224     $self->{ct}->{content} = [chr $self->{nc}];
8225     $self->{state} = CONTENT_KEYWORD_STATE;
8226    
8227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8228     $self->{line_prev} = $self->{line};
8229     $self->{column_prev} = $self->{column};
8230     $self->{column}++;
8231     $self->{nc}
8232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8233     } else {
8234     $self->{set_nc}->($self);
8235     }
8236    
8237     redo A;
8238     }
8239     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8240     if ($is_space->{$self->{nc}}) {
8241     $self->{state} = AFTER_MD_DEF_STATE;
8242    
8243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8244     $self->{line_prev} = $self->{line};
8245     $self->{column_prev} = $self->{column};
8246     $self->{column}++;
8247     $self->{nc}
8248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8249     } else {
8250     $self->{set_nc}->($self);
8251     }
8252    
8253     redo A;
8254     } elsif ($self->{nc} == 0x003E) { # >
8255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8256    
8257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8258     $self->{line_prev} = $self->{line};
8259     $self->{column_prev} = $self->{column};
8260     $self->{column}++;
8261     $self->{nc}
8262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8263     } else {
8264     $self->{set_nc}->($self);
8265     }
8266    
8267     return ($self->{ct}); # ELEMENT
8268     redo A;
8269     } elsif ($self->{nc} == -1) {
8270     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8271     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272    
8273     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274     $self->{line_prev} = $self->{line};
8275     $self->{column_prev} = $self->{column};
8276     $self->{column}++;
8277     $self->{nc}
8278     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279     } else {
8280     $self->{set_nc}->($self);
8281     }
8282    
8283     return ($self->{ct}); # ELEMENT
8284     redo A;
8285     } else {
8286     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8287     ## Stay in the state.
8288    
8289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8290     $self->{line_prev} = $self->{line};
8291     $self->{column_prev} = $self->{column};
8292     $self->{column}++;
8293     $self->{nc}
8294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8295     } else {
8296     $self->{set_nc}->($self);
8297     }
8298    
8299     redo A;
8300     }
8301     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8302     if ($is_space->{$self->{nc}}) {
8303     ## Stay in the state.
8304    
8305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8306     $self->{line_prev} = $self->{line};
8307     $self->{column_prev} = $self->{column};
8308     $self->{column}++;
8309     $self->{nc}
8310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8311     } else {
8312     $self->{set_nc}->($self);
8313     }
8314    
8315     redo A;
8316     } elsif ($self->{nc} == 0x0028) { # (
8317     $self->{group_depth}++;
8318     push @{$self->{ct}->{content}}, chr $self->{nc};
8319     ## Stay in the state.
8320    
8321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322     $self->{line_prev} = $self->{line};
8323     $self->{column_prev} = $self->{column};
8324     $self->{column}++;
8325     $self->{nc}
8326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327     } else {
8328     $self->{set_nc}->($self);
8329     }
8330    
8331     redo A;
8332     } elsif ($self->{nc} == 0x007C or # |
8333     $self->{nc} == 0x002C) { # ,
8334     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8335     ## Stay in the state.
8336    
8337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338     $self->{line_prev} = $self->{line};
8339     $self->{column_prev} = $self->{column};
8340     $self->{column}++;
8341     $self->{nc}
8342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343     } else {
8344     $self->{set_nc}->($self);
8345     }
8346    
8347     redo A;
8348     } elsif ($self->{nc} == 0x0029) { # )
8349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8350     push @{$self->{ct}->{content}}, chr $self->{nc};
8351     $self->{group_depth}--;
8352     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8353    
8354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8355     $self->{line_prev} = $self->{line};
8356     $self->{column_prev} = $self->{column};
8357     $self->{column}++;
8358     $self->{nc}
8359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8360     } else {
8361     $self->{set_nc}->($self);
8362     }
8363    
8364     redo A;
8365     } elsif ($self->{nc} == 0x003E) { # >
8366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8367     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369    
8370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371     $self->{line_prev} = $self->{line};
8372     $self->{column_prev} = $self->{column};
8373     $self->{column}++;
8374     $self->{nc}
8375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376     } else {
8377     $self->{set_nc}->($self);
8378     }
8379    
8380     return ($self->{ct}); # ELEMENT
8381     redo A;
8382     } elsif ($self->{nc} == -1) {
8383     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8384     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8386    
8387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388     $self->{line_prev} = $self->{line};
8389     $self->{column_prev} = $self->{column};
8390     $self->{column}++;
8391     $self->{nc}
8392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393     } else {
8394     $self->{set_nc}->($self);
8395     }
8396    
8397     return ($self->{ct}); # ELEMENT
8398     redo A;
8399     } else {
8400     push @{$self->{ct}->{content}}, chr $self->{nc};
8401     $self->{state} = CM_ELEMENT_NAME_STATE;
8402    
8403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8404     $self->{line_prev} = $self->{line};
8405     $self->{column_prev} = $self->{column};
8406     $self->{column}++;
8407     $self->{nc}
8408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8409     } else {
8410     $self->{set_nc}->($self);
8411     }
8412    
8413     redo A;
8414     }
8415     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8416     if ($is_space->{$self->{nc}}) {
8417     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8418    
8419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8420     $self->{line_prev} = $self->{line};
8421     $self->{column_prev} = $self->{column};
8422     $self->{column}++;
8423     $self->{nc}
8424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8425     } else {
8426     $self->{set_nc}->($self);
8427     }
8428    
8429     redo A;
8430     } elsif ($self->{nc} == 0x002A or # *
8431     $self->{nc} == 0x002B or # +
8432     $self->{nc} == 0x003F) { # ?
8433     push @{$self->{ct}->{content}}, chr $self->{nc};
8434     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8435    
8436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8437     $self->{line_prev} = $self->{line};
8438     $self->{column_prev} = $self->{column};
8439     $self->{column}++;
8440     $self->{nc}
8441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8442     } else {
8443     $self->{set_nc}->($self);
8444     }
8445    
8446     redo A;
8447     } elsif ($self->{nc} == 0x007C or # |
8448     $self->{nc} == 0x002C) { # ,
8449     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8450     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8451    
8452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8453     $self->{line_prev} = $self->{line};
8454     $self->{column_prev} = $self->{column};
8455     $self->{column}++;
8456     $self->{nc}
8457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8458     } else {
8459     $self->{set_nc}->($self);
8460     }
8461    
8462     redo A;
8463     } elsif ($self->{nc} == 0x0029) { # )
8464     $self->{group_depth}--;
8465     push @{$self->{ct}->{content}}, chr $self->{nc};
8466     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8467    
8468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8469     $self->{line_prev} = $self->{line};
8470     $self->{column_prev} = $self->{column};
8471     $self->{column}++;
8472     $self->{nc}
8473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8474     } else {
8475     $self->{set_nc}->($self);
8476     }
8477    
8478     redo A;
8479     } elsif ($self->{nc} == 0x003E) { # >
8480     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8481     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8482     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8483    
8484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8485     $self->{line_prev} = $self->{line};
8486     $self->{column_prev} = $self->{column};
8487     $self->{column}++;
8488     $self->{nc}
8489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8490     } else {
8491     $self->{set_nc}->($self);
8492     }
8493    
8494     return ($self->{ct}); # ELEMENT
8495     redo A;
8496     } elsif ($self->{nc} == -1) {
8497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8498     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8499     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8500    
8501     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8502     $self->{line_prev} = $self->{line};
8503     $self->{column_prev} = $self->{column};
8504     $self->{column}++;
8505     $self->{nc}
8506     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8507     } else {
8508     $self->{set_nc}->($self);
8509     }
8510    
8511     return ($self->{ct}); # ELEMENT
8512     redo A;
8513     } else {
8514     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8515     ## Stay in the state.
8516    
8517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8518     $self->{line_prev} = $self->{line};
8519     $self->{column_prev} = $self->{column};
8520     $self->{column}++;
8521     $self->{nc}
8522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8523     } else {
8524     $self->{set_nc}->($self);
8525     }
8526    
8527     redo A;
8528     }
8529     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8530     if ($is_space->{$self->{nc}}) {
8531     ## Stay in the state.
8532    
8533     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8534     $self->{line_prev} = $self->{line};
8535     $self->{column_prev} = $self->{column};
8536     $self->{column}++;
8537     $self->{nc}
8538     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8539     } else {
8540     $self->{set_nc}->($self);
8541     }
8542    
8543     redo A;
8544     } elsif ($self->{nc} == 0x007C or # |
8545     $self->{nc} == 0x002C) { # ,
8546     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8547     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8548    
8549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8550     $self->{line_prev} = $self->{line};
8551     $self->{column_prev} = $self->{column};
8552     $self->{column}++;
8553     $self->{nc}
8554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8555     } else {
8556     $self->{set_nc}->($self);
8557     }
8558    
8559     redo A;
8560     } elsif ($self->{nc} == 0x0029) { # )
8561     $self->{group_depth}--;
8562     push @{$self->{ct}->{content}}, chr $self->{nc};
8563     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8564    
8565     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8566     $self->{line_prev} = $self->{line};
8567     $self->{column_prev} = $self->{column};
8568     $self->{column}++;
8569     $self->{nc}
8570     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8571     } else {
8572     $self->{set_nc}->($self);
8573     }
8574    
8575     redo A;
8576     } elsif ($self->{nc} == 0x003E) { # >
8577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8578     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8579     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580    
8581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582     $self->{line_prev} = $self->{line};
8583     $self->{column_prev} = $self->{column};
8584     $self->{column}++;
8585     $self->{nc}
8586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587     } else {
8588     $self->{set_nc}->($self);
8589     }
8590    
8591     return ($self->{ct}); # ELEMENT
8592     redo A;
8593     } elsif ($self->{nc} == -1) {
8594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8595     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8597    
8598     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8599     $self->{line_prev} = $self->{line};
8600     $self->{column_prev} = $self->{column};
8601     $self->{column}++;
8602     $self->{nc}
8603     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8604     } else {
8605     $self->{set_nc}->($self);
8606     }
8607    
8608     return ($self->{ct}); # ELEMENT
8609     redo A;
8610     } else {
8611     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8612     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8613     $self->{state} = BOGUS_MD_STATE;
8614    
8615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8616     $self->{line_prev} = $self->{line};
8617     $self->{column_prev} = $self->{column};
8618     $self->{column}++;
8619     $self->{nc}
8620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8621     } else {
8622     $self->{set_nc}->($self);
8623     }
8624    
8625     redo A;
8626     }
8627     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8628     if ($is_space->{$self->{nc}}) {
8629     if ($self->{group_depth}) {
8630     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8631     } else {
8632     $self->{state} = AFTER_MD_DEF_STATE;
8633     }
8634    
8635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8636     $self->{line_prev} = $self->{line};
8637     $self->{column_prev} = $self->{column};
8638     $self->{column}++;
8639     $self->{nc}
8640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8641     } else {
8642     $self->{set_nc}->($self);
8643     }
8644    
8645     redo A;
8646     } elsif ($self->{nc} == 0x002A or # *
8647     $self->{nc} == 0x002B or # +
8648     $self->{nc} == 0x003F) { # ?
8649     push @{$self->{ct}->{content}}, chr $self->{nc};
8650     if ($self->{group_depth}) {
8651     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8652     } else {
8653     $self->{state} = AFTER_MD_DEF_STATE;
8654     }
8655    
8656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8657     $self->{line_prev} = $self->{line};
8658     $self->{column_prev} = $self->{column};
8659     $self->{column}++;
8660     $self->{nc}
8661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8662     } else {
8663     $self->{set_nc}->($self);
8664     }
8665    
8666     redo A;
8667     } elsif ($self->{nc} == 0x0029) { # )
8668     if ($self->{group_depth}) {
8669     $self->{group_depth}--;
8670     push @{$self->{ct}->{content}}, chr $self->{nc};
8671     ## Stay in the state.
8672    
8673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8674     $self->{line_prev} = $self->{line};
8675     $self->{column_prev} = $self->{column};
8676     $self->{column}++;
8677     $self->{nc}
8678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8679     } else {
8680     $self->{set_nc}->($self);
8681     }
8682    
8683     redo A;
8684     } else {
8685     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8686     $self->{state} = BOGUS_MD_STATE;
8687     ## Reconsume.
8688     redo A;
8689     }
8690     } elsif ($self->{nc} == 0x003E) { # >
8691     if ($self->{group_depth}) {
8692     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8693     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8694     }
8695     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8696    
8697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8698     $self->{line_prev} = $self->{line};
8699     $self->{column_prev} = $self->{column};
8700     $self->{column}++;
8701     $self->{nc}
8702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8703     } else {
8704     $self->{set_nc}->($self);
8705     }
8706    
8707     return ($self->{ct}); # ELEMENT
8708     redo A;
8709     } elsif ($self->{nc} == -1) {
8710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8711     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8712     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8713    
8714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8715     $self->{line_prev} = $self->{line};
8716     $self->{column_prev} = $self->{column};
8717     $self->{column}++;
8718     $self->{nc}
8719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8720     } else {
8721     $self->{set_nc}->($self);
8722     }
8723    
8724     return ($self->{ct}); # ELEMENT
8725     redo A;
8726     } else {
8727     if ($self->{group_depth}) {
8728     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8729     } else {
8730     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8731     $self->{state} = BOGUS_MD_STATE;
8732     }
8733     ## Reconsume.
8734     redo A;
8735     }
8736     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8737 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8738     ## Stay in the state.
8739    
8740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8741     $self->{line_prev} = $self->{line};
8742     $self->{column_prev} = $self->{column};
8743     $self->{column}++;
8744     $self->{nc}
8745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8746     } else {
8747     $self->{set_nc}->($self);
8748     }
8749    
8750     redo A;
8751     } elsif ($self->{nc} == 0x003E) { # >
8752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8753    
8754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8755     $self->{line_prev} = $self->{line};
8756     $self->{column_prev} = $self->{column};
8757     $self->{column}++;
8758     $self->{nc}
8759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8760     } else {
8761     $self->{set_nc}->($self);
8762     }
8763    
8764 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8765 wakaba 1.18 redo A;
8766     } elsif ($self->{nc} == -1) {
8767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8768     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8769    
8770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8771     $self->{line_prev} = $self->{line};
8772     $self->{column_prev} = $self->{column};
8773     $self->{column}++;
8774     $self->{nc}
8775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8776     } else {
8777     $self->{set_nc}->($self);
8778     }
8779    
8780 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8781 wakaba 1.18 redo A;
8782     } else {
8783 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8784 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8785     ## Reconsume.
8786     redo A;
8787     }
8788 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8789     if ($self->{nc} == 0x003E) { # >
8790     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8791    
8792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8793     $self->{line_prev} = $self->{line};
8794     $self->{column_prev} = $self->{column};
8795     $self->{column}++;
8796     $self->{nc}
8797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8798     } else {
8799     $self->{set_nc}->($self);
8800     }
8801    
8802     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8803     redo A;
8804     } elsif ($self->{nc} == -1) {
8805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8806     ## Reconsume.
8807     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8808     redo A;
8809     } else {
8810     ## Stay in the state.
8811    
8812     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8813     $self->{line_prev} = $self->{line};
8814     $self->{column_prev} = $self->{column};
8815     $self->{column}++;
8816     $self->{nc}
8817     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8818     } else {
8819     $self->{set_nc}->($self);
8820     }
8821    
8822     redo A;
8823     }
8824 wakaba 1.1 } else {
8825     die "$0: $self->{state}: Unknown state";
8826     }
8827     } # A
8828    
8829     die "$0: _get_next_token: unexpected case";
8830     } # _get_next_token
8831    
8832     1;
8833 wakaba 1.32 ## $Date: 2009/09/05 09:26:55 $
8834 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24