/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.34 - (hide annotations) (download)
Sat Sep 5 11:31:58 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.33: +11 -10 lines
++ whatpm/t/ChangeLog	5 Sep 2009 11:31:07 -0000
	* tokenizer-test-1.test: Changed to keep non-normal character
	references (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 11:31:46 -0000
	* Tokenizer.pm.src: Changed to keep non-normal character
	references as is (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
## $VERSION is derived from the CVS "Revision" keyword: every run of
## digits in the keyword is extracted, then the first is formatted with
## "%d." and each remaining one with "%02d" (so "1.33" yields "1.33").
3 wakaba 1.34 our $VERSION=do{my @r=(q$Revision: 1.33 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Exporter setup.  This is done in a BEGIN block so that the export
## lists already exist when |Whatpm::HTML|'s own BEGIN block calls
## |Whatpm::HTML::Tokenizer->import (':token')| later in this file.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Token type constants that may be imported individually by name.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag exports the same set of names as |@EXPORT_OK|; the
## two lists must be kept in sync when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Each constant is the numeric value stored in a token's |->{type}|
## field.  They are declared with an empty prototype, i.e. as
## argument-less constant subroutines, and are exported through the
## |:token| tag set up in the BEGIN block above.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose |$token->{tag_name}| is
71     ## set to an empty string.
72 wakaba 1.1
## Everything from here on is compiled into the |Whatpm::HTML| package;
## the tokenizer subs below are therefore defined in |Whatpm::HTML|,
## not in |Whatpm::HTML::Tokenizer|.
73     package Whatpm::HTML;
74    
## Import the token type constants (the |:token| export tag) at compile
## time so that the bareword constants used below resolve.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The |CM_*| constants are single-bit flags; the tokenizer tests them
## against |$self->{content_model}| with bitwise |&|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
## The four content models are combinations of the flags above
## (PLAINTEXT has none of them set).
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Each constant is a numeric identifier for a tokenizer state; the
## tokenizer compares |$self->{state}| against them with |==|.
##
## The numbering is not contiguous: 1 and 12 are no longer defined (see
## the commented-out definitions), and |COMMENT_END_BANG_STATE| and
## |COMMENT_END_SPACE_STATE| carry the out-of-sequence values 102 and
## 103.  The "## LAST" marker presumably flags the highest value
## currently in use -- TODO confirm before allocating a new state number.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108 wakaba 1.32 sub COMMENT_END_BANG_STATE () { 102 }
109     sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110 wakaba 1.1 sub COMMENT_END_DASH_STATE () { 18 }
111     sub BOGUS_COMMENT_STATE () { 19 }
112     sub DOCTYPE_STATE () { 20 }
113     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114     sub DOCTYPE_NAME_STATE () { 22 }
115     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124     sub BOGUS_DOCTYPE_STATE () { 32 }
125     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126     sub SELF_CLOSING_START_TAG_STATE () { 34 }
127     sub CDATA_SECTION_STATE () { 35 }
128     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136     ## NOTE: "Entity data state", "entity in attribute value state", and
137     ## "consume a character reference" algorithm are jointly implemented
138     ## using the following six states:
139     sub ENTITY_STATE () { 44 }
140     sub ENTITY_HASH_STATE () { 45 }
141     sub NCR_NUM_STATE () { 46 }
142     sub HEXREF_X_STATE () { 47 }
143     sub HEXREF_HEX_STATE () { 48 }
144     sub ENTITY_NAME_STATE () { 49 }
145     sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147 wakaba 1.12 ## XML-only states
148 wakaba 1.8 sub PI_STATE () { 51 }
149     sub PI_TARGET_STATE () { 52 }
150     sub PI_TARGET_AFTER_STATE () { 53 }
151     sub PI_DATA_STATE () { 54 }
152     sub PI_AFTER_STATE () { 55 }
153     sub PI_DATA_AFTER_STATE () { 56 }
154 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157     sub DOCTYPE_TAG_STATE () { 60 }
158     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159     sub MD_ATTLIST_STATE () { 62 }
160     sub MD_E_STATE () { 63 }
161     sub MD_ELEMENT_STATE () { 64 }
162     sub MD_ENTITY_STATE () { 65 }
163     sub MD_NOTATION_STATE () { 66 }
164     sub DOCTYPE_MD_STATE () { 67 }
165     sub BEFORE_MD_NAME_STATE () { 68 }
166     sub MD_NAME_STATE () { 69 }
167     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174     sub ALLOWED_TOKEN_STATE () { 77 }
175     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
183     sub NDATA_STATE () { 86 }
184     sub AFTER_NDATA_STATE () { 87 }
185     sub BEFORE_NOTATION_NAME_STATE () { 88 }
186     sub NOTATION_NAME_STATE () { 89 }
187 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190     sub AFTER_ELEMENT_NAME_STATE () { 93 }
191     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192     sub CONTENT_KEYWORD_STATE () { 95 }
193     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194     sub CM_ELEMENT_NAME_STATE () { 97 }
195     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197     sub AFTER_MD_DEF_STATE () { 100 }
198     sub BOGUS_MD_STATE () { 101 }
199 wakaba 1.8
200 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
201     ## list and descriptions)
202    
## NOTE(review): both literals below evaluate to the same number
## (bit 11, i.e. 2048); the |_| in the second one is only a digit
## separator.  Presumably they belong to two different flag sets defined
## in |Whatpm::HTML| -- confirm there before changing either value.
203     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
204     sub FOREIGN_EL () { 0b1_00000000000 }
205    
206     ## Character reference mappings
207    
## |$charref_map| maps a code point to the code point that replaces it.
## Presumably the key is the number written in a numeric character
## reference -- the lookup itself is outside this part of the file.
##
## 0x00 becomes U+FFFD and 0x0D becomes U+000A.  The 0x80..0x9F entries
## look like the Windows-1252 reinterpretation required by HTML5; the
## entries that map to themselves (0x81, 0x8D, 0x8F, 0x90, 0x9D) were
## changed in revision 1.34, whose ChangeLog entry says non-normal
## character references are now kept as is.
208     my $charref_map = {
209 wakaba 1.34 0x00 => 0xFFFD, # REPLACEMENT CHARACTER
210 wakaba 1.1 0x0D => 0x000A,
211     0x80 => 0x20AC,
212 wakaba 1.34 0x81 => 0x0081,
213 wakaba 1.1 0x82 => 0x201A,
214     0x83 => 0x0192,
215     0x84 => 0x201E,
216     0x85 => 0x2026,
217     0x86 => 0x2020,
218     0x87 => 0x2021,
219     0x88 => 0x02C6,
220     0x89 => 0x2030,
221     0x8A => 0x0160,
222     0x8B => 0x2039,
223     0x8C => 0x0152,
224 wakaba 1.34 0x8D => 0x008D,
225 wakaba 1.1 0x8E => 0x017D,
226 wakaba 1.34 0x8F => 0x008F,
227     0x90 => 0x0090,
228 wakaba 1.1 0x91 => 0x2018,
229     0x92 => 0x2019,
230     0x93 => 0x201C,
231     0x94 => 0x201D,
232     0x95 => 0x2022,
233     0x96 => 0x2013,
234     0x97 => 0x2014,
235     0x98 => 0x02DC,
236     0x99 => 0x2122,
237     0x9A => 0x0161,
238     0x9B => 0x203A,
239     0x9C => 0x0153,
240 wakaba 1.34 0x9D => 0x009D,
241 wakaba 1.1 0x9E => 0x017E,
242     0x9F => 0x0178,
243     }; # $charref_map
## Register the remaining special code points (C0 controls, DEL,
## surrogates, U+FDD0..U+FDEF and the U+xFFFE/U+xFFFF pair of every
## plane) with an identity mapping, so that they have an entry in the
## table but keep their own value.
244 wakaba 1.34 $charref_map->{$_} = $_
245     for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
246     0xD800..0xDFFF, 0xFDD0..0xFDEF,
247 wakaba 1.1 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
248     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
249     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
250     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
251     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
252    
253     ## Implementations MUST act as if they used the state machine in the spec.
254    
## Reset the tokenizer to its initial state and fetch the first input
## character.
##
## Argument: |$self|, the parser object.  Its |set_nc| callback must
## already be set (by the |new| constructor); it is invoked once, with
## |$self| as its only argument, to supply the first input character.
##
## Fields reset here: |state|, |s_kwd|, |content_model|, |ct|, |ca|,
## |last_stag_name|, |self_closing|, |char_buffer|, |char_buffer_pos|,
## |nc| and |token|.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so the "take the next
  ## character from |char_buffer|" path used elsewhere in the tokenizer
  ## can never apply here; the first input character always comes from
  ## the |set_nc| callback.
  $self->{set_nc}->($self);

  ## Queue of tokens waiting to be returned by |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
293    
294     ## A token has:
295     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
296 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
297 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
298     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
299 wakaba 1.11 ## ->{target} (PI_TOKEN)
300 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
301     ## ->{sysid} (DOCTYPE_TOKEN)
302     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
303     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
304     ## ->{name}
305     ## ->{value}
306     ## ->{has_reference} == 1 or 0
307 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
308     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
309 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
310 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
311 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
312    
313 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
314     ## |$token->{self_closing}| is used to save the value of
315     ## |$self->{self_closing}| while the token is pushed back to the stack.
316    
317     ## Emitted token MUST immediately be handled by the tree construction state.
318    
319     ## Before each step, UA MAY check to see if either one of the scripts in
320     ## "list of scripts that will execute as soon as possible" or the first
321     ## script in the "list of scripts that will execute asynchronously",
322     ## has completed loading. If one has, then it MUST be executed
323     ## and removed from the list.
324    
325     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
326     ## (This requirement was dropped from HTML5 spec, unfortunately.)
327    
## Lookup table of the code points the tokenizer treats as space
## characters; it is indexed with the next input character
## (|$is_space->{$self->{nc}}|).  VT and CR are listed only as
## commented-out entries, so they are not treated as spaces.
328     my $is_space = {
329     0x0009 => 1, # CHARACTER TABULATION (HT)
330     0x000A => 1, # LINE FEED (LF)
331     #0x000B => 0, # LINE TABULATION (VT)
332 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
333 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
334     0x0020 => 1, # SPACE (SP)
335     };
336    
337     sub _get_next_token ($) {
338     my $self = shift;
339    
340     if ($self->{self_closing}) {
341     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
342     ## NOTE: The |self_closing| flag is only set by start tag token.
343     ## In addition, when a start tag token is emitted, it is always set to
344     ## |ct|.
345     delete $self->{self_closing};
346     }
347    
348     if (@{$self->{token}}) {
349     $self->{self_closing} = $self->{token}->[0]->{self_closing};
350     return shift @{$self->{token}};
351     }
352    
353     A: {
354     if ($self->{state} == PCDATA_STATE) {
355     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
356    
357     if ($self->{nc} == 0x0026) { # &
358    
359     ## NOTE: In the spec, the tokenizer is switched to the
360     ## "entity data state". In this implementation, the tokenizer
361     ## is switched to the |ENTITY_STATE|, which is an implementation
362     ## of the "consume a character reference" algorithm.
363     $self->{entity_add} = -1;
364     $self->{prev_state} = DATA_STATE;
365     $self->{state} = ENTITY_STATE;
366    
367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
368     $self->{line_prev} = $self->{line};
369     $self->{column_prev} = $self->{column};
370     $self->{column}++;
371     $self->{nc}
372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
373     } else {
374     $self->{set_nc}->($self);
375     }
376    
377     redo A;
378     } elsif ($self->{nc} == 0x003C) { # <
379    
380     $self->{state} = TAG_OPEN_STATE;
381    
382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
383     $self->{line_prev} = $self->{line};
384     $self->{column_prev} = $self->{column};
385     $self->{column}++;
386     $self->{nc}
387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
388     } else {
389     $self->{set_nc}->($self);
390     }
391    
392     redo A;
393     } elsif ($self->{nc} == -1) {
394    
395     return ({type => END_OF_FILE_TOKEN,
396     line => $self->{line}, column => $self->{column}});
397     last A; ## TODO: ok?
398     } else {
399    
400     #
401     }
402    
403     # Anything else
404     my $token = {type => CHARACTER_TOKEN,
405     data => chr $self->{nc},
406     line => $self->{line}, column => $self->{column},
407     };
408     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
409    
410     ## Stay in the state.
411    
412     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
413     $self->{line_prev} = $self->{line};
414     $self->{column_prev} = $self->{column};
415     $self->{column}++;
416     $self->{nc}
417     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
418     } else {
419     $self->{set_nc}->($self);
420     }
421    
422     return ($token);
423     redo A;
424     } elsif ($self->{state} == DATA_STATE) {
425     $self->{s_kwd} = '' unless defined $self->{s_kwd};
426     if ($self->{nc} == 0x0026) { # &
427     $self->{s_kwd} = '';
428     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
429     not $self->{escape}) {
430    
431     ## NOTE: In the spec, the tokenizer is switched to the
432     ## "entity data state". In this implementation, the tokenizer
433     ## is switched to the |ENTITY_STATE|, which is an implementation
434     ## of the "consume a character reference" algorithm.
435     $self->{entity_add} = -1;
436     $self->{prev_state} = DATA_STATE;
437     $self->{state} = ENTITY_STATE;
438    
439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
440     $self->{line_prev} = $self->{line};
441     $self->{column_prev} = $self->{column};
442     $self->{column}++;
443     $self->{nc}
444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
445     } else {
446     $self->{set_nc}->($self);
447     }
448    
449     redo A;
450     } else {
451    
452     #
453     }
454     } elsif ($self->{nc} == 0x002D) { # -
455     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
456 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
457 wakaba 1.1
458     $self->{escape} = 1; # unless $self->{escape};
459     $self->{s_kwd} = '--';
460     #
461 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
462 wakaba 1.1
463     $self->{s_kwd} = '--';
464     #
465 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
466    
467     $self->{s_kwd} .= '-';
468     #
469 wakaba 1.1 } else {
470    
471 wakaba 1.5 $self->{s_kwd} = '-';
472 wakaba 1.1 #
473     }
474     }
475    
476     #
477     } elsif ($self->{nc} == 0x0021) { # !
478     if (length $self->{s_kwd}) {
479    
480     $self->{s_kwd} .= '!';
481     #
482     } else {
483    
484     #$self->{s_kwd} = '';
485     #
486     }
487     #
488     } elsif ($self->{nc} == 0x003C) { # <
489     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
490     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
491     not $self->{escape})) {
492    
493     $self->{state} = TAG_OPEN_STATE;
494    
495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
496     $self->{line_prev} = $self->{line};
497     $self->{column_prev} = $self->{column};
498     $self->{column}++;
499     $self->{nc}
500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
501     } else {
502     $self->{set_nc}->($self);
503     }
504    
505     redo A;
506     } else {
507    
508     $self->{s_kwd} = '';
509     #
510     }
511     } elsif ($self->{nc} == 0x003E) { # >
512     if ($self->{escape} and
513     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
514     if ($self->{s_kwd} eq '--') {
515    
516     delete $self->{escape};
517 wakaba 1.5 #
518 wakaba 1.1 } else {
519    
520 wakaba 1.5 #
521 wakaba 1.1 }
522 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
523    
524     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
525     line => $self->{line_prev},
526     column => $self->{column_prev} - 1);
527     #
528 wakaba 1.1 } else {
529    
530 wakaba 1.5 #
531 wakaba 1.1 }
532    
533     $self->{s_kwd} = '';
534     #
535 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
536     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
537    
538     $self->{s_kwd} .= ']';
539     } elsif ($self->{s_kwd} eq ']]') {
540    
541     #
542     } else {
543    
544     $self->{s_kwd} = '';
545     }
546     #
547 wakaba 1.1 } elsif ($self->{nc} == -1) {
548    
549     $self->{s_kwd} = '';
550     return ({type => END_OF_FILE_TOKEN,
551     line => $self->{line}, column => $self->{column}});
552     last A; ## TODO: ok?
553     } else {
554    
555     $self->{s_kwd} = '';
556     #
557     }
558    
559     # Anything else
560     my $token = {type => CHARACTER_TOKEN,
561     data => chr $self->{nc},
562     line => $self->{line}, column => $self->{column},
563     };
564 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
565 wakaba 1.1 length $token->{data})) {
566     $self->{s_kwd} = '';
567     }
568    
569     ## Stay in the data state.
570 wakaba 1.5 if (not $self->{is_xml} and
571     $self->{content_model} == PCDATA_CONTENT_MODEL) {
572 wakaba 1.1
573     $self->{state} = PCDATA_STATE;
574     } else {
575    
576     ## Stay in the state.
577     }
578    
579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
580     $self->{line_prev} = $self->{line};
581     $self->{column_prev} = $self->{column};
582     $self->{column}++;
583     $self->{nc}
584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
585     } else {
586     $self->{set_nc}->($self);
587     }
588    
589     return ($token);
590     redo A;
591     } elsif ($self->{state} == TAG_OPEN_STATE) {
592 wakaba 1.10 ## XML5: "tag state".
593    
594 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
595     if ($self->{nc} == 0x002F) { # /
596    
597    
598     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
599     $self->{line_prev} = $self->{line};
600     $self->{column_prev} = $self->{column};
601     $self->{column}++;
602     $self->{nc}
603     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
604     } else {
605     $self->{set_nc}->($self);
606     }
607    
608     $self->{state} = CLOSE_TAG_OPEN_STATE;
609     redo A;
610     } elsif ($self->{nc} == 0x0021) { # !
611    
612 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
613 wakaba 1.1 #
614     } else {
615    
616 wakaba 1.12 $self->{s_kwd} = '';
617 wakaba 1.1 #
618     }
619    
620     ## reconsume
621     $self->{state} = DATA_STATE;
622     return ({type => CHARACTER_TOKEN, data => '<',
623     line => $self->{line_prev},
624     column => $self->{column_prev},
625     });
626     redo A;
627     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
628     if ($self->{nc} == 0x0021) { # !
629    
630     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
631    
632     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
633     $self->{line_prev} = $self->{line};
634     $self->{column_prev} = $self->{column};
635     $self->{column}++;
636     $self->{nc}
637     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
638     } else {
639     $self->{set_nc}->($self);
640     }
641    
642     redo A;
643     } elsif ($self->{nc} == 0x002F) { # /
644    
645     $self->{state} = CLOSE_TAG_OPEN_STATE;
646    
647     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
648     $self->{line_prev} = $self->{line};
649     $self->{column_prev} = $self->{column};
650     $self->{column}++;
651     $self->{nc}
652     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
653     } else {
654     $self->{set_nc}->($self);
655     }
656    
657     redo A;
658     } elsif (0x0041 <= $self->{nc} and
659     $self->{nc} <= 0x005A) { # A..Z
660    
661     $self->{ct}
662     = {type => START_TAG_TOKEN,
663 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
664 wakaba 1.1 line => $self->{line_prev},
665     column => $self->{column_prev}};
666     $self->{state} = TAG_NAME_STATE;
667    
668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
669     $self->{line_prev} = $self->{line};
670     $self->{column_prev} = $self->{column};
671     $self->{column}++;
672     $self->{nc}
673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
674     } else {
675     $self->{set_nc}->($self);
676     }
677    
678     redo A;
679     } elsif (0x0061 <= $self->{nc} and
680     $self->{nc} <= 0x007A) { # a..z
681    
682     $self->{ct} = {type => START_TAG_TOKEN,
683     tag_name => chr ($self->{nc}),
684     line => $self->{line_prev},
685     column => $self->{column_prev}};
686     $self->{state} = TAG_NAME_STATE;
687    
688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
689     $self->{line_prev} = $self->{line};
690     $self->{column_prev} = $self->{column};
691     $self->{column}++;
692     $self->{nc}
693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
694     } else {
695     $self->{set_nc}->($self);
696     }
697    
698     redo A;
699     } elsif ($self->{nc} == 0x003E) { # >
700    
701     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
702     line => $self->{line_prev},
703     column => $self->{column_prev});
704     $self->{state} = DATA_STATE;
705 wakaba 1.5 $self->{s_kwd} = '';
706 wakaba 1.1
707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
708     $self->{line_prev} = $self->{line};
709     $self->{column_prev} = $self->{column};
710     $self->{column}++;
711     $self->{nc}
712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
713     } else {
714     $self->{set_nc}->($self);
715     }
716    
717    
718     return ({type => CHARACTER_TOKEN, data => '<>',
719     line => $self->{line_prev},
720     column => $self->{column_prev},
721     });
722    
723     redo A;
724     } elsif ($self->{nc} == 0x003F) { # ?
725 wakaba 1.8 if ($self->{is_xml}) {
726    
727     $self->{state} = PI_STATE;
728    
729     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
730     $self->{line_prev} = $self->{line};
731     $self->{column_prev} = $self->{column};
732     $self->{column}++;
733     $self->{nc}
734     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
735     } else {
736     $self->{set_nc}->($self);
737     }
738    
739     redo A;
740     } else {
741    
742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
743     line => $self->{line_prev},
744     column => $self->{column_prev});
745     $self->{state} = BOGUS_COMMENT_STATE;
746     $self->{ct} = {type => COMMENT_TOKEN, data => '',
747     line => $self->{line_prev},
748     column => $self->{column_prev},
749     };
750     ## $self->{nc} is intentionally left as is
751     redo A;
752     }
753 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
754 wakaba 1.1
755     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
756     line => $self->{line_prev},
757     column => $self->{column_prev});
758     $self->{state} = DATA_STATE;
759 wakaba 1.5 $self->{s_kwd} = '';
760 wakaba 1.1 ## reconsume
761    
762     return ({type => CHARACTER_TOKEN, data => '<',
763     line => $self->{line_prev},
764     column => $self->{column_prev},
765     });
766    
767     redo A;
768 wakaba 1.9 } else {
769     ## XML5: "<:" is a parse error.
770    
771     $self->{ct} = {type => START_TAG_TOKEN,
772     tag_name => chr ($self->{nc}),
773     line => $self->{line_prev},
774     column => $self->{column_prev}};
775     $self->{state} = TAG_NAME_STATE;
776    
777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
778     $self->{line_prev} = $self->{line};
779     $self->{column_prev} = $self->{column};
780     $self->{column}++;
781     $self->{nc}
782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
783     } else {
784     $self->{set_nc}->($self);
785     }
786    
787     redo A;
788 wakaba 1.1 }
789     } else {
790     die "$0: $self->{content_model} in tag open";
791     }
792     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
793     ## NOTE: The "close tag open state" in the spec is implemented as
794     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
795    
796 wakaba 1.10 ## XML5: "end tag state".
797    
798 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
799     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
800     if (defined $self->{last_stag_name}) {
801     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
802 wakaba 1.12 $self->{kwd} = '';
803 wakaba 1.1 ## Reconsume.
804     redo A;
805     } else {
806     ## No start tag token has ever been emitted
807     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
808    
809     $self->{state} = DATA_STATE;
810 wakaba 1.5 $self->{s_kwd} = '';
811 wakaba 1.1 ## Reconsume.
812     return ({type => CHARACTER_TOKEN, data => '</',
813     line => $l, column => $c,
814     });
815     redo A;
816     }
817     }
818    
819     if (0x0041 <= $self->{nc} and
820     $self->{nc} <= 0x005A) { # A..Z
821    
822     $self->{ct}
823     = {type => END_TAG_TOKEN,
824 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
825 wakaba 1.1 line => $l, column => $c};
826     $self->{state} = TAG_NAME_STATE;
827    
828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829     $self->{line_prev} = $self->{line};
830     $self->{column_prev} = $self->{column};
831     $self->{column}++;
832     $self->{nc}
833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
834     } else {
835     $self->{set_nc}->($self);
836     }
837    
838     redo A;
839     } elsif (0x0061 <= $self->{nc} and
840     $self->{nc} <= 0x007A) { # a..z
841    
842     $self->{ct} = {type => END_TAG_TOKEN,
843     tag_name => chr ($self->{nc}),
844     line => $l, column => $c};
845     $self->{state} = TAG_NAME_STATE;
846    
847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
848     $self->{line_prev} = $self->{line};
849     $self->{column_prev} = $self->{column};
850     $self->{column}++;
851     $self->{nc}
852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
853     } else {
854     $self->{set_nc}->($self);
855     }
856    
857     redo A;
858     } elsif ($self->{nc} == 0x003E) { # >
859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
860     line => $self->{line_prev}, ## "<" in "</>"
861     column => $self->{column_prev} - 1);
862     $self->{state} = DATA_STATE;
863 wakaba 1.5 $self->{s_kwd} = '';
864 wakaba 1.10 if ($self->{is_xml}) {
865    
866     ## XML5: No parse error.
867    
868     ## NOTE: This parser raises a parse error, since it supports
869     ## XML1, not XML5.
870    
871     ## NOTE: A short end tag token.
872     my $ct = {type => END_TAG_TOKEN,
873     tag_name => '',
874     line => $self->{line_prev},
875     column => $self->{column_prev} - 1,
876     };
877    
878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
879     $self->{line_prev} = $self->{line};
880     $self->{column_prev} = $self->{column};
881     $self->{column}++;
882     $self->{nc}
883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
884     } else {
885     $self->{set_nc}->($self);
886     }
887    
888     return ($ct);
889     } else {
890    
891    
892 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
893     $self->{line_prev} = $self->{line};
894     $self->{column_prev} = $self->{column};
895     $self->{column}++;
896     $self->{nc}
897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
898     } else {
899     $self->{set_nc}->($self);
900     }
901    
902 wakaba 1.10 }
903 wakaba 1.1 redo A;
904     } elsif ($self->{nc} == -1) {
905    
906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
907 wakaba 1.5 $self->{s_kwd} = '';
908 wakaba 1.1 $self->{state} = DATA_STATE;
909     # reconsume
910    
911     return ({type => CHARACTER_TOKEN, data => '</',
912     line => $l, column => $c,
913     });
914    
915     redo A;
916 wakaba 1.10 } elsif (not $self->{is_xml} or
917     $is_space->{$self->{nc}}) {
918 wakaba 1.1
919 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
920     line => $self->{line_prev}, # "<" of "</"
921     column => $self->{column_prev} - 1);
922 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
923     $self->{ct} = {type => COMMENT_TOKEN, data => '',
924     line => $self->{line_prev}, # "<" of "</"
925     column => $self->{column_prev} - 1,
926     };
927     ## NOTE: $self->{nc} is intentionally left as is.
928     ## Although the "anything else" case of the spec does not explicitly
929     ## state that the next input character is to be reconsumed,
930     ## it will be included in the |data| of the comment token
931     ## generated from the bogus end tag, as defined in the
932     ## "bogus comment state" entry.
933     redo A;
934 wakaba 1.10 } else {
935     ## XML5: "</:" is a parse error.
936    
937     $self->{ct} = {type => END_TAG_TOKEN,
938     tag_name => chr ($self->{nc}),
939     line => $l, column => $c};
940     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
941    
942     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
943     $self->{line_prev} = $self->{line};
944     $self->{column_prev} = $self->{column};
945     $self->{column}++;
946     $self->{nc}
947     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
948     } else {
949     $self->{set_nc}->($self);
950     }
951    
952     redo A;
953 wakaba 1.1 }
954     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
955 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
956 wakaba 1.1 if (length $ch) {
957     my $CH = $ch;
958     $ch =~ tr/a-z/A-Z/;
959     my $nch = chr $self->{nc};
960     if ($nch eq $ch or $nch eq $CH) {
961    
962     ## Stay in the state.
963 wakaba 1.12 $self->{kwd} .= $nch;
964 wakaba 1.1
965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
966     $self->{line_prev} = $self->{line};
967     $self->{column_prev} = $self->{column};
968     $self->{column}++;
969     $self->{nc}
970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
971     } else {
972     $self->{set_nc}->($self);
973     }
974    
975     redo A;
976     } else {
977    
978     $self->{state} = DATA_STATE;
979 wakaba 1.5 $self->{s_kwd} = '';
980 wakaba 1.1 ## Reconsume.
981     return ({type => CHARACTER_TOKEN,
982 wakaba 1.12 data => '</' . $self->{kwd},
983 wakaba 1.1 line => $self->{line_prev},
984 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
985 wakaba 1.1 });
986     redo A;
987     }
988     } else { # after "<{tag-name}"
989     unless ($is_space->{$self->{nc}} or
990     {
991     0x003E => 1, # >
992     0x002F => 1, # /
993     -1 => 1, # EOF
994     }->{$self->{nc}}) {
995    
996     ## Reconsume.
997     $self->{state} = DATA_STATE;
998 wakaba 1.5 $self->{s_kwd} = '';
999 wakaba 1.1 return ({type => CHARACTER_TOKEN,
1000 wakaba 1.12 data => '</' . $self->{kwd},
1001 wakaba 1.1 line => $self->{line_prev},
1002 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1003 wakaba 1.1 });
1004     redo A;
1005     } else {
1006    
1007     $self->{ct}
1008     = {type => END_TAG_TOKEN,
1009     tag_name => $self->{last_stag_name},
1010     line => $self->{line_prev},
1011 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1012 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1013     ## Reconsume.
1014     redo A;
1015     }
1016     }
1017     } elsif ($self->{state} == TAG_NAME_STATE) {
1018     if ($is_space->{$self->{nc}}) {
1019    
1020     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1021    
1022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1023     $self->{line_prev} = $self->{line};
1024     $self->{column_prev} = $self->{column};
1025     $self->{column}++;
1026     $self->{nc}
1027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1028     } else {
1029     $self->{set_nc}->($self);
1030     }
1031    
1032     redo A;
1033     } elsif ($self->{nc} == 0x003E) { # >
1034     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1035    
1036     $self->{last_stag_name} = $self->{ct}->{tag_name};
1037     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1038     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1039     #if ($self->{ct}->{attributes}) {
1040     # ## NOTE: This should never be reached.
1041     # !!! cp (36);
1042     # !!! parse-error (type => 'end tag attribute');
1043     #} else {
1044    
1045     #}
1046     } else {
1047     die "$0: $self->{ct}->{type}: Unknown token type";
1048     }
1049     $self->{state} = DATA_STATE;
1050 wakaba 1.5 $self->{s_kwd} = '';
1051 wakaba 1.1
1052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1053     $self->{line_prev} = $self->{line};
1054     $self->{column_prev} = $self->{column};
1055     $self->{column}++;
1056     $self->{nc}
1057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1058     } else {
1059     $self->{set_nc}->($self);
1060     }
1061    
1062    
1063     return ($self->{ct}); # start tag or end tag
1064    
1065     redo A;
1066     } elsif (0x0041 <= $self->{nc} and
1067     $self->{nc} <= 0x005A) { # A..Z
1068    
1069 wakaba 1.4 $self->{ct}->{tag_name}
1070     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1071 wakaba 1.1 # start tag or end tag
1072     ## Stay in this state
1073    
1074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1075     $self->{line_prev} = $self->{line};
1076     $self->{column_prev} = $self->{column};
1077     $self->{column}++;
1078     $self->{nc}
1079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1080     } else {
1081     $self->{set_nc}->($self);
1082     }
1083    
1084     redo A;
1085     } elsif ($self->{nc} == -1) {
1086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1087     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1088    
1089     $self->{last_stag_name} = $self->{ct}->{tag_name};
1090     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1091     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1092     #if ($self->{ct}->{attributes}) {
1093     # ## NOTE: This state should never be reached.
1094     # !!! cp (40);
1095     # !!! parse-error (type => 'end tag attribute');
1096     #} else {
1097    
1098     #}
1099     } else {
1100     die "$0: $self->{ct}->{type}: Unknown token type";
1101     }
1102     $self->{state} = DATA_STATE;
1103 wakaba 1.5 $self->{s_kwd} = '';
1104 wakaba 1.1 # reconsume
1105    
1106 wakaba 1.33 ## Discard the token.
1107     #return ($self->{ct}); # start tag or end tag
1108 wakaba 1.1
1109     redo A;
1110     } elsif ($self->{nc} == 0x002F) { # /
1111    
1112     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1113    
1114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1115     $self->{line_prev} = $self->{line};
1116     $self->{column_prev} = $self->{column};
1117     $self->{column}++;
1118     $self->{nc}
1119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1120     } else {
1121     $self->{set_nc}->($self);
1122     }
1123    
1124     redo A;
1125     } else {
1126    
1127     $self->{ct}->{tag_name} .= chr $self->{nc};
1128     # start tag or end tag
1129     ## Stay in the state
1130    
1131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1132     $self->{line_prev} = $self->{line};
1133     $self->{column_prev} = $self->{column};
1134     $self->{column}++;
1135     $self->{nc}
1136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1137     } else {
1138     $self->{set_nc}->($self);
1139     }
1140    
1141     redo A;
1142     }
1143     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1144 wakaba 1.11 ## XML5: "Tag attribute name before state".
1145    
1146 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1147    
1148     ## Stay in the state
1149    
1150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1151     $self->{line_prev} = $self->{line};
1152     $self->{column_prev} = $self->{column};
1153     $self->{column}++;
1154     $self->{nc}
1155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1156     } else {
1157     $self->{set_nc}->($self);
1158     }
1159    
1160     redo A;
1161     } elsif ($self->{nc} == 0x003E) { # >
1162     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1163    
1164     $self->{last_stag_name} = $self->{ct}->{tag_name};
1165     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1166     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1167     if ($self->{ct}->{attributes}) {
1168    
1169     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1170     } else {
1171    
1172     }
1173     } else {
1174     die "$0: $self->{ct}->{type}: Unknown token type";
1175     }
1176     $self->{state} = DATA_STATE;
1177 wakaba 1.5 $self->{s_kwd} = '';
1178 wakaba 1.1
1179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1180     $self->{line_prev} = $self->{line};
1181     $self->{column_prev} = $self->{column};
1182     $self->{column}++;
1183     $self->{nc}
1184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1185     } else {
1186     $self->{set_nc}->($self);
1187     }
1188    
1189    
1190     return ($self->{ct}); # start tag or end tag
1191    
1192     redo A;
1193     } elsif (0x0041 <= $self->{nc} and
1194     $self->{nc} <= 0x005A) { # A..Z
1195    
1196     $self->{ca}
1197 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1198 wakaba 1.1 value => '',
1199     line => $self->{line}, column => $self->{column}};
1200     $self->{state} = ATTRIBUTE_NAME_STATE;
1201    
1202     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1203     $self->{line_prev} = $self->{line};
1204     $self->{column_prev} = $self->{column};
1205     $self->{column}++;
1206     $self->{nc}
1207     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1208     } else {
1209     $self->{set_nc}->($self);
1210     }
1211    
1212     redo A;
1213     } elsif ($self->{nc} == 0x002F) { # /
1214    
1215     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1216    
1217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1218     $self->{line_prev} = $self->{line};
1219     $self->{column_prev} = $self->{column};
1220     $self->{column}++;
1221     $self->{nc}
1222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1223     } else {
1224     $self->{set_nc}->($self);
1225     }
1226    
1227     redo A;
1228     } elsif ($self->{nc} == -1) {
1229     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1230     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1231    
1232     $self->{last_stag_name} = $self->{ct}->{tag_name};
1233     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1234     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1235     if ($self->{ct}->{attributes}) {
1236    
1237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1238     } else {
1239    
1240     }
1241     } else {
1242     die "$0: $self->{ct}->{type}: Unknown token type";
1243     }
1244     $self->{state} = DATA_STATE;
1245 wakaba 1.5 $self->{s_kwd} = '';
1246 wakaba 1.1 # reconsume
1247    
1248 wakaba 1.33 ## Discard the token.
1249     #return ($self->{ct}); # start tag or end tag
1250 wakaba 1.1
1251     redo A;
1252     } else {
1253     if ({
1254     0x0022 => 1, # "
1255     0x0027 => 1, # '
1256 wakaba 1.30 0x003C => 1, # <
1257 wakaba 1.1 0x003D => 1, # =
1258     }->{$self->{nc}}) {
1259    
1260 wakaba 1.11 ## XML5: Not a parse error.
1261 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1262     } else {
1263    
1264 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1265 wakaba 1.1 }
1266     $self->{ca}
1267     = {name => chr ($self->{nc}),
1268     value => '',
1269     line => $self->{line}, column => $self->{column}};
1270     $self->{state} = ATTRIBUTE_NAME_STATE;
1271    
1272     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1273     $self->{line_prev} = $self->{line};
1274     $self->{column_prev} = $self->{column};
1275     $self->{column}++;
1276     $self->{nc}
1277     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1278     } else {
1279     $self->{set_nc}->($self);
1280     }
1281    
1282     redo A;
1283     }
1284     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1285 wakaba 1.11 ## XML5: "Tag attribute name state".
1286    
1287 wakaba 1.1 my $before_leave = sub {
1288     if (exists $self->{ct}->{attributes} # start tag or end tag
1289     ->{$self->{ca}->{name}}) { # MUST
1290    
1291     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1292     ## Discard $self->{ca} # MUST
1293     } else {
1294    
1295     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1296     = $self->{ca};
1297 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1298 wakaba 1.1 }
1299     }; # $before_leave
1300    
1301     if ($is_space->{$self->{nc}}) {
1302    
1303     $before_leave->();
1304     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1305    
1306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307     $self->{line_prev} = $self->{line};
1308     $self->{column_prev} = $self->{column};
1309     $self->{column}++;
1310     $self->{nc}
1311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312     } else {
1313     $self->{set_nc}->($self);
1314     }
1315    
1316     redo A;
1317     } elsif ($self->{nc} == 0x003D) { # =
1318    
1319     $before_leave->();
1320     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1321    
1322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1323     $self->{line_prev} = $self->{line};
1324     $self->{column_prev} = $self->{column};
1325     $self->{column}++;
1326     $self->{nc}
1327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1328     } else {
1329     $self->{set_nc}->($self);
1330     }
1331    
1332     redo A;
1333     } elsif ($self->{nc} == 0x003E) { # >
1334 wakaba 1.11 if ($self->{is_xml}) {
1335    
1336     ## XML5: Not a parse error.
1337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1338     } else {
1339    
1340     }
1341    
1342 wakaba 1.1 $before_leave->();
1343     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1344    
1345     $self->{last_stag_name} = $self->{ct}->{tag_name};
1346     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1347    
1348     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1349     if ($self->{ct}->{attributes}) {
1350     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1351     }
1352     } else {
1353     die "$0: $self->{ct}->{type}: Unknown token type";
1354     }
1355     $self->{state} = DATA_STATE;
1356 wakaba 1.5 $self->{s_kwd} = '';
1357 wakaba 1.1
1358     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1359     $self->{line_prev} = $self->{line};
1360     $self->{column_prev} = $self->{column};
1361     $self->{column}++;
1362     $self->{nc}
1363     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1364     } else {
1365     $self->{set_nc}->($self);
1366     }
1367    
1368    
1369     return ($self->{ct}); # start tag or end tag
1370    
1371     redo A;
1372     } elsif (0x0041 <= $self->{nc} and
1373     $self->{nc} <= 0x005A) { # A..Z
1374    
1375 wakaba 1.4 $self->{ca}->{name}
1376     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1377 wakaba 1.1 ## Stay in the state
1378    
1379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1380     $self->{line_prev} = $self->{line};
1381     $self->{column_prev} = $self->{column};
1382     $self->{column}++;
1383     $self->{nc}
1384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1385     } else {
1386     $self->{set_nc}->($self);
1387     }
1388    
1389     redo A;
1390     } elsif ($self->{nc} == 0x002F) { # /
1391 wakaba 1.11 if ($self->{is_xml}) {
1392    
1393     ## XML5: Not a parse error.
1394     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1395     } else {
1396    
1397     }
1398 wakaba 1.1
1399     $before_leave->();
1400     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1401    
1402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1403     $self->{line_prev} = $self->{line};
1404     $self->{column_prev} = $self->{column};
1405     $self->{column}++;
1406     $self->{nc}
1407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1408     } else {
1409     $self->{set_nc}->($self);
1410     }
1411    
1412     redo A;
1413     } elsif ($self->{nc} == -1) {
1414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1415     $before_leave->();
1416     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1417    
1418     $self->{last_stag_name} = $self->{ct}->{tag_name};
1419     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1420     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1421     if ($self->{ct}->{attributes}) {
1422    
1423     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1424     } else {
1425     ## NOTE: This state should never be reached.
1426    
1427     }
1428     } else {
1429     die "$0: $self->{ct}->{type}: Unknown token type";
1430     }
1431     $self->{state} = DATA_STATE;
1432 wakaba 1.5 $self->{s_kwd} = '';
1433 wakaba 1.1 # reconsume
1434    
1435 wakaba 1.33 ## Discard the token.
1436     #return ($self->{ct}); # start tag or end tag
1437 wakaba 1.1
1438     redo A;
1439     } else {
1440 wakaba 1.30 if ({
1441     0x0022 => 1, # "
1442     0x0027 => 1, # '
1443     0x003C => 1, # <
1444     }->{$self->{nc}}) {
1445 wakaba 1.1
1446 wakaba 1.11 ## XML5: Not a parse error.
1447 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1448     } else {
1449    
1450     }
1451     $self->{ca}->{name} .= chr ($self->{nc});
1452     ## Stay in the state
1453    
1454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1455     $self->{line_prev} = $self->{line};
1456     $self->{column_prev} = $self->{column};
1457     $self->{column}++;
1458     $self->{nc}
1459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1460     } else {
1461     $self->{set_nc}->($self);
1462     }
1463    
1464     redo A;
1465     }
1466     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1467 wakaba 1.11 ## XML5: "Tag attribute name after state".
1468    
1469 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1470    
1471     ## Stay in the state
1472    
1473     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1474     $self->{line_prev} = $self->{line};
1475     $self->{column_prev} = $self->{column};
1476     $self->{column}++;
1477     $self->{nc}
1478     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1479     } else {
1480     $self->{set_nc}->($self);
1481     }
1482    
1483     redo A;
1484     } elsif ($self->{nc} == 0x003D) { # =
1485    
1486     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1487    
1488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1489     $self->{line_prev} = $self->{line};
1490     $self->{column_prev} = $self->{column};
1491     $self->{column}++;
1492     $self->{nc}
1493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1494     } else {
1495     $self->{set_nc}->($self);
1496     }
1497    
1498     redo A;
1499     } elsif ($self->{nc} == 0x003E) { # >
1500 wakaba 1.11 if ($self->{is_xml}) {
1501    
1502     ## XML5: Not a parse error.
1503     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1504     } else {
1505    
1506     }
1507    
1508 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1509    
1510     $self->{last_stag_name} = $self->{ct}->{tag_name};
1511     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1512     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1513     if ($self->{ct}->{attributes}) {
1514    
1515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1516     } else {
1517     ## NOTE: This state should never be reached.
1518    
1519     }
1520     } else {
1521     die "$0: $self->{ct}->{type}: Unknown token type";
1522     }
1523     $self->{state} = DATA_STATE;
1524 wakaba 1.5 $self->{s_kwd} = '';
1525 wakaba 1.1
1526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1527     $self->{line_prev} = $self->{line};
1528     $self->{column_prev} = $self->{column};
1529     $self->{column}++;
1530     $self->{nc}
1531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1532     } else {
1533     $self->{set_nc}->($self);
1534     }
1535    
1536    
1537     return ($self->{ct}); # start tag or end tag
1538    
1539     redo A;
1540     } elsif (0x0041 <= $self->{nc} and
1541     $self->{nc} <= 0x005A) { # A..Z
1542    
1543     $self->{ca}
1544 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1545 wakaba 1.1 value => '',
1546     line => $self->{line}, column => $self->{column}};
1547     $self->{state} = ATTRIBUTE_NAME_STATE;
1548    
1549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1550     $self->{line_prev} = $self->{line};
1551     $self->{column_prev} = $self->{column};
1552     $self->{column}++;
1553     $self->{nc}
1554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1555     } else {
1556     $self->{set_nc}->($self);
1557     }
1558    
1559     redo A;
1560     } elsif ($self->{nc} == 0x002F) { # /
1561 wakaba 1.11 if ($self->{is_xml}) {
1562    
1563     ## XML5: Not a parse error.
1564     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1565     } else {
1566    
1567     }
1568 wakaba 1.1
1569     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1570    
1571     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1572     $self->{line_prev} = $self->{line};
1573     $self->{column_prev} = $self->{column};
1574     $self->{column}++;
1575     $self->{nc}
1576     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1577     } else {
1578     $self->{set_nc}->($self);
1579     }
1580    
1581     redo A;
1582     } elsif ($self->{nc} == -1) {
1583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1584     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1585    
1586     $self->{last_stag_name} = $self->{ct}->{tag_name};
1587     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1588     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1589     if ($self->{ct}->{attributes}) {
1590    
1591     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1592     } else {
1593     ## NOTE: This state should never be reached.
1594    
1595     }
1596     } else {
1597     die "$0: $self->{ct}->{type}: Unknown token type";
1598     }
1599 wakaba 1.5 $self->{s_kwd} = '';
1600 wakaba 1.1 $self->{state} = DATA_STATE;
1601     # reconsume
1602    
1603 wakaba 1.33 ## Discard the token.
1604     #return ($self->{ct}); # start tag or end tag
1605 wakaba 1.1
1606     redo A;
1607     } else {
1608 wakaba 1.11 if ($self->{is_xml}) {
1609    
1610     ## XML5: Not a parse error.
1611     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1612     } else {
1613    
1614     }
1615    
1616 wakaba 1.30 if ({
1617     0x0022 => 1, # "
1618     0x0027 => 1, # '
1619     0x003C => 1, # <
1620     }->{$self->{nc}}) {
1621 wakaba 1.1
1622 wakaba 1.11 ## XML5: Not a parse error.
1623 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1624     } else {
1625    
1626     }
1627     $self->{ca}
1628     = {name => chr ($self->{nc}),
1629     value => '',
1630     line => $self->{line}, column => $self->{column}};
1631     $self->{state} = ATTRIBUTE_NAME_STATE;
1632    
1633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1634     $self->{line_prev} = $self->{line};
1635     $self->{column_prev} = $self->{column};
1636     $self->{column}++;
1637     $self->{nc}
1638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1639     } else {
1640     $self->{set_nc}->($self);
1641     }
1642    
1643     redo A;
1644     }
1645     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1646 wakaba 1.11 ## XML5: "Tag attribute value before state".
1647    
1648 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1649    
1650     ## Stay in the state
1651    
1652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1653     $self->{line_prev} = $self->{line};
1654     $self->{column_prev} = $self->{column};
1655     $self->{column}++;
1656     $self->{nc}
1657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1658     } else {
1659     $self->{set_nc}->($self);
1660     }
1661    
1662     redo A;
1663     } elsif ($self->{nc} == 0x0022) { # "
1664    
1665     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1666    
1667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1668     $self->{line_prev} = $self->{line};
1669     $self->{column_prev} = $self->{column};
1670     $self->{column}++;
1671     $self->{nc}
1672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1673     } else {
1674     $self->{set_nc}->($self);
1675     }
1676    
1677     redo A;
1678     } elsif ($self->{nc} == 0x0026) { # &
1679    
1680     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1681     ## reconsume
1682     redo A;
1683     } elsif ($self->{nc} == 0x0027) { # '
1684    
1685     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1686    
1687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688     $self->{line_prev} = $self->{line};
1689     $self->{column_prev} = $self->{column};
1690     $self->{column}++;
1691     $self->{nc}
1692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693     } else {
1694     $self->{set_nc}->($self);
1695     }
1696    
1697     redo A;
1698     } elsif ($self->{nc} == 0x003E) { # >
1699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1700     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1701    
1702     $self->{last_stag_name} = $self->{ct}->{tag_name};
1703     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1704     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1705     if ($self->{ct}->{attributes}) {
1706    
1707     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1708     } else {
1709     ## NOTE: This state should never be reached.
1710    
1711     }
1712     } else {
1713     die "$0: $self->{ct}->{type}: Unknown token type";
1714     }
1715     $self->{state} = DATA_STATE;
1716 wakaba 1.5 $self->{s_kwd} = '';
1717 wakaba 1.1
1718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1719     $self->{line_prev} = $self->{line};
1720     $self->{column_prev} = $self->{column};
1721     $self->{column}++;
1722     $self->{nc}
1723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1724     } else {
1725     $self->{set_nc}->($self);
1726     }
1727    
1728    
1729     return ($self->{ct}); # start tag or end tag
1730    
1731     redo A;
1732     } elsif ($self->{nc} == -1) {
1733     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1734     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1735    
1736     $self->{last_stag_name} = $self->{ct}->{tag_name};
1737     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1738     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1739     if ($self->{ct}->{attributes}) {
1740    
1741     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1742     } else {
1743     ## NOTE: This state should never be reached.
1744    
1745     }
1746     } else {
1747     die "$0: $self->{ct}->{type}: Unknown token type";
1748     }
1749     $self->{state} = DATA_STATE;
1750 wakaba 1.5 $self->{s_kwd} = '';
1751 wakaba 1.1 ## reconsume
1752    
1753 wakaba 1.33 ## Discard the token.
1754     #return ($self->{ct}); # start tag or end tag
1755 wakaba 1.1
1756     redo A;
1757     } else {
1758 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1759 wakaba 1.1
1760 wakaba 1.11 ## XML5: Not a parse error.
1761 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1762 wakaba 1.11 } elsif ($self->{is_xml}) {
1763    
1764     ## XML5: No parse error.
1765     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1766 wakaba 1.1 } else {
1767    
1768     }
1769     $self->{ca}->{value} .= chr ($self->{nc});
1770     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1771    
1772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1773     $self->{line_prev} = $self->{line};
1774     $self->{column_prev} = $self->{column};
1775     $self->{column}++;
1776     $self->{nc}
1777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1778     } else {
1779     $self->{set_nc}->($self);
1780     }
1781    
1782     redo A;
1783     }
1784     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
## XML5: "Tag attribute value double quoted state" and "DOCTYPE
## ATTLIST attribute value double quoted state".
##
## Consumes one character of a |"|-quoted attribute value.  Every
## branch ends with |redo A| (or |die| for an unexpected token type);
## this state never returns a token.

if ($self->{nc} == 0x0022) { # "
  ## Closing quote: the attribute value is complete.
  if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    ## XML5: "DOCTYPE ATTLIST name after state".
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
  } else {
    ## XML5: "Tag attribute name before state".
    $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  }

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{nc} == 0x0026) { # &
  ## XML5: Not defined yet.

  ## NOTE: In the spec, the tokenizer is switched to the
  ## "entity in attribute value state".  In this implementation, the
  ## tokenizer is switched to the |ENTITY_STATE|, which is an
  ## implementation of the "consume a character reference" algorithm.
  $self->{prev_state} = $self->{state};
  $self->{entity_add} = 0x0022; # "
  $self->{state} = ENTITY_STATE;

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{is_xml} and
         $is_space->{$self->{nc}}) {
  ## In XML mode, a character listed in |$is_space| is appended to
  ## the value as a single U+0020.
  $self->{ca}->{value} .= ' ';
  ## Stay in the state.

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{nc} == -1) {
  ## No more input (|nc| is -1) before the closing quote.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    ## Discard the token, exactly as the single-quoted and unquoted
    ## attribute value states do when the input ends inside a tag.
    #return ($self->{ct}); # start tag

    redo A;
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    } else {
      ## NOTE: This state should never be reached.
    }

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    ## Discard the token.
    #return ($self->{ct}); # end tag

    redo A;
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    ## XML5: No parse error above; not defined yet.
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    ## Reconsume.

    ## Discard the token.
    #return ($self->{ct}); # ATTLIST

    redo A;
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
} else {
  ## Any other character becomes part of the attribute value.
  ## XML5 [ATTLIST]: Not defined yet.
  if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
    ## XML5: Not a parse error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
  }
  $self->{ca}->{value} .= chr ($self->{nc});
  ## Hand the value to |read_until| together with the characters this
  ## state treats specially (|"|, |&|, |<|, TAB, FF, SPACE) and the
  ## current length of the value.
  $self->{read_until}->($self->{ca}->{value},
                        qq["&<\x09\x0C\x20],
                        length $self->{ca}->{value});

  ## Stay in the state

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
}
1921     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
## XML5: "Tag attribute value single quoted state" and "DOCTYPE
## ATTLIST attribute value single quoted state".
##
## Processes the current character of a |'|-quoted attribute value.
## Each case below finishes with |redo A| or |die|, so the cases are
## written as independent guards rather than one |elsif| chain.

if ($self->{nc} == 0x0027) { # '
  ## Closing quote.
  if ($self->{ct}->{type} != ATTLIST_TOKEN) {
    ## XML5: "Before attribute name state" (sic).
    $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  } else {
    ## XML5: "DOCTYPE ATTLIST name after state".
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
  }

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{nc} == 0x0026) { # &
  ## XML5: Not defined yet.

  ## NOTE: The spec switches to the "entity in attribute value
  ## state" here.  This implementation switches to |ENTITY_STATE|,
  ## its implementation of the "consume a character reference"
  ## algorithm, after recording the state to come back to.
  $self->{entity_add} = 0x0027; # '
  $self->{prev_state} = $self->{state};
  $self->{state} = ENTITY_STATE;

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{is_xml} and $is_space->{$self->{nc}}) {
  ## XML mode: a character listed in |$is_space| is stored as U+0020.
  ## The state is unchanged.
  $self->{ca}->{value} .= ' ';

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{nc} == -1) {
  ## No more input (|nc| is -1) before the closing quote.  The
  ## current token is discarded in every case; the -1 is reconsumed
  ## in the state selected below.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');

  my $token_type = $self->{ct}->{type};
  if ($token_type == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    #return ($self->{ct}); # start tag
  } elsif ($token_type == END_TAG_TOKEN) {
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    ## NOTE: An end tag without attributes should never get here.
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    }
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    #return ($self->{ct}); # end tag
  } elsif ($token_type == ATTLIST_TOKEN) {
    ## XML5: No parse error above; not defined yet.
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    #return ($self->{ct}); # ATTLIST
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
  redo A;
}

## Anything else is appended to the attribute value.
## XML5 [ATTLIST]: Not defined yet.
## XML5: |<| is not a parse error.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value') ## TODO: type
    if $self->{is_xml} and $self->{nc} == 0x003C; # <
$self->{ca}->{value} .= chr ($self->{nc});
## |read_until| receives the value, the characters this state treats
## specially (|'|, |&|, |<|, TAB, FF, SPACE) and the current length.
$self->{read_until}->($self->{ca}->{value},
                      qq['&<\x09\x0C\x20],
                      length $self->{ca}->{value});

## Next input character; the state is unchanged.
if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
  $self->{set_nc}->($self);
} else {
  $self->{column_prev} = $self->{column};
  $self->{line_prev} = $self->{line};
  $self->{column}++;
  $self->{nc} = ord substr ($self->{char_buffer},
                            $self->{char_buffer_pos}++, 1);
}
redo A;
2061     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
## XML5: "Tag attribute value unquoted state".
##
## Consumes one character of an unquoted attribute value.  |>| emits
## the current token; running out of input discards it.

if ($is_space->{$self->{nc}}) {
  ## White space ends the value.
  if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
  } else {
    ## XML5: "Tag attribute name before state".
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
  }

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{nc} == 0x0026) { # &
  ## XML5: Not defined yet.

  ## NOTE: In the spec, the tokenizer is switched to the
  ## "entity in attribute value state".  In this implementation, the
  ## tokenizer is switched to the |ENTITY_STATE|, which is an
  ## implementation of the "consume a character reference" algorithm.
  $self->{entity_add} = -1;
  $self->{prev_state} = $self->{state};
  $self->{state} = ENTITY_STATE;

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{nc} == 0x003E) { # >
  ## End of the tag or declaration: emit the current token.
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';

    ## Advance to the next input character.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

    return ($self->{ct}); # start tag
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    } else {
      ## NOTE: This state should never be reached.
    }

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';

    ## Advance to the next input character.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

    return ($self->{ct}); # end tag
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;

    ## Advance to the next input character.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

    return ($self->{ct}); # ATTLIST
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
} elsif ($self->{nc} == -1) {
  ## No more input (|nc| is -1): report the error and discard the
  ## token; the -1 is reconsumed in the state selected below.
  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
    $self->{last_stag_name} = $self->{ct}->{tag_name};

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    ## Discard the token.
    #return ($self->{ct}); # start tag

    redo A;
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    } else {
      ## NOTE: This state should never be reached.
    }

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    ## Discard the token.
    #return ($self->{ct}); # end tag

    redo A;
  } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
    push @{$self->{ct}->{attrdefs}}, $self->{ca};
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    ## Reconsume.

    ## Discard the token.
    #return ($self->{ct}); # ATTLIST

    redo A;
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }
} else {
  ## Any other character becomes part of the value; |"|, |'|, |=|
  ## and |<| are additionally reported.
  if ({
       0x0022 => 1, # "
       0x0027 => 1, # '
       0x003D => 1, # =
       0x003C => 1, # <
      }->{$self->{nc}}) {
    ## XML5: Not a parse error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
  }
  $self->{ca}->{value} .= chr ($self->{nc});
  ## The character list given to |read_until| must contain every
  ## character this state handles specially.  |<| is included so
  ## that a |<| after the first value character comes back through
  ## the check above and is reported as 'bad attribute value' instead
  ## of being consumed without the error.
  $self->{read_until}->($self->{ca}->{value},
                        qq["'=<& \x09\x0C>],
                        length $self->{ca}->{value});

  ## Stay in the state

  ## Advance to the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
}
2252     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
## Handles the character that follows the closing quote of an
## attribute value in a start tag or end tag.  Any other token type
## reaching the |>| or end-of-input case is fatal.

if ($is_space->{$self->{nc}}) {
  $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{nc} == 0x002F) { # /
  $self->{state} = SELF_CLOSING_START_TAG_STATE;

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{nc} == 0x003E or $self->{nc} == -1) { # > or no more input
  ## |>| emits the token.  When the input has run out (|nc| is -1)
  ## an error is reported first, the token is discarded and the -1 is
  ## reconsumed in the data state.
  my $tag_closed = $self->{nc} == 0x003E;
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag')
      unless $tag_closed;

  if ($self->{ct}->{type} == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
  } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
    ## The content model is reset only when the tag is really closed.
    $self->{content_model} = PCDATA_CONTENT_MODEL # MUST
        if $tag_closed;
    ## NOTE: An end tag without attributes should never get here.
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    }
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';

  unless ($tag_closed) {
    ## Discard the token.
    #return ($self->{ct}); # start tag or end tag
    redo A;
  }

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }

  return $self->{ct}; # start tag or end tag
}

## Anything else: report the error and reconsume the character in
## the "before attribute name" state.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
redo A;
2347     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
## XML5: "Empty tag state".
##
## Handles the character that follows a |/| inside a tag.

if ($self->{nc} == 0x003E) { # >
  if ($self->{ct}->{type} != END_TAG_TOKEN) {
    $self->{self_closing} = 1;
  } else {
    ## |/>| at the end of an end tag is an error.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
    ## TODO: Different type than slash in start tag
    $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    }
    ## TODO: Test |<title></title/>|
  }

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }

  return $self->{ct}; # start tag or end tag
}

if ($self->{nc} == -1) {
  ## No more input (|nc| is -1): the tag is unclosed.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');

  my $token_type = $self->{ct}->{type};
  if ($token_type == START_TAG_TOKEN) {
    $self->{last_stag_name} = $self->{ct}->{tag_name};
  } elsif ($token_type == END_TAG_TOKEN) {
    ## NOTE: An end tag without attributes should never get here.
    if ($self->{ct}->{attributes}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
    }
  } else {
    die "$0: $self->{ct}->{type}: Unknown token type";
  }

  ## XML5: "Tag attribute name before state".
  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';
  ## Reconsume.

  ## Discard the token.
  #return ($self->{ct}); # start tag or end tag

  redo A;
}

## Anything else: report the error and reconsume the character in
## the "before attribute name" state.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
## TODO: This error type is wrong.
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
redo A;
2418     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".

## NOTE: Unlike the spec's "bogus comment state", this
## implementation consumes the comment piece by piece and comes back
## to this state until |>| or the end of the input is seen.

if ($self->{nc} == 0x003E or $self->{nc} == -1) { # > or no more input
  ## The comment is finished.  |>| is consumed; the -1 that marks
  ## the end of the input is left to be reconsumed.
  my $input_exhausted = $self->{nc} == -1;

  if ($self->{in_subset}) {
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  } else {
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  }

  unless ($input_exhausted) {
    ## Next input character.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      $self->{column_prev} = $self->{column};
      $self->{line_prev} = $self->{line};
      $self->{column}++;
      $self->{nc} = ord substr ($self->{char_buffer},
                                $self->{char_buffer_pos}++, 1);
    }
  }

  return $self->{ct}; # comment
}

## Anything else is comment data.  |read_until| is handed the data,
## the terminator |>| and the current length of the data.
$self->{ct}->{data} .= chr ($self->{nc}); # comment
$self->{read_until}->($self->{ct}->{data},
                      q[>],
                      length $self->{ct}->{data});

## Next input character; the state is unchanged.
if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
  $self->{set_nc}->($self);
} else {
  $self->{column_prev} = $self->{column};
  $self->{line_prev} = $self->{line};
  $self->{column}++;
  $self->{nc} = ord substr ($self->{char_buffer},
                            $self->{char_buffer_pos}++, 1);
}
redo A;
2481     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
## XML5: "Markup declaration state".
##
## Looks at the current character to choose between a comment
## (|-|), a DOCTYPE (|D| or |d|), a CDATA section (|[|, only in
## foreign content or in XML mode) and a bogus comment.

if ($self->{nc} == 0x002D) { # -
  $self->{state} = MD_HYPHEN_STATE;

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{nc} == 0x0044 or # D
    $self->{nc} == 0x0064) { # d
  ## ASCII case-insensitive.  The keyword seen so far is kept in
  ## |kwd|.
  $self->{state} = MD_DOCTYPE_STATE;
  $self->{kwd} = chr $self->{nc};

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

## The operands below are kept in this order on purpose: the
## foreign-content test is evaluated before the character test.
if ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
      $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
     $self->{is_xml}) and
    $self->{nc} == 0x005B) { # [
  $self->{state} = MD_CDATA_STATE;
  $self->{kwd} = '[';

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

## Anything else starts a bogus comment.  The error and the new
## comment token are both given the previous line and the previous
## column minus one; the current character is reconsumed in the
## bogus comment state.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 1);
$self->{state} = BOGUS_COMMENT_STATE;
$self->{ct} = {
  type => COMMENT_TOKEN,
  data => '',
  line => $self->{line_prev},
  column => $self->{column_prev} - 1,
};
redo A;
2550     } elsif ($self->{state} == MD_HYPHEN_STATE) {
## Entered after a |-| in the markup declaration open state: a
## second |-| starts a real comment, anything else a bogus comment.

if ($self->{nc} != 0x002D) { # not -
  ## Bogus comment whose data starts with the |-| already consumed.
  ## The current character is reconsumed in the bogus comment state.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 2);
  $self->{state} = BOGUS_COMMENT_STATE;
  $self->{ct} = {
    type => COMMENT_TOKEN,
    data => '-',
    line => $self->{line_prev},
    column => $self->{column_prev} - 2,
  };
  redo A;
}

## Second |-|: create an empty comment token.
$self->{ct} = {
  type => COMMENT_TOKEN,
  data => '',
  line => $self->{line_prev},
  column => $self->{column_prev} - 2,
};
$self->{state} = COMMENT_START_STATE; ## XML5: "comment state".

## Next input character.
if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
  $self->{set_nc}->($self);
} else {
  $self->{column_prev} = $self->{column};
  $self->{line_prev} = $self->{line};
  $self->{column}++;
  $self->{nc} = ord substr ($self->{char_buffer},
                            $self->{char_buffer_pos}++, 1);
}
redo A;
2584     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
## Matches the rest of the keyword "DOCTYPE", ASCII
## case-insensitively.  |kwd| holds the characters matched so far.

## Code point expected next, indexed by the number of characters
## already in |kwd|.  Index 0 and indexes past the end give undef,
## which is still compared numerically, as in the list-based form of
## this test.
my @expected_upper = (undef, 0x004F, 0x0043, 0x0054, 0x0059, 0x0050); # O C T Y P
my @expected_lower = (undef, 0x006F, 0x0063, 0x0074, 0x0079, 0x0070); # o c t y p
my $matched = length $self->{kwd};

if ($self->{nc} == $expected_upper[$matched] or
    $self->{nc} == $expected_lower[$matched]) {
  ## One more keyword character; stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($matched == 6 and
    ($self->{nc} == 0x0045 or # E
     $self->{nc} == 0x0065)) { # e
  ## The whole keyword has been read.
  ## XML5: case-sensitive.  In XML mode anything other than the
  ## exact spelling "DOCTYPE" is reported.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
                         text => 'DOCTYPE',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 5)
      if $self->{is_xml} and
         ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065);

  $self->{state} = DOCTYPE_STATE;
  $self->{ct} = {
    type => DOCTYPE_TOKEN,
    quirks => 1,
    line => $self->{line_prev},
    column => $self->{column_prev} - 7,
  };

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

## Not the keyword: the characters matched so far become the data
## of a bogus comment and the current character is reconsumed.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 1 - length $self->{kwd});
$self->{state} = BOGUS_COMMENT_STATE;
$self->{ct} = {
  type => COMMENT_TOKEN,
  data => $self->{kwd},
  line => $self->{line_prev},
  column => $self->{column_prev} - 1 - length $self->{kwd},
};
redo A;
2663     } elsif ($self->{state} == MD_CDATA_STATE) {
## Matches the rest of "[CDATA[" (case-sensitive).  |kwd| holds the
## characters matched so far, starting with the |[|.

## Code point expected after each proper prefix of "[CDATA".  Any
## other |kwd| gives undef, which is still compared numerically, as
## in the inline-hash form of this test.
my %next_after = (
  '['     => 0x0043, # C
  '[C'    => 0x0044, # D
  '[CD'   => 0x0041, # A
  '[CDA'  => 0x0054, # T
  '[CDAT' => 0x0041, # A
);

if ($self->{nc} == $next_after{$self->{kwd}}) {
  ## One more keyword character; stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

if ($self->{kwd} eq '[CDATA' and
    $self->{nc} == 0x005B) { # [
  ## In XML mode, a CDATA section seen while no element is open is
  ## reported, and |tainted| is set so that it is reported only once.
  if ($self->{is_xml} and
      not $self->{tainted} and
      @{$self->{open_elements} or []} == 0) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 7);
    $self->{tainted} = 1;
  }

  ## The section content is collected in a character token.
  $self->{ct} = {
    type => CHARACTER_TOKEN,
    data => '',
    line => $self->{line_prev},
    column => $self->{column_prev} - 7,
  };
  $self->{state} = CDATA_SECTION_STATE;

  ## Next input character.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    $self->{column_prev} = $self->{column};
    $self->{line_prev} = $self->{line};
    $self->{column}++;
    $self->{nc} = ord substr ($self->{char_buffer},
                              $self->{char_buffer_pos}++, 1);
  }
  redo A;
}

## Not the keyword: the characters matched so far become the data
## of a bogus comment and the current character is reconsumed.
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 1 - length $self->{kwd});
$self->{state} = BOGUS_COMMENT_STATE;
$self->{ct} = {
  type => COMMENT_TOKEN,
  data => $self->{kwd},
  line => $self->{line_prev},
  column => $self->{column_prev} - 1 - length $self->{kwd},
};
redo A;
2731     } elsif ($self->{state} == COMMENT_START_STATE) {
2732     if ($self->{nc} == 0x002D) { # -
2733    
2734     $self->{state} = COMMENT_START_DASH_STATE;
2735    
2736     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2737     $self->{line_prev} = $self->{line};
2738     $self->{column_prev} = $self->{column};
2739     $self->{column}++;
2740     $self->{nc}
2741     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2742     } else {
2743     $self->{set_nc}->($self);
2744     }
2745    
2746     redo A;
2747     } elsif ($self->{nc} == 0x003E) { # >
2748     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2749 wakaba 1.13 if ($self->{in_subset}) {
2750    
2751     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2752     } else {
2753    
2754     $self->{state} = DATA_STATE;
2755     $self->{s_kwd} = '';
2756     }
2757 wakaba 1.1
2758     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2759     $self->{line_prev} = $self->{line};
2760     $self->{column_prev} = $self->{column};
2761     $self->{column}++;
2762     $self->{nc}
2763     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2764     } else {
2765     $self->{set_nc}->($self);
2766     }
2767    
2768    
2769     return ($self->{ct}); # comment
2770    
2771     redo A;
2772     } elsif ($self->{nc} == -1) {
2773     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2774 wakaba 1.13 if ($self->{in_subset}) {
2775    
2776     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2777     } else {
2778    
2779     $self->{state} = DATA_STATE;
2780     $self->{s_kwd} = '';
2781     }
2782 wakaba 1.1 ## reconsume
2783    
2784     return ($self->{ct}); # comment
2785    
2786     redo A;
2787     } else {
2788    
2789     $self->{ct}->{data} # comment
2790     .= chr ($self->{nc});
2791     $self->{state} = COMMENT_STATE;
2792    
2793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2794     $self->{line_prev} = $self->{line};
2795     $self->{column_prev} = $self->{column};
2796     $self->{column}++;
2797     $self->{nc}
2798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2799     } else {
2800     $self->{set_nc}->($self);
2801     }
2802    
2803     redo A;
2804     }
2805     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2806     if ($self->{nc} == 0x002D) { # -
2807    
2808     $self->{state} = COMMENT_END_STATE;
2809    
2810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2811     $self->{line_prev} = $self->{line};
2812     $self->{column_prev} = $self->{column};
2813     $self->{column}++;
2814     $self->{nc}
2815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2816     } else {
2817     $self->{set_nc}->($self);
2818     }
2819    
2820     redo A;
2821     } elsif ($self->{nc} == 0x003E) { # >
2822     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2823 wakaba 1.13 if ($self->{in_subset}) {
2824    
2825     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2826     } else {
2827    
2828     $self->{state} = DATA_STATE;
2829     $self->{s_kwd} = '';
2830     }
2831 wakaba 1.1
2832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2833     $self->{line_prev} = $self->{line};
2834     $self->{column_prev} = $self->{column};
2835     $self->{column}++;
2836     $self->{nc}
2837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2838     } else {
2839     $self->{set_nc}->($self);
2840     }
2841    
2842    
2843     return ($self->{ct}); # comment
2844    
2845     redo A;
2846     } elsif ($self->{nc} == -1) {
2847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2848 wakaba 1.13 if ($self->{in_subset}) {
2849    
2850     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2851     } else {
2852    
2853     $self->{state} = DATA_STATE;
2854     $self->{s_kwd} = '';
2855     }
2856 wakaba 1.1 ## reconsume
2857    
2858     return ($self->{ct}); # comment
2859    
2860     redo A;
2861     } else {
2862    
2863     $self->{ct}->{data} # comment
2864     .= '-' . chr ($self->{nc});
2865     $self->{state} = COMMENT_STATE;
2866    
2867     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2868     $self->{line_prev} = $self->{line};
2869     $self->{column_prev} = $self->{column};
2870     $self->{column}++;
2871     $self->{nc}
2872     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2873     } else {
2874     $self->{set_nc}->($self);
2875     }
2876    
2877     redo A;
2878     }
2879     } elsif ($self->{state} == COMMENT_STATE) {
2880 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2881    
2882 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2883    
2884     $self->{state} = COMMENT_END_DASH_STATE;
2885    
2886     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2887     $self->{line_prev} = $self->{line};
2888     $self->{column_prev} = $self->{column};
2889     $self->{column}++;
2890     $self->{nc}
2891     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2892     } else {
2893     $self->{set_nc}->($self);
2894     }
2895    
2896     redo A;
2897     } elsif ($self->{nc} == -1) {
2898     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2899 wakaba 1.13 if ($self->{in_subset}) {
2900    
2901     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902     } else {
2903    
2904     $self->{state} = DATA_STATE;
2905     $self->{s_kwd} = '';
2906     }
2907 wakaba 1.1 ## reconsume
2908    
2909     return ($self->{ct}); # comment
2910    
2911     redo A;
2912     } else {
2913    
2914     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2915     $self->{read_until}->($self->{ct}->{data},
2916     q[-],
2917     length $self->{ct}->{data});
2918    
2919     ## Stay in the state
2920    
2921     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2922     $self->{line_prev} = $self->{line};
2923     $self->{column_prev} = $self->{column};
2924     $self->{column}++;
2925     $self->{nc}
2926     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2927     } else {
2928     $self->{set_nc}->($self);
2929     }
2930    
2931     redo A;
2932     }
2933     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2934 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2935 wakaba 1.10
2936 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2937    
2938     $self->{state} = COMMENT_END_STATE;
2939    
2940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2941     $self->{line_prev} = $self->{line};
2942     $self->{column_prev} = $self->{column};
2943     $self->{column}++;
2944     $self->{nc}
2945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2946     } else {
2947     $self->{set_nc}->($self);
2948     }
2949    
2950     redo A;
2951     } elsif ($self->{nc} == -1) {
2952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2953 wakaba 1.13 if ($self->{in_subset}) {
2954    
2955     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956     } else {
2957    
2958     $self->{state} = DATA_STATE;
2959     $self->{s_kwd} = '';
2960     }
2961 wakaba 1.1 ## reconsume
2962    
2963     return ($self->{ct}); # comment
2964    
2965     redo A;
2966     } else {
2967    
2968     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2969     $self->{state} = COMMENT_STATE;
2970    
2971     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2972     $self->{line_prev} = $self->{line};
2973     $self->{column_prev} = $self->{column};
2974     $self->{column}++;
2975     $self->{nc}
2976     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2977     } else {
2978     $self->{set_nc}->($self);
2979     }
2980    
2981     redo A;
2982     }
2983 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2984     $self->{state} == COMMENT_END_BANG_STATE) {
2985 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2986 wakaba 1.31 ## (No comment end bang state.)
2987 wakaba 1.14
2988 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2989 wakaba 1.13 if ($self->{in_subset}) {
2990    
2991     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2992     } else {
2993    
2994     $self->{state} = DATA_STATE;
2995     $self->{s_kwd} = '';
2996     }
2997 wakaba 1.1
2998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2999     $self->{line_prev} = $self->{line};
3000     $self->{column_prev} = $self->{column};
3001     $self->{column}++;
3002     $self->{nc}
3003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3004     } else {
3005     $self->{set_nc}->($self);
3006     }
3007    
3008    
3009     return ($self->{ct}); # comment
3010    
3011     redo A;
3012     } elsif ($self->{nc} == 0x002D) { # -
3013 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
3014    
3015     $self->{ct}->{data} .= '--!'; # comment
3016     $self->{state} = COMMENT_END_DASH_STATE;
3017     } else {
3018    
3019     ## XML5: Not a parse error.
3020     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3021     line => $self->{line_prev},
3022     column => $self->{column_prev});
3023     $self->{ct}->{data} .= '-'; # comment
3024     ## Stay in the state
3025     }
3026 wakaba 1.1
3027 wakaba 1.31 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3028     $self->{line_prev} = $self->{line};
3029     $self->{column_prev} = $self->{column};
3030     $self->{column}++;
3031     $self->{nc}
3032     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3033     } else {
3034     $self->{set_nc}->($self);
3035     }
3036    
3037     redo A;
3038 wakaba 1.32 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3039     $is_space->{$self->{nc}}) {
3040    
3041     $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3042     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3043     $self->{state} = COMMENT_END_SPACE_STATE;
3044    
3045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3046     $self->{line_prev} = $self->{line};
3047     $self->{column_prev} = $self->{column};
3048     $self->{column}++;
3049     $self->{nc}
3050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3051     } else {
3052     $self->{set_nc}->($self);
3053     }
3054    
3055     redo A;
3056     } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3057     $self->{nc} == 0x0021) { # !
3058    
3059 wakaba 1.31 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3060     $self->{state} = COMMENT_END_BANG_STATE;
3061 wakaba 1.1
3062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3063     $self->{line_prev} = $self->{line};
3064     $self->{column_prev} = $self->{column};
3065     $self->{column}++;
3066     $self->{nc}
3067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3068     } else {
3069     $self->{set_nc}->($self);
3070     }
3071    
3072     redo A;
3073     } elsif ($self->{nc} == -1) {
3074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3075 wakaba 1.13 if ($self->{in_subset}) {
3076    
3077     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3078     } else {
3079    
3080     $self->{state} = DATA_STATE;
3081     $self->{s_kwd} = '';
3082     }
3083 wakaba 1.31 ## Reconsume.
3084 wakaba 1.1
3085     return ($self->{ct}); # comment
3086    
3087     redo A;
3088     } else {
3089    
3090 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
3091     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3092     } else {
3093     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3094     }
3095 wakaba 1.1 $self->{state} = COMMENT_STATE;
3096    
3097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3098     $self->{line_prev} = $self->{line};
3099     $self->{column_prev} = $self->{column};
3100     $self->{column}++;
3101     $self->{nc}
3102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3103     } else {
3104     $self->{set_nc}->($self);
3105     }
3106    
3107     redo A;
3108     }
3109 wakaba 1.32 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3110     ## XML5: This state does not exist.
3111    
3112     if ($self->{nc} == 0x003E) { # >
3113     if ($self->{in_subset}) {
3114    
3115     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3116     } else {
3117    
3118     $self->{state} = DATA_STATE;
3119     $self->{s_kwd} = '';
3120     }
3121    
3122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123     $self->{line_prev} = $self->{line};
3124     $self->{column_prev} = $self->{column};
3125     $self->{column}++;
3126     $self->{nc}
3127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128     } else {
3129     $self->{set_nc}->($self);
3130     }
3131    
3132    
3133     return ($self->{ct}); # comment
3134    
3135     redo A;
3136     } elsif ($is_space->{$self->{nc}}) {
3137    
3138     $self->{ct}->{data} .= chr ($self->{nc}); # comment
3139     ## Stay in the state.
3140    
3141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3142     $self->{line_prev} = $self->{line};
3143     $self->{column_prev} = $self->{column};
3144     $self->{column}++;
3145     $self->{nc}
3146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3147     } else {
3148     $self->{set_nc}->($self);
3149     }
3150    
3151     redo A;
3152     } elsif ($self->{nc} == -1) {
3153     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3154     if ($self->{in_subset}) {
3155    
3156     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3157     } else {
3158    
3159     $self->{state} = DATA_STATE;
3160     $self->{s_kwd} = '';
3161     }
3162     ## Reconsume.
3163    
3164     return ($self->{ct}); # comment
3165    
3166     redo A;
3167     } else {
3168    
3169     $self->{ct}->{data} .= chr ($self->{nc}); # comment
3170     $self->{state} = COMMENT_STATE;
3171    
3172     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3173     $self->{line_prev} = $self->{line};
3174     $self->{column_prev} = $self->{column};
3175     $self->{column}++;
3176     $self->{nc}
3177     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3178     } else {
3179     $self->{set_nc}->($self);
3180     }
3181    
3182     redo A;
3183     }
3184 wakaba 1.1 } elsif ($self->{state} == DOCTYPE_STATE) {
3185     if ($is_space->{$self->{nc}}) {
3186    
3187     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3188    
3189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3190     $self->{line_prev} = $self->{line};
3191     $self->{column_prev} = $self->{column};
3192     $self->{column}++;
3193     $self->{nc}
3194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3195     } else {
3196     $self->{set_nc}->($self);
3197     }
3198    
3199     redo A;
3200 wakaba 1.28 } elsif ($self->{nc} == -1) {
3201    
3202     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3203     $self->{ct}->{quirks} = 1;
3204    
3205     $self->{state} = DATA_STATE;
3206     ## Reconsume.
3207     return ($self->{ct}); # DOCTYPE (quirks)
3208    
3209     redo A;
3210 wakaba 1.1 } else {
3211    
3212 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3213 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3214     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3215     ## reconsume
3216     redo A;
3217     }
3218     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3219 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3220    
3221 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3222    
3223     ## Stay in the state
3224    
3225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3226     $self->{line_prev} = $self->{line};
3227     $self->{column_prev} = $self->{column};
3228     $self->{column}++;
3229     $self->{nc}
3230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3231     } else {
3232     $self->{set_nc}->($self);
3233     }
3234    
3235     redo A;
3236     } elsif ($self->{nc} == 0x003E) { # >
3237    
3238 wakaba 1.12 ## XML5: No parse error.
3239 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3240     $self->{state} = DATA_STATE;
3241 wakaba 1.5 $self->{s_kwd} = '';
3242 wakaba 1.1
3243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3244     $self->{line_prev} = $self->{line};
3245     $self->{column_prev} = $self->{column};
3246     $self->{column}++;
3247     $self->{nc}
3248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3249     } else {
3250     $self->{set_nc}->($self);
3251     }
3252    
3253    
3254     return ($self->{ct}); # DOCTYPE (quirks)
3255    
3256     redo A;
3257 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3258    
3259     $self->{ct}->{name} # DOCTYPE
3260     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3261     delete $self->{ct}->{quirks};
3262     $self->{state} = DOCTYPE_NAME_STATE;
3263    
3264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265     $self->{line_prev} = $self->{line};
3266     $self->{column_prev} = $self->{column};
3267     $self->{column}++;
3268     $self->{nc}
3269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270     } else {
3271     $self->{set_nc}->($self);
3272     }
3273    
3274     redo A;
3275 wakaba 1.1 } elsif ($self->{nc} == -1) {
3276    
3277     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3278     $self->{state} = DATA_STATE;
3279 wakaba 1.5 $self->{s_kwd} = '';
3280 wakaba 1.1 ## reconsume
3281    
3282     return ($self->{ct}); # DOCTYPE (quirks)
3283    
3284     redo A;
3285 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3286    
3287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3288     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3289 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3290     $self->{in_subset} = 1;
3291 wakaba 1.12
3292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3293     $self->{line_prev} = $self->{line};
3294     $self->{column_prev} = $self->{column};
3295     $self->{column}++;
3296     $self->{nc}
3297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3298     } else {
3299     $self->{set_nc}->($self);
3300     }
3301    
3302 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3303 wakaba 1.12 redo A;
3304 wakaba 1.1 } else {
3305    
3306     $self->{ct}->{name} = chr $self->{nc};
3307     delete $self->{ct}->{quirks};
3308     $self->{state} = DOCTYPE_NAME_STATE;
3309    
3310     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3311     $self->{line_prev} = $self->{line};
3312     $self->{column_prev} = $self->{column};
3313     $self->{column}++;
3314     $self->{nc}
3315     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3316     } else {
3317     $self->{set_nc}->($self);
3318     }
3319    
3320     redo A;
3321     }
3322     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3323 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3324    
3325     ## ISSUE: Redundant "First," in the spec.
3326    
3327 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3328    
3329     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3330    
3331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332     $self->{line_prev} = $self->{line};
3333     $self->{column_prev} = $self->{column};
3334     $self->{column}++;
3335     $self->{nc}
3336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3337     } else {
3338     $self->{set_nc}->($self);
3339     }
3340    
3341     redo A;
3342     } elsif ($self->{nc} == 0x003E) { # >
3343    
3344     $self->{state} = DATA_STATE;
3345 wakaba 1.5 $self->{s_kwd} = '';
3346 wakaba 1.1
3347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3348     $self->{line_prev} = $self->{line};
3349     $self->{column_prev} = $self->{column};
3350     $self->{column}++;
3351     $self->{nc}
3352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3353     } else {
3354     $self->{set_nc}->($self);
3355     }
3356    
3357    
3358     return ($self->{ct}); # DOCTYPE
3359    
3360     redo A;
3361 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3362    
3363     $self->{ct}->{name} # DOCTYPE
3364     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3365     delete $self->{ct}->{quirks};
3366     ## Stay in the state.
3367    
3368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3369     $self->{line_prev} = $self->{line};
3370     $self->{column_prev} = $self->{column};
3371     $self->{column}++;
3372     $self->{nc}
3373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3374     } else {
3375     $self->{set_nc}->($self);
3376     }
3377    
3378     redo A;
3379 wakaba 1.1 } elsif ($self->{nc} == -1) {
3380    
3381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3382     $self->{state} = DATA_STATE;
3383 wakaba 1.5 $self->{s_kwd} = '';
3384 wakaba 1.1 ## reconsume
3385    
3386     $self->{ct}->{quirks} = 1;
3387     return ($self->{ct}); # DOCTYPE
3388    
3389     redo A;
3390 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3391    
3392     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3393 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3394     $self->{in_subset} = 1;
3395 wakaba 1.12
3396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3397     $self->{line_prev} = $self->{line};
3398     $self->{column_prev} = $self->{column};
3399     $self->{column}++;
3400     $self->{nc}
3401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3402     } else {
3403     $self->{set_nc}->($self);
3404     }
3405    
3406 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3407 wakaba 1.12 redo A;
3408 wakaba 1.1 } else {
3409    
3410 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3411     ## Stay in the state.
3412 wakaba 1.1
3413     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3414     $self->{line_prev} = $self->{line};
3415     $self->{column_prev} = $self->{column};
3416     $self->{column}++;
3417     $self->{nc}
3418     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3419     } else {
3420     $self->{set_nc}->($self);
3421     }
3422    
3423     redo A;
3424     }
3425     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3426 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3427     ## state", but implemented differently.
3428    
3429 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3430    
3431     ## Stay in the state
3432    
3433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3434     $self->{line_prev} = $self->{line};
3435     $self->{column_prev} = $self->{column};
3436     $self->{column}++;
3437     $self->{nc}
3438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3439     } else {
3440     $self->{set_nc}->($self);
3441     }
3442    
3443     redo A;
3444     } elsif ($self->{nc} == 0x003E) { # >
3445 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3446    
3447     $self->{state} = DATA_STATE;
3448     $self->{s_kwd} = '';
3449     } else {
3450    
3451     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3452     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3453     }
3454 wakaba 1.1
3455    
3456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3457     $self->{line_prev} = $self->{line};
3458     $self->{column_prev} = $self->{column};
3459     $self->{column}++;
3460     $self->{nc}
3461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3462     } else {
3463     $self->{set_nc}->($self);
3464     }
3465    
3466 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3467 wakaba 1.1 redo A;
3468     } elsif ($self->{nc} == -1) {
3469 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3470    
3471     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3472     $self->{state} = DATA_STATE;
3473     $self->{s_kwd} = '';
3474     $self->{ct}->{quirks} = 1;
3475     } else {
3476    
3477     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3479     }
3480 wakaba 1.1
3481 wakaba 1.16 ## Reconsume.
3482     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3483 wakaba 1.1 redo A;
3484     } elsif ($self->{nc} == 0x0050 or # P
3485     $self->{nc} == 0x0070) { # p
3486 wakaba 1.12
3487 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3488 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3489 wakaba 1.1
3490     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3491     $self->{line_prev} = $self->{line};
3492     $self->{column_prev} = $self->{column};
3493     $self->{column}++;
3494     $self->{nc}
3495     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3496     } else {
3497     $self->{set_nc}->($self);
3498     }
3499    
3500     redo A;
3501     } elsif ($self->{nc} == 0x0053 or # S
3502     $self->{nc} == 0x0073) { # s
3503 wakaba 1.12
3504 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3505 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3506    
3507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3508     $self->{line_prev} = $self->{line};
3509     $self->{column_prev} = $self->{column};
3510     $self->{column}++;
3511     $self->{nc}
3512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3513     } else {
3514     $self->{set_nc}->($self);
3515     }
3516    
3517     redo A;
3518 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3519     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3520     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3521    
3522     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3523     $self->{ct}->{value} = ''; # ENTITY
3524    
3525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3526     $self->{line_prev} = $self->{line};
3527     $self->{column_prev} = $self->{column};
3528     $self->{column}++;
3529     $self->{nc}
3530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3531     } else {
3532     $self->{set_nc}->($self);
3533     }
3534    
3535     redo A;
3536     } elsif ($self->{nc} == 0x0027 and # '
3537     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3538     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3539    
3540     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3541     $self->{ct}->{value} = ''; # ENTITY
3542    
3543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3544     $self->{line_prev} = $self->{line};
3545     $self->{column_prev} = $self->{column};
3546     $self->{column}++;
3547     $self->{nc}
3548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3549     } else {
3550     $self->{set_nc}->($self);
3551     }
3552    
3553     redo A;
3554 wakaba 1.16 } elsif ($self->{is_xml} and
3555     $self->{ct}->{type} == DOCTYPE_TOKEN and
3556     $self->{nc} == 0x005B) { # [
3557 wakaba 1.12
3558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3560 wakaba 1.13 $self->{in_subset} = 1;
3561 wakaba 1.1
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3573 wakaba 1.1 redo A;
3574     } else {
3575 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3576    
3577     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3578    
3579     $self->{ct}->{quirks} = 1;
3580     $self->{state} = BOGUS_DOCTYPE_STATE;
3581     } else {
3582    
3583     $self->{state} = BOGUS_MD_STATE;
3584     }
3585 wakaba 1.1
3586    
3587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3588     $self->{line_prev} = $self->{line};
3589     $self->{column_prev} = $self->{column};
3590     $self->{column}++;
3591     $self->{nc}
3592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3593     } else {
3594     $self->{set_nc}->($self);
3595     }
3596    
3597     redo A;
3598     }
3599     } elsif ($self->{state} == PUBLIC_STATE) {
3600     ## ASCII case-insensitive
3601     if ($self->{nc} == [
3602     undef,
3603     0x0055, # U
3604     0x0042, # B
3605     0x004C, # L
3606     0x0049, # I
3607 wakaba 1.12 ]->[length $self->{kwd}] or
3608 wakaba 1.1 $self->{nc} == [
3609     undef,
3610     0x0075, # u
3611     0x0062, # b
3612     0x006C, # l
3613     0x0069, # i
3614 wakaba 1.12 ]->[length $self->{kwd}]) {
3615 wakaba 1.1
3616     ## Stay in the state.
3617 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3618 wakaba 1.1
3619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3620     $self->{line_prev} = $self->{line};
3621     $self->{column_prev} = $self->{column};
3622     $self->{column}++;
3623     $self->{nc}
3624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3625     } else {
3626     $self->{set_nc}->($self);
3627     }
3628    
3629     redo A;
3630 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3631 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3632     $self->{nc} == 0x0063)) { # c
3633 wakaba 1.12 if ($self->{is_xml} and
3634     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3635    
3636     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3637     text => 'PUBLIC',
3638     line => $self->{line_prev},
3639     column => $self->{column_prev} - 4);
3640     } else {
3641    
3642     }
3643 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3644    
3645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3646     $self->{line_prev} = $self->{line};
3647     $self->{column_prev} = $self->{column};
3648     $self->{column}++;
3649     $self->{nc}
3650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3651     } else {
3652     $self->{set_nc}->($self);
3653     }
3654    
3655     redo A;
3656     } else {
3657 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3658 wakaba 1.1 line => $self->{line_prev},
3659 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3660 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3661    
3662     $self->{ct}->{quirks} = 1;
3663     $self->{state} = BOGUS_DOCTYPE_STATE;
3664     } else {
3665    
3666     $self->{state} = BOGUS_MD_STATE;
3667     }
3668 wakaba 1.1 ## Reconsume.
3669     redo A;
3670     }
3671     } elsif ($self->{state} == SYSTEM_STATE) {
3672     ## ASCII case-insensitive
3673     if ($self->{nc} == [
3674     undef,
3675     0x0059, # Y
3676     0x0053, # S
3677     0x0054, # T
3678     0x0045, # E
3679 wakaba 1.12 ]->[length $self->{kwd}] or
3680 wakaba 1.1 $self->{nc} == [
3681     undef,
3682     0x0079, # y
3683     0x0073, # s
3684     0x0074, # t
3685     0x0065, # e
3686 wakaba 1.12 ]->[length $self->{kwd}]) {
3687 wakaba 1.1
3688     ## Stay in the state.
3689 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3690 wakaba 1.1
3691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692     $self->{line_prev} = $self->{line};
3693     $self->{column_prev} = $self->{column};
3694     $self->{column}++;
3695     $self->{nc}
3696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3697     } else {
3698     $self->{set_nc}->($self);
3699     }
3700    
3701     redo A;
3702 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3703 wakaba 1.1 ($self->{nc} == 0x004D or # M
3704     $self->{nc} == 0x006D)) { # m
3705 wakaba 1.12 if ($self->{is_xml} and
3706     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3707    
3708     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3709     text => 'SYSTEM',
3710     line => $self->{line_prev},
3711     column => $self->{column_prev} - 4);
3712     } else {
3713    
3714     }
3715 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3716    
3717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3718     $self->{line_prev} = $self->{line};
3719     $self->{column_prev} = $self->{column};
3720     $self->{column}++;
3721     $self->{nc}
3722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3723     } else {
3724     $self->{set_nc}->($self);
3725     }
3726    
3727     redo A;
3728     } else {
3729 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3730 wakaba 1.1 line => $self->{line_prev},
3731 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3732 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3733    
3734     $self->{ct}->{quirks} = 1;
3735     $self->{state} = BOGUS_DOCTYPE_STATE;
3736     } else {
3737    
3738     $self->{state} = BOGUS_MD_STATE;
3739     }
3740 wakaba 1.1 ## Reconsume.
3741     redo A;
3742     }
3743     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3744     if ($is_space->{$self->{nc}}) {
3745    
3746     ## Stay in the state
3747    
3748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3749     $self->{line_prev} = $self->{line};
3750     $self->{column_prev} = $self->{column};
3751     $self->{column}++;
3752     $self->{nc}
3753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3754     } else {
3755     $self->{set_nc}->($self);
3756     }
3757    
3758     redo A;
3759     } elsif ($self->{nc} eq 0x0022) { # "
3760    
3761     $self->{ct}->{pubid} = ''; # DOCTYPE
3762     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3763    
3764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3765     $self->{line_prev} = $self->{line};
3766     $self->{column_prev} = $self->{column};
3767     $self->{column}++;
3768     $self->{nc}
3769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3770     } else {
3771     $self->{set_nc}->($self);
3772     }
3773    
3774     redo A;
3775     } elsif ($self->{nc} eq 0x0027) { # '
3776    
3777     $self->{ct}->{pubid} = ''; # DOCTYPE
3778     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3779    
3780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781     $self->{line_prev} = $self->{line};
3782     $self->{column_prev} = $self->{column};
3783     $self->{column}++;
3784     $self->{nc}
3785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786     } else {
3787     $self->{set_nc}->($self);
3788     }
3789    
3790     redo A;
3791     } elsif ($self->{nc} eq 0x003E) { # >
3792 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3793    
3794     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3795    
3796     $self->{state} = DATA_STATE;
3797     $self->{s_kwd} = '';
3798     $self->{ct}->{quirks} = 1;
3799     } else {
3800    
3801     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3802     }
3803 wakaba 1.1
3804    
3805     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3806     $self->{line_prev} = $self->{line};
3807     $self->{column_prev} = $self->{column};
3808     $self->{column}++;
3809     $self->{nc}
3810     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3811     } else {
3812     $self->{set_nc}->($self);
3813     }
3814    
3815 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3816 wakaba 1.1 redo A;
3817     } elsif ($self->{nc} == -1) {
3818 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3819    
3820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3821     $self->{state} = DATA_STATE;
3822     $self->{s_kwd} = '';
3823     $self->{ct}->{quirks} = 1;
3824     } else {
3825    
3826     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3827     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3828     }
3829 wakaba 1.1
3830     ## reconsume
3831     return ($self->{ct}); # DOCTYPE
3832     redo A;
3833 wakaba 1.16 } elsif ($self->{is_xml} and
3834     $self->{ct}->{type} == DOCTYPE_TOKEN and
3835     $self->{nc} == 0x005B) { # [
3836 wakaba 1.12
3837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3838     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3840 wakaba 1.13 $self->{in_subset} = 1;
3841 wakaba 1.12
3842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3843     $self->{line_prev} = $self->{line};
3844     $self->{column_prev} = $self->{column};
3845     $self->{column}++;
3846     $self->{nc}
3847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3848     } else {
3849     $self->{set_nc}->($self);
3850     }
3851    
3852 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3853 wakaba 1.12 redo A;
3854 wakaba 1.1 } else {
3855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3856    
3857 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3858    
3859     $self->{ct}->{quirks} = 1;
3860     $self->{state} = BOGUS_DOCTYPE_STATE;
3861     } else {
3862    
3863     $self->{state} = BOGUS_MD_STATE;
3864     }
3865    
3866 wakaba 1.1
3867     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3868     $self->{line_prev} = $self->{line};
3869     $self->{column_prev} = $self->{column};
3870     $self->{column}++;
3871     $self->{nc}
3872     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3873     } else {
3874     $self->{set_nc}->($self);
3875     }
3876    
3877     redo A;
3878     }
3879     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3880     if ($self->{nc} == 0x0022) { # "
3881    
3882     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3883    
3884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3885     $self->{line_prev} = $self->{line};
3886     $self->{column_prev} = $self->{column};
3887     $self->{column}++;
3888     $self->{nc}
3889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3890     } else {
3891     $self->{set_nc}->($self);
3892     }
3893    
3894     redo A;
3895     } elsif ($self->{nc} == 0x003E) { # >
3896     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3897    
3898 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3899    
3900     $self->{state} = DATA_STATE;
3901     $self->{s_kwd} = '';
3902     $self->{ct}->{quirks} = 1;
3903     } else {
3904    
3905     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3906     }
3907    
3908 wakaba 1.1
3909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910     $self->{line_prev} = $self->{line};
3911     $self->{column_prev} = $self->{column};
3912     $self->{column}++;
3913     $self->{nc}
3914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3915     } else {
3916     $self->{set_nc}->($self);
3917     }
3918    
3919 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3920 wakaba 1.1 redo A;
3921     } elsif ($self->{nc} == -1) {
3922     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3923    
3924 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3925    
3926     $self->{state} = DATA_STATE;
3927     $self->{s_kwd} = '';
3928     $self->{ct}->{quirks} = 1;
3929     } else {
3930    
3931     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3932     }
3933    
3934     ## Reconsume.
3935 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3936     redo A;
3937     } else {
3938    
3939 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3940 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3941     length $self->{ct}->{pubid});
3942    
3943     ## Stay in the state
3944    
3945     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3946     $self->{line_prev} = $self->{line};
3947     $self->{column_prev} = $self->{column};
3948     $self->{column}++;
3949     $self->{nc}
3950     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3951     } else {
3952     $self->{set_nc}->($self);
3953     }
3954    
3955     redo A;
3956     }
3957     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3958     if ($self->{nc} == 0x0027) { # '
3959    
3960     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3961    
3962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3963     $self->{line_prev} = $self->{line};
3964     $self->{column_prev} = $self->{column};
3965     $self->{column}++;
3966     $self->{nc}
3967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3968     } else {
3969     $self->{set_nc}->($self);
3970     }
3971    
3972     redo A;
3973     } elsif ($self->{nc} == 0x003E) { # >
3974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3975    
3976 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3977    
3978     $self->{state} = DATA_STATE;
3979     $self->{s_kwd} = '';
3980     $self->{ct}->{quirks} = 1;
3981     } else {
3982    
3983     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3984     }
3985    
3986 wakaba 1.1
3987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3988     $self->{line_prev} = $self->{line};
3989     $self->{column_prev} = $self->{column};
3990     $self->{column}++;
3991     $self->{nc}
3992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3993     } else {
3994     $self->{set_nc}->($self);
3995     }
3996    
3997 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3998 wakaba 1.1 redo A;
3999     } elsif ($self->{nc} == -1) {
4000     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
4001    
4002 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4003    
4004     $self->{state} = DATA_STATE;
4005     $self->{s_kwd} = '';
4006     $self->{ct}->{quirks} = 1;
4007     } else {
4008    
4009     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4010     }
4011    
4012 wakaba 1.1 ## reconsume
4013 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4014 wakaba 1.1 redo A;
4015     } else {
4016    
4017 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4018 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
4019     length $self->{ct}->{pubid});
4020    
4021     ## Stay in the state
4022    
4023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4024     $self->{line_prev} = $self->{line};
4025     $self->{column_prev} = $self->{column};
4026     $self->{column}++;
4027     $self->{nc}
4028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4029     } else {
4030     $self->{set_nc}->($self);
4031     }
4032    
4033     redo A;
4034     }
4035     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
4036     if ($is_space->{$self->{nc}}) {
4037    
4038     ## Stay in the state
4039    
4040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041     $self->{line_prev} = $self->{line};
4042     $self->{column_prev} = $self->{column};
4043     $self->{column}++;
4044     $self->{nc}
4045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4046     } else {
4047     $self->{set_nc}->($self);
4048     }
4049    
4050     redo A;
4051     } elsif ($self->{nc} == 0x0022) { # "
4052    
4053 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4054 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4055    
4056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4057     $self->{line_prev} = $self->{line};
4058     $self->{column_prev} = $self->{column};
4059     $self->{column}++;
4060     $self->{nc}
4061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4062     } else {
4063     $self->{set_nc}->($self);
4064     }
4065    
4066     redo A;
4067     } elsif ($self->{nc} == 0x0027) { # '
4068    
4069 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4070 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4071    
4072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4073     $self->{line_prev} = $self->{line};
4074     $self->{column_prev} = $self->{column};
4075     $self->{column}++;
4076     $self->{nc}
4077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4078     } else {
4079     $self->{set_nc}->($self);
4080     }
4081    
4082     redo A;
4083     } elsif ($self->{nc} == 0x003E) { # >
4084 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4085     if ($self->{is_xml}) {
4086    
4087     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4088     } else {
4089    
4090     }
4091     $self->{state} = DATA_STATE;
4092     $self->{s_kwd} = '';
4093 wakaba 1.12 } else {
4094 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
4095    
4096     } else {
4097    
4098     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4099     }
4100     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4101 wakaba 1.12 }
4102 wakaba 1.16
4103 wakaba 1.1
4104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4105     $self->{line_prev} = $self->{line};
4106     $self->{column_prev} = $self->{column};
4107     $self->{column}++;
4108     $self->{nc}
4109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4110     } else {
4111     $self->{set_nc}->($self);
4112     }
4113    
4114 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4115 wakaba 1.1 redo A;
4116     } elsif ($self->{nc} == -1) {
4117 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118    
4119     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4120    
4121     $self->{state} = DATA_STATE;
4122     $self->{s_kwd} = '';
4123     $self->{ct}->{quirks} = 1;
4124     } else {
4125     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4126     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4127     }
4128 wakaba 1.1
4129     ## reconsume
4130 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4131 wakaba 1.1 redo A;
4132 wakaba 1.16 } elsif ($self->{is_xml} and
4133     $self->{ct}->{type} == DOCTYPE_TOKEN and
4134     $self->{nc} == 0x005B) { # [
4135 wakaba 1.12
4136     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4137     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4138     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4139 wakaba 1.13 $self->{in_subset} = 1;
4140 wakaba 1.12
4141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4142     $self->{line_prev} = $self->{line};
4143     $self->{column_prev} = $self->{column};
4144     $self->{column}++;
4145     $self->{nc}
4146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4147     } else {
4148     $self->{set_nc}->($self);
4149     }
4150    
4151 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4152 wakaba 1.12 redo A;
4153 wakaba 1.1 } else {
4154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4155    
4156 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4157    
4158     $self->{ct}->{quirks} = 1;
4159     $self->{state} = BOGUS_DOCTYPE_STATE;
4160     } else {
4161    
4162     $self->{state} = BOGUS_MD_STATE;
4163     }
4164    
4165 wakaba 1.1
4166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4167     $self->{line_prev} = $self->{line};
4168     $self->{column_prev} = $self->{column};
4169     $self->{column}++;
4170     $self->{nc}
4171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4172     } else {
4173     $self->{set_nc}->($self);
4174     }
4175    
4176     redo A;
4177     }
4178     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4179     if ($is_space->{$self->{nc}}) {
4180    
4181     ## Stay in the state
4182    
4183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4184     $self->{line_prev} = $self->{line};
4185     $self->{column_prev} = $self->{column};
4186     $self->{column}++;
4187     $self->{nc}
4188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4189     } else {
4190     $self->{set_nc}->($self);
4191     }
4192    
4193     redo A;
4194     } elsif ($self->{nc} == 0x0022) { # "
4195    
4196     $self->{ct}->{sysid} = ''; # DOCTYPE
4197     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4198    
4199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4200     $self->{line_prev} = $self->{line};
4201     $self->{column_prev} = $self->{column};
4202     $self->{column}++;
4203     $self->{nc}
4204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4205     } else {
4206     $self->{set_nc}->($self);
4207     }
4208    
4209     redo A;
4210     } elsif ($self->{nc} == 0x0027) { # '
4211    
4212     $self->{ct}->{sysid} = ''; # DOCTYPE
4213     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4214    
4215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4216     $self->{line_prev} = $self->{line};
4217     $self->{column_prev} = $self->{column};
4218     $self->{column}++;
4219     $self->{nc}
4220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4221     } else {
4222     $self->{set_nc}->($self);
4223     }
4224    
4225     redo A;
4226     } elsif ($self->{nc} == 0x003E) { # >
4227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4228    
4229     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4230     $self->{line_prev} = $self->{line};
4231     $self->{column_prev} = $self->{column};
4232     $self->{column}++;
4233     $self->{nc}
4234     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4235     } else {
4236     $self->{set_nc}->($self);
4237     }
4238    
4239    
4240 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4241    
4242     $self->{state} = DATA_STATE;
4243     $self->{s_kwd} = '';
4244     $self->{ct}->{quirks} = 1;
4245     } else {
4246    
4247     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4248     }
4249 wakaba 1.1
4250 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4251 wakaba 1.1 redo A;
4252     } elsif ($self->{nc} == -1) {
4253 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4254    
4255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4256     $self->{state} = DATA_STATE;
4257     $self->{s_kwd} = '';
4258     $self->{ct}->{quirks} = 1;
4259     } else {
4260    
4261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4262     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263     }
4264 wakaba 1.1
4265     ## reconsume
4266 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 wakaba 1.1 redo A;
4268 wakaba 1.16 } elsif ($self->{is_xml} and
4269     $self->{ct}->{type} == DOCTYPE_TOKEN and
4270     $self->{nc} == 0x005B) { # [
4271 wakaba 1.12
4272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4273    
4274     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4276 wakaba 1.13 $self->{in_subset} = 1;
4277 wakaba 1.12
4278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4279     $self->{line_prev} = $self->{line};
4280     $self->{column_prev} = $self->{column};
4281     $self->{column}++;
4282     $self->{nc}
4283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4284     } else {
4285     $self->{set_nc}->($self);
4286     }
4287    
4288 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4289 wakaba 1.12 redo A;
4290 wakaba 1.1 } else {
4291     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4292    
4293 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4294    
4295     $self->{ct}->{quirks} = 1;
4296     $self->{state} = BOGUS_DOCTYPE_STATE;
4297     } else {
4298    
4299     $self->{state} = BOGUS_MD_STATE;
4300     }
4301    
4302 wakaba 1.1
4303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4304     $self->{line_prev} = $self->{line};
4305     $self->{column_prev} = $self->{column};
4306     $self->{column}++;
4307     $self->{nc}
4308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4309     } else {
4310     $self->{set_nc}->($self);
4311     }
4312    
4313     redo A;
4314     }
4315     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4316     if ($self->{nc} == 0x0022) { # "
4317    
4318     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4319    
4320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4321     $self->{line_prev} = $self->{line};
4322     $self->{column_prev} = $self->{column};
4323     $self->{column}++;
4324     $self->{nc}
4325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4326     } else {
4327     $self->{set_nc}->($self);
4328     }
4329    
4330     redo A;
4331 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4332 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4333    
4334 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4335    
4336     $self->{state} = DATA_STATE;
4337     $self->{s_kwd} = '';
4338     $self->{ct}->{quirks} = 1;
4339     } else {
4340    
4341     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4342     }
4343    
4344 wakaba 1.1
4345     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4346     $self->{line_prev} = $self->{line};
4347     $self->{column_prev} = $self->{column};
4348     $self->{column}++;
4349     $self->{nc}
4350     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4351     } else {
4352     $self->{set_nc}->($self);
4353     }
4354    
4355 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4356 wakaba 1.1 redo A;
4357     } elsif ($self->{nc} == -1) {
4358     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4359    
4360 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4361    
4362     $self->{state} = DATA_STATE;
4363     $self->{s_kwd} = '';
4364     $self->{ct}->{quirks} = 1;
4365     } else {
4366    
4367     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4368     }
4369    
4370 wakaba 1.1 ## reconsume
4371 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4372 wakaba 1.1 redo A;
4373     } else {
4374    
4375 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4376 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4377     length $self->{ct}->{sysid});
4378    
4379     ## Stay in the state
4380    
4381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4382     $self->{line_prev} = $self->{line};
4383     $self->{column_prev} = $self->{column};
4384     $self->{column}++;
4385     $self->{nc}
4386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4387     } else {
4388     $self->{set_nc}->($self);
4389     }
4390    
4391     redo A;
4392     }
4393     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4394     if ($self->{nc} == 0x0027) { # '
4395    
4396     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4397    
4398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4399     $self->{line_prev} = $self->{line};
4400     $self->{column_prev} = $self->{column};
4401     $self->{column}++;
4402     $self->{nc}
4403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4404     } else {
4405     $self->{set_nc}->($self);
4406     }
4407    
4408     redo A;
4409 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4410 wakaba 1.1
4411     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4412    
4413     $self->{state} = DATA_STATE;
4414 wakaba 1.5 $self->{s_kwd} = '';
4415 wakaba 1.1
4416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4417     $self->{line_prev} = $self->{line};
4418     $self->{column_prev} = $self->{column};
4419     $self->{column}++;
4420     $self->{nc}
4421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4422     } else {
4423     $self->{set_nc}->($self);
4424     }
4425    
4426    
4427     $self->{ct}->{quirks} = 1;
4428     return ($self->{ct}); # DOCTYPE
4429    
4430     redo A;
4431     } elsif ($self->{nc} == -1) {
4432     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4433    
4434 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4435    
4436     $self->{state} = DATA_STATE;
4437     $self->{s_kwd} = '';
4438     $self->{ct}->{quirks} = 1;
4439     } else {
4440    
4441     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4442     }
4443    
4444 wakaba 1.1 ## reconsume
4445 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4446 wakaba 1.1 redo A;
4447     } else {
4448    
4449 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4450 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4451     length $self->{ct}->{sysid});
4452    
4453     ## Stay in the state
4454    
4455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4456     $self->{line_prev} = $self->{line};
4457     $self->{column_prev} = $self->{column};
4458     $self->{column}++;
4459     $self->{nc}
4460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4461     } else {
4462     $self->{set_nc}->($self);
4463     }
4464    
4465     redo A;
4466     }
4467     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4468     if ($is_space->{$self->{nc}}) {
4469 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4470    
4471     $self->{state} = BEFORE_NDATA_STATE;
4472     } else {
4473    
4474     ## Stay in the state
4475     }
4476 wakaba 1.1
4477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4478     $self->{line_prev} = $self->{line};
4479     $self->{column_prev} = $self->{column};
4480     $self->{column}++;
4481     $self->{nc}
4482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4483     } else {
4484     $self->{set_nc}->($self);
4485     }
4486    
4487     redo A;
4488     } elsif ($self->{nc} == 0x003E) { # >
4489 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4490    
4491     $self->{state} = DATA_STATE;
4492     $self->{s_kwd} = '';
4493     } else {
4494    
4495     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4496     }
4497    
4498 wakaba 1.1
4499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4500     $self->{line_prev} = $self->{line};
4501     $self->{column_prev} = $self->{column};
4502     $self->{column}++;
4503     $self->{nc}
4504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4505     } else {
4506     $self->{set_nc}->($self);
4507     }
4508    
4509 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4510 wakaba 1.1 redo A;
4511 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4512     ($self->{nc} == 0x004E or # N
4513     $self->{nc} == 0x006E)) { # n
4514    
4515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4516     $self->{state} = NDATA_STATE;
4517     $self->{kwd} = chr $self->{nc};
4518    
4519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4520     $self->{line_prev} = $self->{line};
4521     $self->{column_prev} = $self->{column};
4522     $self->{column}++;
4523     $self->{nc}
4524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4525     } else {
4526     $self->{set_nc}->($self);
4527     }
4528    
4529     redo A;
4530 wakaba 1.1 } elsif ($self->{nc} == -1) {
4531 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4532    
4533     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4534     $self->{state} = DATA_STATE;
4535     $self->{s_kwd} = '';
4536     $self->{ct}->{quirks} = 1;
4537     } else {
4538    
4539     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4540     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4541     }
4542    
4543 wakaba 1.1 ## reconsume
4544 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4545 wakaba 1.1 redo A;
4546 wakaba 1.16 } elsif ($self->{is_xml} and
4547     $self->{ct}->{type} == DOCTYPE_TOKEN and
4548     $self->{nc} == 0x005B) { # [
4549 wakaba 1.12
4550     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4551     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4552 wakaba 1.13 $self->{in_subset} = 1;
4553 wakaba 1.12
4554     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4555     $self->{line_prev} = $self->{line};
4556     $self->{column_prev} = $self->{column};
4557     $self->{column}++;
4558     $self->{nc}
4559     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4560     } else {
4561     $self->{set_nc}->($self);
4562     }
4563    
4564 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4565 wakaba 1.12 redo A;
4566 wakaba 1.1 } else {
4567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4568    
4569 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4570    
4571     #$self->{ct}->{quirks} = 1;
4572     $self->{state} = BOGUS_DOCTYPE_STATE;
4573     } else {
4574    
4575     $self->{state} = BOGUS_MD_STATE;
4576     }
4577    
4578 wakaba 1.1
4579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4580     $self->{line_prev} = $self->{line};
4581     $self->{column_prev} = $self->{column};
4582     $self->{column}++;
4583     $self->{nc}
4584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4585     } else {
4586     $self->{set_nc}->($self);
4587     }
4588    
4589     redo A;
4590     }
4591 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4592     if ($is_space->{$self->{nc}}) {
4593    
4594     ## Stay in the state.
4595    
4596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597     $self->{line_prev} = $self->{line};
4598     $self->{column_prev} = $self->{column};
4599     $self->{column}++;
4600     $self->{nc}
4601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602     } else {
4603     $self->{set_nc}->($self);
4604     }
4605    
4606     redo A;
4607     } elsif ($self->{nc} == 0x003E) { # >
4608    
4609     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4610    
4611     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4612     $self->{line_prev} = $self->{line};
4613     $self->{column_prev} = $self->{column};
4614     $self->{column}++;
4615     $self->{nc}
4616     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4617     } else {
4618     $self->{set_nc}->($self);
4619     }
4620    
4621     return ($self->{ct}); # ENTITY
4622     redo A;
4623     } elsif ($self->{nc} == 0x004E or # N
4624     $self->{nc} == 0x006E) { # n
4625    
4626     $self->{state} = NDATA_STATE;
4627     $self->{kwd} = chr $self->{nc};
4628    
4629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4630     $self->{line_prev} = $self->{line};
4631     $self->{column_prev} = $self->{column};
4632     $self->{column}++;
4633     $self->{nc}
4634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4635     } else {
4636     $self->{set_nc}->($self);
4637     }
4638    
4639     redo A;
4640     } elsif ($self->{nc} == -1) {
4641    
4642     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4643     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4644     ## reconsume
4645     return ($self->{ct}); # ENTITY
4646     redo A;
4647     } else {
4648    
4649     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4650     $self->{state} = BOGUS_MD_STATE;
4651    
4652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4653     $self->{line_prev} = $self->{line};
4654     $self->{column_prev} = $self->{column};
4655     $self->{column}++;
4656     $self->{nc}
4657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4658     } else {
4659     $self->{set_nc}->($self);
4660     }
4661    
4662     redo A;
4663     }
4664 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4665     if ($self->{nc} == 0x003E) { # >
4666    
4667     $self->{state} = DATA_STATE;
4668 wakaba 1.5 $self->{s_kwd} = '';
4669 wakaba 1.1
4670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4671     $self->{line_prev} = $self->{line};
4672     $self->{column_prev} = $self->{column};
4673     $self->{column}++;
4674     $self->{nc}
4675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4676     } else {
4677     $self->{set_nc}->($self);
4678     }
4679    
4680    
4681     return ($self->{ct}); # DOCTYPE
4682    
4683     redo A;
4684 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4685 wakaba 1.13
4686     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4687     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4688     $self->{in_subset} = 1;
4689    
4690 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4691     $self->{line_prev} = $self->{line};
4692     $self->{column_prev} = $self->{column};
4693     $self->{column}++;
4694     $self->{nc}
4695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4696     } else {
4697     $self->{set_nc}->($self);
4698     }
4699    
4700 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4701     redo A;
4702 wakaba 1.1 } elsif ($self->{nc} == -1) {
4703    
4704     $self->{state} = DATA_STATE;
4705 wakaba 1.5 $self->{s_kwd} = '';
4706 wakaba 1.1 ## reconsume
4707    
4708     return ($self->{ct}); # DOCTYPE
4709    
4710     redo A;
4711     } else {
4712    
4713     my $s = '';
4714 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4715 wakaba 1.1
4716     ## Stay in the state
4717    
4718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4719     $self->{line_prev} = $self->{line};
4720     $self->{column_prev} = $self->{column};
4721     $self->{column}++;
4722     $self->{nc}
4723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4724     } else {
4725     $self->{set_nc}->($self);
4726     }
4727    
4728     redo A;
4729     }
4730     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4731     ## NOTE: "CDATA section state" in the spec is jointly implemented
4732     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4733     ## and |CDATA_SECTION_MSE2_STATE|.
4734 wakaba 1.10
4735     ## XML5: "CDATA state".
4736 wakaba 1.1
4737     if ($self->{nc} == 0x005D) { # ]
4738    
4739     $self->{state} = CDATA_SECTION_MSE1_STATE;
4740    
4741     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4742     $self->{line_prev} = $self->{line};
4743     $self->{column_prev} = $self->{column};
4744     $self->{column}++;
4745     $self->{nc}
4746     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4747     } else {
4748     $self->{set_nc}->($self);
4749     }
4750    
4751     redo A;
4752     } elsif ($self->{nc} == -1) {
4753 wakaba 1.6 if ($self->{is_xml}) {
4754 wakaba 1.8
4755 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4756 wakaba 1.8 } else {
4757    
4758 wakaba 1.6 }
4759    
4760 wakaba 1.1 $self->{state} = DATA_STATE;
4761 wakaba 1.5 $self->{s_kwd} = '';
4762 wakaba 1.10 ## Reconsume.
4763 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4764    
4765     return ($self->{ct}); # character
4766     } else {
4767    
4768     ## No token to emit. $self->{ct} is discarded.
4769     }
4770     redo A;
4771     } else {
4772    
4773     $self->{ct}->{data} .= chr $self->{nc};
4774     $self->{read_until}->($self->{ct}->{data},
4775     q<]>,
4776     length $self->{ct}->{data});
4777    
4778     ## Stay in the state.
4779    
4780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4781     $self->{line_prev} = $self->{line};
4782     $self->{column_prev} = $self->{column};
4783     $self->{column}++;
4784     $self->{nc}
4785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4786     } else {
4787     $self->{set_nc}->($self);
4788     }
4789    
4790     redo A;
4791     }
4792    
4793     ## ISSUE: "text tokens" in spec.
4794     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4795 wakaba 1.10 ## XML5: "CDATA bracket state".
4796    
4797 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4798    
4799     $self->{state} = CDATA_SECTION_MSE2_STATE;
4800    
4801     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4802     $self->{line_prev} = $self->{line};
4803     $self->{column_prev} = $self->{column};
4804     $self->{column}++;
4805     $self->{nc}
4806     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4807     } else {
4808     $self->{set_nc}->($self);
4809     }
4810    
4811     redo A;
4812     } else {
4813    
4814 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4815 wakaba 1.1 $self->{ct}->{data} .= ']';
4816 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4817 wakaba 1.1 ## Reconsume.
4818     redo A;
4819     }
4820     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4821 wakaba 1.10 ## XML5: "CDATA end state".
4822    
4823 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4824     $self->{state} = DATA_STATE;
4825 wakaba 1.5 $self->{s_kwd} = '';
4826 wakaba 1.1
4827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4828     $self->{line_prev} = $self->{line};
4829     $self->{column_prev} = $self->{column};
4830     $self->{column}++;
4831     $self->{nc}
4832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4833     } else {
4834     $self->{set_nc}->($self);
4835     }
4836    
4837     if (length $self->{ct}->{data}) { # character
4838    
4839     return ($self->{ct}); # character
4840     } else {
4841    
4842     ## No token to emit. $self->{ct} is discarded.
4843     }
4844     redo A;
4845     } elsif ($self->{nc} == 0x005D) { # ]
4846     # character
4847     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4848     ## Stay in the state.
4849    
4850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4851     $self->{line_prev} = $self->{line};
4852     $self->{column_prev} = $self->{column};
4853     $self->{column}++;
4854     $self->{nc}
4855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4856     } else {
4857     $self->{set_nc}->($self);
4858     }
4859    
4860     redo A;
4861     } else {
4862    
4863     $self->{ct}->{data} .= ']]'; # character
4864     $self->{state} = CDATA_SECTION_STATE;
4865 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4866 wakaba 1.1 redo A;
4867     }
4868     } elsif ($self->{state} == ENTITY_STATE) {
4869     if ($is_space->{$self->{nc}} or
4870     {
4871     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872     $self->{entity_add} => 1,
4873     }->{$self->{nc}}) {
4874 wakaba 1.22 if ($self->{is_xml}) {
4875    
4876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4877     line => $self->{line_prev},
4878     column => $self->{column_prev}
4879     + ($self->{nc} == -1 ? 1 : 0));
4880     } else {
4881    
4882     ## No error
4883     }
4884 wakaba 1.1 ## Don't consume
4885     ## Return nothing.
4886     #
4887     } elsif ($self->{nc} == 0x0023) { # #
4888    
4889     $self->{state} = ENTITY_HASH_STATE;
4890 wakaba 1.12 $self->{kwd} = '#';
4891 wakaba 1.1
4892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4893     $self->{line_prev} = $self->{line};
4894     $self->{column_prev} = $self->{column};
4895     $self->{column}++;
4896     $self->{nc}
4897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4898     } else {
4899     $self->{set_nc}->($self);
4900     }
4901    
4902     redo A;
4903 wakaba 1.22 } elsif ($self->{is_xml} or
4904     (0x0041 <= $self->{nc} and
4905 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4906     (0x0061 <= $self->{nc} and
4907     $self->{nc} <= 0x007A)) { # a..z
4908    
4909     require Whatpm::_NamedEntityList;
4910     $self->{state} = ENTITY_NAME_STATE;
4911 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4912     $self->{entity__value} = $self->{kwd};
4913 wakaba 1.1 $self->{entity__match} = 0;
4914    
4915     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4916     $self->{line_prev} = $self->{line};
4917     $self->{column_prev} = $self->{column};
4918     $self->{column}++;
4919     $self->{nc}
4920     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4921     } else {
4922     $self->{set_nc}->($self);
4923     }
4924    
4925     redo A;
4926     } else {
4927    
4928     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4929     ## Return nothing.
4930     #
4931     }
4932    
4933     ## NOTE: No character is consumed by the "consume a character
4934     ## reference" algorithm. In other words, there is an "&" character
4935     ## that does not introduce a character reference, which would be
4936     ## appended to the parent element or the attribute value in later
4937     ## process of the tokenizer.
4938    
4939     if ($self->{prev_state} == DATA_STATE) {
4940    
4941     $self->{state} = $self->{prev_state};
4942 wakaba 1.5 $self->{s_kwd} = '';
4943 wakaba 1.1 ## Reconsume.
4944     return ({type => CHARACTER_TOKEN, data => '&',
4945     line => $self->{line_prev},
4946     column => $self->{column_prev},
4947     });
4948     redo A;
4949     } else {
4950    
4951     $self->{ca}->{value} .= '&';
4952     $self->{state} = $self->{prev_state};
4953 wakaba 1.5 $self->{s_kwd} = '';
4954 wakaba 1.1 ## Reconsume.
4955     redo A;
4956     }
4957     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4958 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4959 wakaba 1.1
4960     $self->{state} = HEXREF_X_STATE;
4961 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4962 wakaba 1.1
4963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964     $self->{line_prev} = $self->{line};
4965     $self->{column_prev} = $self->{column};
4966     $self->{column}++;
4967     $self->{nc}
4968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969     } else {
4970     $self->{set_nc}->($self);
4971     }
4972    
4973     redo A;
4974 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4975    
4976     if ($self->{is_xml}) {
4977     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4978     }
4979     $self->{state} = HEXREF_X_STATE;
4980     $self->{kwd} .= chr $self->{nc};
4981    
4982     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4983     $self->{line_prev} = $self->{line};
4984     $self->{column_prev} = $self->{column};
4985     $self->{column}++;
4986     $self->{nc}
4987     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4988     } else {
4989     $self->{set_nc}->($self);
4990     }
4991    
4992     redo A;
4993 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4994     $self->{nc} <= 0x0039) { # 0..9
4995    
4996     $self->{state} = NCR_NUM_STATE;
4997 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4998 wakaba 1.1
4999     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5000     $self->{line_prev} = $self->{line};
5001     $self->{column_prev} = $self->{column};
5002     $self->{column}++;
5003     $self->{nc}
5004     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5005     } else {
5006     $self->{set_nc}->($self);
5007     }
5008    
5009     redo A;
5010     } else {
5011     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
5012     line => $self->{line_prev},
5013     column => $self->{column_prev} - 1);
5014    
5015     ## NOTE: According to the spec algorithm, nothing is returned,
5016     ## and then "&#" is appended to the parent element or the attribute
5017     ## value in the later processing.
5018    
5019     if ($self->{prev_state} == DATA_STATE) {
5020    
5021     $self->{state} = $self->{prev_state};
5022 wakaba 1.5 $self->{s_kwd} = '';
5023 wakaba 1.1 ## Reconsume.
5024     return ({type => CHARACTER_TOKEN,
5025     data => '&#',
5026     line => $self->{line_prev},
5027     column => $self->{column_prev} - 1,
5028     });
5029     redo A;
5030     } else {
5031    
5032     $self->{ca}->{value} .= '&#';
5033     $self->{state} = $self->{prev_state};
5034 wakaba 1.5 $self->{s_kwd} = '';
5035 wakaba 1.1 ## Reconsume.
5036     redo A;
5037     }
5038     }
5039     } elsif ($self->{state} == NCR_NUM_STATE) {
5040     if (0x0030 <= $self->{nc} and
5041     $self->{nc} <= 0x0039) { # 0..9
5042    
5043 wakaba 1.12 $self->{kwd} *= 10;
5044     $self->{kwd} += $self->{nc} - 0x0030;
5045 wakaba 1.1
5046     ## Stay in the state.
5047    
5048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5049     $self->{line_prev} = $self->{line};
5050     $self->{column_prev} = $self->{column};
5051     $self->{column}++;
5052     $self->{nc}
5053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5054     } else {
5055     $self->{set_nc}->($self);
5056     }
5057    
5058     redo A;
5059     } elsif ($self->{nc} == 0x003B) { # ;
5060    
5061    
5062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5063     $self->{line_prev} = $self->{line};
5064     $self->{column_prev} = $self->{column};
5065     $self->{column}++;
5066     $self->{nc}
5067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5068     } else {
5069     $self->{set_nc}->($self);
5070     }
5071    
5072     #
5073     } else {
5074    
5075     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5076     ## Reconsume.
5077     #
5078     }
5079    
5080 wakaba 1.12 my $code = $self->{kwd};
5081 wakaba 1.1 my $l = $self->{line_prev};
5082     my $c = $self->{column_prev};
5083 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5084     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5085     ($self->{is_xml} and $code == 0x0000)) {
5086 wakaba 1.1
5087     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5088     text => (sprintf 'U+%04X', $code),
5089     line => $l, column => $c);
5090     $code = $charref_map->{$code};
5091     } elsif ($code > 0x10FFFF) {
5092    
5093     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5094     text => (sprintf 'U-%08X', $code),
5095     line => $l, column => $c);
5096     $code = 0xFFFD;
5097     }
5098    
5099     if ($self->{prev_state} == DATA_STATE) {
5100    
5101     $self->{state} = $self->{prev_state};
5102 wakaba 1.5 $self->{s_kwd} = '';
5103 wakaba 1.1 ## Reconsume.
5104     return ({type => CHARACTER_TOKEN, data => chr $code,
5105 wakaba 1.7 has_reference => 1,
5106 wakaba 1.1 line => $l, column => $c,
5107     });
5108     redo A;
5109     } else {
5110    
5111     $self->{ca}->{value} .= chr $code;
5112     $self->{ca}->{has_reference} = 1;
5113     $self->{state} = $self->{prev_state};
5114 wakaba 1.5 $self->{s_kwd} = '';
5115 wakaba 1.1 ## Reconsume.
5116     redo A;
5117     }
5118     } elsif ($self->{state} == HEXREF_X_STATE) {
5119     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
5120     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
5121     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
5122     # 0..9, A..F, a..f
5123    
5124     $self->{state} = HEXREF_HEX_STATE;
5125 wakaba 1.12 $self->{kwd} = 0;
5126 wakaba 1.1 ## Reconsume.
5127     redo A;
5128     } else {
5129     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
5130     line => $self->{line_prev},
5131     column => $self->{column_prev} - 2);
5132    
5133     ## NOTE: According to the spec algorithm, nothing is returned,
5134     ## and then "&#" followed by "X" or "x" is appended to the parent
5135     ## element or the attribute value in the later processing.
5136    
5137     if ($self->{prev_state} == DATA_STATE) {
5138    
5139     $self->{state} = $self->{prev_state};
5140 wakaba 1.5 $self->{s_kwd} = '';
5141 wakaba 1.1 ## Reconsume.
5142     return ({type => CHARACTER_TOKEN,
5143 wakaba 1.12 data => '&' . $self->{kwd},
5144 wakaba 1.1 line => $self->{line_prev},
5145 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
5146 wakaba 1.1 });
5147     redo A;
5148     } else {
5149    
5150 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
5151 wakaba 1.1 $self->{state} = $self->{prev_state};
5152 wakaba 1.5 $self->{s_kwd} = '';
5153 wakaba 1.1 ## Reconsume.
5154     redo A;
5155     }
5156     }
5157     } elsif ($self->{state} == HEXREF_HEX_STATE) {
5158     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5159     # 0..9
5160    
5161 wakaba 1.12 $self->{kwd} *= 0x10;
5162     $self->{kwd} += $self->{nc} - 0x0030;
5163 wakaba 1.1 ## Stay in the state.
5164    
5165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5166     $self->{line_prev} = $self->{line};
5167     $self->{column_prev} = $self->{column};
5168     $self->{column}++;
5169     $self->{nc}
5170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5171     } else {
5172     $self->{set_nc}->($self);
5173     }
5174    
5175     redo A;
5176     } elsif (0x0061 <= $self->{nc} and
5177     $self->{nc} <= 0x0066) { # a..f
5178    
5179 wakaba 1.12 $self->{kwd} *= 0x10;
5180     $self->{kwd} += $self->{nc} - 0x0060 + 9;
5181 wakaba 1.1 ## Stay in the state.
5182    
5183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5184     $self->{line_prev} = $self->{line};
5185     $self->{column_prev} = $self->{column};
5186     $self->{column}++;
5187     $self->{nc}
5188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5189     } else {
5190     $self->{set_nc}->($self);
5191     }
5192    
5193     redo A;
5194     } elsif (0x0041 <= $self->{nc} and
5195     $self->{nc} <= 0x0046) { # A..F
5196    
5197 wakaba 1.12 $self->{kwd} *= 0x10;
5198     $self->{kwd} += $self->{nc} - 0x0040 + 9;
5199 wakaba 1.1 ## Stay in the state.
5200    
5201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5202     $self->{line_prev} = $self->{line};
5203     $self->{column_prev} = $self->{column};
5204     $self->{column}++;
5205     $self->{nc}
5206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5207     } else {
5208     $self->{set_nc}->($self);
5209     }
5210    
5211     redo A;
5212     } elsif ($self->{nc} == 0x003B) { # ;
5213    
5214    
5215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5216     $self->{line_prev} = $self->{line};
5217     $self->{column_prev} = $self->{column};
5218     $self->{column}++;
5219     $self->{nc}
5220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5221     } else {
5222     $self->{set_nc}->($self);
5223     }
5224    
5225     #
5226     } else {
5227    
5228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5229     line => $self->{line},
5230     column => $self->{column});
5231     ## Reconsume.
5232     #
5233     }
5234    
5235 wakaba 1.12 my $code = $self->{kwd};
5236 wakaba 1.1 my $l = $self->{line_prev};
5237     my $c = $self->{column_prev};
5238 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5239     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5240     ($self->{is_xml} and $code == 0x0000)) {
5241 wakaba 1.1
5242     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5243     text => (sprintf 'U+%04X', $code),
5244     line => $l, column => $c);
5245     $code = $charref_map->{$code};
5246     } elsif ($code > 0x10FFFF) {
5247    
5248     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5249     text => (sprintf 'U-%08X', $code),
5250     line => $l, column => $c);
5251     $code = 0xFFFD;
5252     }
5253    
5254     if ($self->{prev_state} == DATA_STATE) {
5255    
5256     $self->{state} = $self->{prev_state};
5257 wakaba 1.5 $self->{s_kwd} = '';
5258 wakaba 1.1 ## Reconsume.
5259     return ({type => CHARACTER_TOKEN, data => chr $code,
5260 wakaba 1.7 has_reference => 1,
5261 wakaba 1.1 line => $l, column => $c,
5262     });
5263     redo A;
5264     } else {
5265    
5266     $self->{ca}->{value} .= chr $code;
5267     $self->{ca}->{has_reference} = 1;
5268     $self->{state} = $self->{prev_state};
5269 wakaba 1.5 $self->{s_kwd} = '';
5270 wakaba 1.1 ## Reconsume.
5271     redo A;
5272     }
5273     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5274 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5275     $self->{nc} <= 0x005A) or # Z
5276     (0x0061 <= $self->{nc} and # a
5277     $self->{nc} <= 0x007A) or # z
5278     (0x0030 <= $self->{nc} and # 0
5279     $self->{nc} <= 0x0039) or # 9
5280 wakaba 1.22 $self->{nc} == 0x003B or # ;
5281     ($self->{is_xml} and
5282     not ($is_space->{$self->{nc}} or
5283     {
5284     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5285     $self->{entity_add} => 1,
5286     }->{$self->{nc}}))) {
5287 wakaba 1.1 our $EntityChar;
5288 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5289 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5290     $self->{ge}->{$self->{kwd}}) {
5291 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5292 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5293     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5294    
5295     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5296     } else {
5297     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5298    
5299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5300     value => $self->{kwd});
5301     } else {
5302    
5303     }
5304     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5305     }
5306     } else {
5307     if ($self->{is_xml}) {
5308    
5309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5310     value => $self->{kwd},
5311     level => {
5312     'amp;' => $self->{level}->{warn},
5313     'quot;' => $self->{level}->{warn},
5314     'lt;' => $self->{level}->{warn},
5315     'gt;' => $self->{level}->{warn},
5316     'apos;' => $self->{level}->{warn},
5317     }->{$self->{kwd}} ||
5318     $self->{level}->{must});
5319     } else {
5320    
5321     }
5322     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5323     }
5324 wakaba 1.1 $self->{entity__match} = 1;
5325    
5326     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5327     $self->{line_prev} = $self->{line};
5328     $self->{column_prev} = $self->{column};
5329     $self->{column}++;
5330     $self->{nc}
5331     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5332     } else {
5333     $self->{set_nc}->($self);
5334     }
5335    
5336     #
5337     } else {
5338    
5339 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5340 wakaba 1.1 $self->{entity__match} = -1;
5341     ## Stay in the state.
5342    
5343     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5344     $self->{line_prev} = $self->{line};
5345     $self->{column_prev} = $self->{column};
5346     $self->{column}++;
5347     $self->{nc}
5348     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5349     } else {
5350     $self->{set_nc}->($self);
5351     }
5352    
5353     redo A;
5354     }
5355     } else {
5356    
5357     $self->{entity__value} .= chr $self->{nc};
5358     $self->{entity__match} *= 2;
5359     ## Stay in the state.
5360    
5361     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5362     $self->{line_prev} = $self->{line};
5363     $self->{column_prev} = $self->{column};
5364     $self->{column}++;
5365     $self->{nc}
5366     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5367     } else {
5368     $self->{set_nc}->($self);
5369     }
5370    
5371     redo A;
5372     }
5373     }
5374    
5375     my $data;
5376     my $has_ref;
5377     if ($self->{entity__match} > 0) {
5378    
5379     $data = $self->{entity__value};
5380     $has_ref = 1;
5381     #
5382     } elsif ($self->{entity__match} < 0) {
5383     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5384     if ($self->{prev_state} != DATA_STATE and # in attribute
5385     $self->{entity__match} < -1) {
5386    
5387 wakaba 1.12 $data = '&' . $self->{kwd};
5388 wakaba 1.1 #
5389     } else {
5390    
5391     $data = $self->{entity__value};
5392     $has_ref = 1;
5393     #
5394     }
5395     } else {
5396    
5397     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5398     line => $self->{line_prev},
5399 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5400     $data = '&' . $self->{kwd};
5401 wakaba 1.1 #
5402     }
5403    
5404     ## NOTE: In these cases, when a character reference is found,
5405     ## it is consumed and a character token is returned, or, otherwise,
5406     ## nothing is consumed and returned, according to the spec algorithm.
5407     ## In this implementation, anything that has been examined by the
5408     ## tokenizer is appended to the parent element or the attribute value
5409     ## as string, either literal string when no character reference or
5410     ## entity-replaced string otherwise, in this stage, since any characters
5411     ## that would not be consumed are appended in the data state or in an
5412     ## appropriate attribute value state anyway.
5413    
5414     if ($self->{prev_state} == DATA_STATE) {
5415    
5416     $self->{state} = $self->{prev_state};
5417 wakaba 1.5 $self->{s_kwd} = '';
5418 wakaba 1.1 ## Reconsume.
5419     return ({type => CHARACTER_TOKEN,
5420     data => $data,
5421 wakaba 1.7 has_reference => $has_ref,
5422 wakaba 1.1 line => $self->{line_prev},
5423 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5424 wakaba 1.1 });
5425     redo A;
5426     } else {
5427    
5428     $self->{ca}->{value} .= $data;
5429     $self->{ca}->{has_reference} = 1 if $has_ref;
5430     $self->{state} = $self->{prev_state};
5431 wakaba 1.5 $self->{s_kwd} = '';
5432 wakaba 1.1 ## Reconsume.
5433     redo A;
5434     }
5435 wakaba 1.8
5436     ## XML-only states
5437    
5438     } elsif ($self->{state} == PI_STATE) {
5439 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5440    
5441 wakaba 1.8 if ($is_space->{$self->{nc}} or
5442 wakaba 1.14 $self->{nc} == 0x003F or # ?
5443 wakaba 1.8 $self->{nc} == -1) {
5444 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5445     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5446     ## "DOCTYPE pi state": Parse error, switch to the "data
5447     ## state".
5448 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5449     line => $self->{line_prev},
5450     column => $self->{column_prev}
5451     - 1 * ($self->{nc} != -1));
5452     $self->{state} = BOGUS_COMMENT_STATE;
5453     ## Reconsume.
5454     $self->{ct} = {type => COMMENT_TOKEN,
5455     data => '?',
5456     line => $self->{line_prev},
5457     column => $self->{column_prev}
5458     - 1 * ($self->{nc} != -1),
5459     };
5460     redo A;
5461     } else {
5462 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5463 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5464     target => chr $self->{nc},
5465     data => '',
5466     line => $self->{line_prev},
5467     column => $self->{column_prev} - 1,
5468     };
5469     $self->{state} = PI_TARGET_STATE;
5470    
5471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5472     $self->{line_prev} = $self->{line};
5473     $self->{column_prev} = $self->{column};
5474     $self->{column}++;
5475     $self->{nc}
5476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5477     } else {
5478     $self->{set_nc}->($self);
5479     }
5480    
5481     redo A;
5482     }
5483     } elsif ($self->{state} == PI_TARGET_STATE) {
5484     if ($is_space->{$self->{nc}}) {
5485     $self->{state} = PI_TARGET_AFTER_STATE;
5486    
5487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5488     $self->{line_prev} = $self->{line};
5489     $self->{column_prev} = $self->{column};
5490     $self->{column}++;
5491     $self->{nc}
5492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5493     } else {
5494     $self->{set_nc}->($self);
5495     }
5496    
5497     redo A;
5498     } elsif ($self->{nc} == -1) {
5499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5500 wakaba 1.13 if ($self->{in_subset}) {
5501     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5502     } else {
5503     $self->{state} = DATA_STATE;
5504     $self->{s_kwd} = '';
5505     }
5506 wakaba 1.8 ## Reconsume.
5507     return ($self->{ct}); # pi
5508     redo A;
5509     } elsif ($self->{nc} == 0x003F) { # ?
5510     $self->{state} = PI_AFTER_STATE;
5511    
5512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5513     $self->{line_prev} = $self->{line};
5514     $self->{column_prev} = $self->{column};
5515     $self->{column}++;
5516     $self->{nc}
5517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5518     } else {
5519     $self->{set_nc}->($self);
5520     }
5521    
5522     redo A;
5523     } else {
5524     ## XML5: typo ("tag name" -> "target")
5525     $self->{ct}->{target} .= chr $self->{nc}; # pi
5526    
5527     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5528     $self->{line_prev} = $self->{line};
5529     $self->{column_prev} = $self->{column};
5530     $self->{column}++;
5531     $self->{nc}
5532     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5533     } else {
5534     $self->{set_nc}->($self);
5535     }
5536    
5537     redo A;
5538     }
5539     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5540     if ($is_space->{$self->{nc}}) {
5541     ## Stay in the state.
5542    
5543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5544     $self->{line_prev} = $self->{line};
5545     $self->{column_prev} = $self->{column};
5546     $self->{column}++;
5547     $self->{nc}
5548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5549     } else {
5550     $self->{set_nc}->($self);
5551     }
5552    
5553     redo A;
5554     } else {
5555     $self->{state} = PI_DATA_STATE;
5556     ## Reprocess.
5557     redo A;
5558     }
5559     } elsif ($self->{state} == PI_DATA_STATE) {
5560     if ($self->{nc} == 0x003F) { # ?
5561     $self->{state} = PI_DATA_AFTER_STATE;
5562    
5563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5564     $self->{line_prev} = $self->{line};
5565     $self->{column_prev} = $self->{column};
5566     $self->{column}++;
5567     $self->{nc}
5568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5569     } else {
5570     $self->{set_nc}->($self);
5571     }
5572    
5573     redo A;
5574     } elsif ($self->{nc} == -1) {
5575     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5576 wakaba 1.13 if ($self->{in_subset}) {
5577 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5578 wakaba 1.13 } else {
5579     $self->{state} = DATA_STATE;
5580     $self->{s_kwd} = '';
5581     }
5582 wakaba 1.8 ## Reprocess.
5583     return ($self->{ct}); # pi
5584     redo A;
5585     } else {
5586     $self->{ct}->{data} .= chr $self->{nc}; # pi
5587     $self->{read_until}->($self->{ct}->{data}, q[?],
5588     length $self->{ct}->{data});
5589     ## Stay in the state.
5590    
5591     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5592     $self->{line_prev} = $self->{line};
5593     $self->{column_prev} = $self->{column};
5594     $self->{column}++;
5595     $self->{nc}
5596     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5597     } else {
5598     $self->{set_nc}->($self);
5599     }
5600    
5601     ## Reprocess.
5602     redo A;
5603     }
5604     } elsif ($self->{state} == PI_AFTER_STATE) {
5605 wakaba 1.14 ## XML5: Part of "Pi after state".
5606    
5607 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5608 wakaba 1.13 if ($self->{in_subset}) {
5609     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5610     } else {
5611     $self->{state} = DATA_STATE;
5612     $self->{s_kwd} = '';
5613     }
5614 wakaba 1.8
5615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5616     $self->{line_prev} = $self->{line};
5617     $self->{column_prev} = $self->{column};
5618     $self->{column}++;
5619     $self->{nc}
5620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5621     } else {
5622     $self->{set_nc}->($self);
5623     }
5624    
5625     return ($self->{ct}); # pi
5626     redo A;
5627     } elsif ($self->{nc} == 0x003F) { # ?
5628     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5629     line => $self->{line_prev},
5630     column => $self->{column_prev}); ## XML5: no error
5631     $self->{ct}->{data} .= '?';
5632     $self->{state} = PI_DATA_AFTER_STATE;
5633    
5634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5635     $self->{line_prev} = $self->{line};
5636     $self->{column_prev} = $self->{column};
5637     $self->{column}++;
5638     $self->{nc}
5639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5640     } else {
5641     $self->{set_nc}->($self);
5642     }
5643    
5644     redo A;
5645     } else {
5646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5647     line => $self->{line_prev},
5648     column => $self->{column_prev}
5649     + 1 * ($self->{nc} == -1)); ## XML5: no error
5650     $self->{ct}->{data} .= '?'; ## XML5: not appended
5651     $self->{state} = PI_DATA_STATE;
5652     ## Reprocess.
5653     redo A;
5654     }
5655     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5656 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5657    
5658 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5659 wakaba 1.13 if ($self->{in_subset}) {
5660     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5661     } else {
5662     $self->{state} = DATA_STATE;
5663     $self->{s_kwd} = '';
5664     }
5665 wakaba 1.8
5666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667     $self->{line_prev} = $self->{line};
5668     $self->{column_prev} = $self->{column};
5669     $self->{column}++;
5670     $self->{nc}
5671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672     } else {
5673     $self->{set_nc}->($self);
5674     }
5675    
5676     return ($self->{ct}); # pi
5677     redo A;
5678     } elsif ($self->{nc} == 0x003F) { # ?
5679     $self->{ct}->{data} .= '?';
5680     ## Stay in the state.
5681    
5682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5683     $self->{line_prev} = $self->{line};
5684     $self->{column_prev} = $self->{column};
5685     $self->{column}++;
5686     $self->{nc}
5687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5688     } else {
5689     $self->{set_nc}->($self);
5690     }
5691    
5692     redo A;
5693     } else {
5694     $self->{ct}->{data} .= '?'; ## XML5: not appended
5695     $self->{state} = PI_DATA_STATE;
5696     ## Reprocess.
5697     redo A;
5698     }
5699 wakaba 1.12
5700     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5701     if ($self->{nc} == 0x003C) { # <
5702 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5703 wakaba 1.12
5704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5705     $self->{line_prev} = $self->{line};
5706     $self->{column_prev} = $self->{column};
5707     $self->{column}++;
5708     $self->{nc}
5709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5710     } else {
5711     $self->{set_nc}->($self);
5712     }
5713    
5714     redo A;
5715     } elsif ($self->{nc} == 0x0025) { # %
5716     ## XML5: Not defined yet.
5717    
5718     ## TODO:
5719 wakaba 1.24
5720     if (not $self->{stop_processing} and
5721     not $self->{document}->xml_standalone) {
5722     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5723     level => $self->{level}->{info});
5724     $self->{stop_processing} = 1;
5725     }
5726    
5727 wakaba 1.12
5728     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5729     $self->{line_prev} = $self->{line};
5730     $self->{column_prev} = $self->{column};
5731     $self->{column}++;
5732     $self->{nc}
5733     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5734     } else {
5735     $self->{set_nc}->($self);
5736     }
5737    
5738     redo A;
5739     } elsif ($self->{nc} == 0x005D) { # ]
5740 wakaba 1.13 delete $self->{in_subset};
5741 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5742    
5743     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5744     $self->{line_prev} = $self->{line};
5745     $self->{column_prev} = $self->{column};
5746     $self->{column}++;
5747     $self->{nc}
5748     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5749     } else {
5750     $self->{set_nc}->($self);
5751     }
5752    
5753     redo A;
5754     } elsif ($is_space->{$self->{nc}}) {
5755     ## Stay in the state.
5756    
5757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5758     $self->{line_prev} = $self->{line};
5759     $self->{column_prev} = $self->{column};
5760     $self->{column}++;
5761     $self->{nc}
5762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5763     } else {
5764     $self->{set_nc}->($self);
5765     }
5766    
5767     redo A;
5768     } elsif ($self->{nc} == -1) {
5769     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5770 wakaba 1.13 delete $self->{in_subset};
5771 wakaba 1.12 $self->{state} = DATA_STATE;
5772     $self->{s_kwd} = '';
5773     ## Reconsume.
5774 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5775 wakaba 1.12 redo A;
5776     } else {
5777     unless ($self->{internal_subset_tainted}) {
5778     ## XML5: No parse error.
5779     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5780     $self->{internal_subset_tainted} = 1;
5781     }
5782     ## Stay in the state.
5783    
5784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5785     $self->{line_prev} = $self->{line};
5786     $self->{column_prev} = $self->{column};
5787     $self->{column}++;
5788     $self->{nc}
5789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5790     } else {
5791     $self->{set_nc}->($self);
5792     }
5793    
5794     redo A;
5795     }
5796     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5797     if ($self->{nc} == 0x003E) { # >
5798     $self->{state} = DATA_STATE;
5799     $self->{s_kwd} = '';
5800    
5801     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5802     $self->{line_prev} = $self->{line};
5803     $self->{column_prev} = $self->{column};
5804     $self->{column}++;
5805     $self->{nc}
5806     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5807     } else {
5808     $self->{set_nc}->($self);
5809     }
5810    
5811 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5812 wakaba 1.12 redo A;
5813     } elsif ($self->{nc} == -1) {
5814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5815     $self->{state} = DATA_STATE;
5816     $self->{s_kwd} = '';
5817     ## Reconsume.
5818 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5819 wakaba 1.12 redo A;
5820     } else {
5821     ## XML5: No parse error and stay in the state.
5822     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5823    
5824 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5825    
5826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5827     $self->{line_prev} = $self->{line};
5828     $self->{column_prev} = $self->{column};
5829     $self->{column}++;
5830     $self->{nc}
5831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5832     } else {
5833     $self->{set_nc}->($self);
5834     }
5835    
5836     redo A;
5837     }
5838     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5839     if ($self->{nc} == 0x003E) { # >
5840     $self->{state} = DATA_STATE;
5841     $self->{s_kwd} = '';
5842    
5843     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5844     $self->{line_prev} = $self->{line};
5845     $self->{column_prev} = $self->{column};
5846     $self->{column}++;
5847     $self->{nc}
5848     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5849     } else {
5850     $self->{set_nc}->($self);
5851     }
5852    
5853     return ({type => END_OF_DOCTYPE_TOKEN});
5854     redo A;
5855     } elsif ($self->{nc} == -1) {
5856     $self->{state} = DATA_STATE;
5857     $self->{s_kwd} = '';
5858     ## Reconsume.
5859     return ({type => END_OF_DOCTYPE_TOKEN});
5860     redo A;
5861     } else {
5862     ## Stay in the state.
5863    
5864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5865     $self->{line_prev} = $self->{line};
5866     $self->{column_prev} = $self->{column};
5867     $self->{column}++;
5868     $self->{nc}
5869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5870     } else {
5871     $self->{set_nc}->($self);
5872     }
5873    
5874     redo A;
5875     }
5876     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5877     if ($self->{nc} == 0x0021) { # !
5878 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5879 wakaba 1.13
5880     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5881     $self->{line_prev} = $self->{line};
5882     $self->{column_prev} = $self->{column};
5883     $self->{column}++;
5884     $self->{nc}
5885     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5886     } else {
5887     $self->{set_nc}->($self);
5888     }
5889    
5890     redo A;
5891     } elsif ($self->{nc} == 0x003F) { # ?
5892     $self->{state} = PI_STATE;
5893    
5894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5895     $self->{line_prev} = $self->{line};
5896     $self->{column_prev} = $self->{column};
5897     $self->{column}++;
5898     $self->{nc}
5899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5900     } else {
5901     $self->{set_nc}->($self);
5902     }
5903    
5904     redo A;
5905     } elsif ($self->{nc} == -1) {
5906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5907     $self->{state} = DATA_STATE;
5908     $self->{s_kwd} = '';
5909     ## Reconsume.
5910     redo A;
5911     } else {
5912     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5913     line => $self->{line_prev},
5914     column => $self->{column_prev});
5915     $self->{state} = BOGUS_COMMENT_STATE;
5916     $self->{ct} = {type => COMMENT_TOKEN,
5917     data => '',
5918     }; ## NOTE: Will be discarded.
5919 wakaba 1.12
5920     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5921     $self->{line_prev} = $self->{line};
5922     $self->{column_prev} = $self->{column};
5923     $self->{column}++;
5924     $self->{nc}
5925     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5926     } else {
5927     $self->{set_nc}->($self);
5928     }
5929    
5930     redo A;
5931     }
5932 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5933     ## XML5: "DOCTYPE markup declaration state".
5934    
5935     if ($self->{nc} == 0x002D) { # -
5936     $self->{state} = MD_HYPHEN_STATE;
5937    
5938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5939     $self->{line_prev} = $self->{line};
5940     $self->{column_prev} = $self->{column};
5941     $self->{column}++;
5942     $self->{nc}
5943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5944     } else {
5945     $self->{set_nc}->($self);
5946     }
5947    
5948     redo A;
5949 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5950     $self->{nc} == 0x0065) { # e
5951 wakaba 1.14 $self->{state} = MD_E_STATE;
5952     $self->{kwd} = chr $self->{nc};
5953    
5954     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5955     $self->{line_prev} = $self->{line};
5956     $self->{column_prev} = $self->{column};
5957     $self->{column}++;
5958     $self->{nc}
5959     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5960     } else {
5961     $self->{set_nc}->($self);
5962     }
5963    
5964     redo A;
5965 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5966     $self->{nc} == 0x0061) { # a
5967 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5968     $self->{kwd} = chr $self->{nc};
5969    
5970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5971     $self->{line_prev} = $self->{line};
5972     $self->{column_prev} = $self->{column};
5973     $self->{column}++;
5974     $self->{nc}
5975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5976     } else {
5977     $self->{set_nc}->($self);
5978     }
5979    
5980     redo A;
5981 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5982     $self->{nc} == 0x006E) { # n
5983 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5984     $self->{kwd} = chr $self->{nc};
5985    
5986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5987     $self->{line_prev} = $self->{line};
5988     $self->{column_prev} = $self->{column};
5989     $self->{column}++;
5990     $self->{nc}
5991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5992     } else {
5993     $self->{set_nc}->($self);
5994     }
5995    
5996     redo A;
5997     } else {
5998     #
5999     }
6000    
6001     ## XML5: No parse error.
6002     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6003     line => $self->{line_prev},
6004     column => $self->{column_prev} - 1);
6005     ## Reconsume.
6006     $self->{state} = BOGUS_COMMENT_STATE;
6007     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
6008     redo A;
6009     } elsif ($self->{state} == MD_E_STATE) {
6010 wakaba 1.17 if ($self->{nc} == 0x004E or # N
6011     $self->{nc} == 0x006E) { # n
6012 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
6013     $self->{kwd} .= chr $self->{nc};
6014    
6015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6016     $self->{line_prev} = $self->{line};
6017     $self->{column_prev} = $self->{column};
6018     $self->{column}++;
6019     $self->{nc}
6020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6021     } else {
6022     $self->{set_nc}->($self);
6023     }
6024    
6025     redo A;
6026 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
6027     $self->{nc} == 0x006C) { # l
6028 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
6029     $self->{state} = MD_ELEMENT_STATE;
6030     $self->{kwd} .= chr $self->{nc};
6031    
6032     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6033     $self->{line_prev} = $self->{line};
6034     $self->{column_prev} = $self->{column};
6035     $self->{column}++;
6036     $self->{nc}
6037     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6038     } else {
6039     $self->{set_nc}->($self);
6040     }
6041    
6042     redo A;
6043     } else {
6044     ## XML5: No parse error.
6045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6046     line => $self->{line_prev},
6047     column => $self->{column_prev} - 2
6048     + 1 * ($self->{nc} == -1));
6049     ## Reconsume.
6050     $self->{state} = BOGUS_COMMENT_STATE;
6051     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6052     redo A;
6053     }
6054     } elsif ($self->{state} == MD_ENTITY_STATE) {
6055 wakaba 1.17 if ($self->{nc} == [
6056     undef,
6057     undef,
6058     0x0054, # T
6059     0x0049, # I
6060     0x0054, # T
6061     ]->[length $self->{kwd}] or
6062     $self->{nc} == [
6063     undef,
6064     undef,
6065     0x0074, # t
6066     0x0069, # i
6067     0x0074, # t
6068     ]->[length $self->{kwd}]) {
6069 wakaba 1.14 ## Stay in the state.
6070     $self->{kwd} .= chr $self->{nc};
6071    
6072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6073     $self->{line_prev} = $self->{line};
6074     $self->{column_prev} = $self->{column};
6075     $self->{column}++;
6076     $self->{nc}
6077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6078     } else {
6079     $self->{set_nc}->($self);
6080     }
6081    
6082     redo A;
6083 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
6084     ($self->{nc} == 0x0059 or # Y
6085     $self->{nc} == 0x0079)) { # y
6086     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6087     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6088     text => 'ENTITY',
6089     line => $self->{line_prev},
6090     column => $self->{column_prev} - 4);
6091     }
6092     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6093 wakaba 1.14 line => $self->{line_prev},
6094     column => $self->{column_prev} - 6};
6095     $self->{state} = DOCTYPE_MD_STATE;
6096    
6097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6098     $self->{line_prev} = $self->{line};
6099     $self->{column_prev} = $self->{column};
6100     $self->{column}++;
6101     $self->{nc}
6102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6103     } else {
6104     $self->{set_nc}->($self);
6105     }
6106    
6107     redo A;
6108     } else {
6109     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6110     line => $self->{line_prev},
6111     column => $self->{column_prev} - 1
6112     - (length $self->{kwd})
6113     + 1 * ($self->{nc} == -1));
6114     $self->{state} = BOGUS_COMMENT_STATE;
6115     ## Reconsume.
6116     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6117     redo A;
6118     }
6119     } elsif ($self->{state} == MD_ELEMENT_STATE) {
6120 wakaba 1.17 if ($self->{nc} == [
6121     undef,
6122     undef,
6123     0x0045, # E
6124     0x004D, # M
6125     0x0045, # E
6126     0x004E, # N
6127     ]->[length $self->{kwd}] or
6128     $self->{nc} == [
6129     undef,
6130     undef,
6131     0x0065, # e
6132     0x006D, # m
6133     0x0065, # e
6134     0x006E, # n
6135     ]->[length $self->{kwd}]) {
6136 wakaba 1.14 ## Stay in the state.
6137     $self->{kwd} .= chr $self->{nc};
6138    
6139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140     $self->{line_prev} = $self->{line};
6141     $self->{column_prev} = $self->{column};
6142     $self->{column}++;
6143     $self->{nc}
6144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145     } else {
6146     $self->{set_nc}->($self);
6147     }
6148    
6149     redo A;
6150 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6151     ($self->{nc} == 0x0054 or # T
6152     $self->{nc} == 0x0074)) { # t
6153     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6155     text => 'ELEMENT',
6156     line => $self->{line_prev},
6157     column => $self->{column_prev} - 5);
6158     }
6159 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6160     line => $self->{line_prev},
6161 wakaba 1.23 column => $self->{column_prev} - 7};
6162 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6163    
6164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6165     $self->{line_prev} = $self->{line};
6166     $self->{column_prev} = $self->{column};
6167     $self->{column}++;
6168     $self->{nc}
6169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6170     } else {
6171     $self->{set_nc}->($self);
6172     }
6173    
6174     redo A;
6175     } else {
6176     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6177     line => $self->{line_prev},
6178     column => $self->{column_prev} - 1
6179     - (length $self->{kwd})
6180     + 1 * ($self->{nc} == -1));
6181     $self->{state} = BOGUS_COMMENT_STATE;
6182     ## Reconsume.
6183     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6184     redo A;
6185     }
6186     } elsif ($self->{state} == MD_ATTLIST_STATE) {
6187 wakaba 1.17 if ($self->{nc} == [
6188     undef,
6189     0x0054, # T
6190     0x0054, # T
6191     0x004C, # L
6192     0x0049, # I
6193     0x0053, # S
6194     ]->[length $self->{kwd}] or
6195     $self->{nc} == [
6196     undef,
6197     0x0074, # t
6198     0x0074, # t
6199     0x006C, # l
6200     0x0069, # i
6201     0x0073, # s
6202     ]->[length $self->{kwd}]) {
6203 wakaba 1.14 ## Stay in the state.
6204     $self->{kwd} .= chr $self->{nc};
6205    
6206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6207     $self->{line_prev} = $self->{line};
6208     $self->{column_prev} = $self->{column};
6209     $self->{column}++;
6210     $self->{nc}
6211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6212     } else {
6213     $self->{set_nc}->($self);
6214     }
6215    
6216     redo A;
6217 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6218     ($self->{nc} == 0x0054 or # T
6219     $self->{nc} == 0x0074)) { # t
6220     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6221     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6222     text => 'ATTLIST',
6223     line => $self->{line_prev},
6224     column => $self->{column_prev} - 5);
6225     }
6226 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6227 wakaba 1.15 attrdefs => [],
6228 wakaba 1.14 line => $self->{line_prev},
6229 wakaba 1.23 column => $self->{column_prev} - 7};
6230 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6231    
6232     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233     $self->{line_prev} = $self->{line};
6234     $self->{column_prev} = $self->{column};
6235     $self->{column}++;
6236     $self->{nc}
6237     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238     } else {
6239     $self->{set_nc}->($self);
6240     }
6241    
6242     redo A;
6243     } else {
6244     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6245     line => $self->{line_prev},
6246     column => $self->{column_prev} - 1
6247     - (length $self->{kwd})
6248     + 1 * ($self->{nc} == -1));
6249     $self->{state} = BOGUS_COMMENT_STATE;
6250     ## Reconsume.
6251     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6252     redo A;
6253     }
6254     } elsif ($self->{state} == MD_NOTATION_STATE) {
6255 wakaba 1.17 if ($self->{nc} == [
6256     undef,
6257     0x004F, # O
6258     0x0054, # T
6259     0x0041, # A
6260     0x0054, # T
6261     0x0049, # I
6262     0x004F, # O
6263     ]->[length $self->{kwd}] or
6264     $self->{nc} == [
6265     undef,
6266     0x006F, # o
6267     0x0074, # t
6268     0x0061, # a
6269     0x0074, # t
6270     0x0069, # i
6271     0x006F, # o
6272     ]->[length $self->{kwd}]) {
6273 wakaba 1.14 ## Stay in the state.
6274     $self->{kwd} .= chr $self->{nc};
6275    
6276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6277     $self->{line_prev} = $self->{line};
6278     $self->{column_prev} = $self->{column};
6279     $self->{column}++;
6280     $self->{nc}
6281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6282     } else {
6283     $self->{set_nc}->($self);
6284     }
6285    
6286     redo A;
6287 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6288     ($self->{nc} == 0x004E or # N
6289     $self->{nc} == 0x006E)) { # n
6290     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6291     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6292     text => 'NOTATION',
6293     line => $self->{line_prev},
6294     column => $self->{column_prev} - 6);
6295     }
6296 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6297     line => $self->{line_prev},
6298 wakaba 1.23 column => $self->{column_prev} - 8};
6299 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6300    
6301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6302     $self->{line_prev} = $self->{line};
6303     $self->{column_prev} = $self->{column};
6304     $self->{column}++;
6305     $self->{nc}
6306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6307     } else {
6308     $self->{set_nc}->($self);
6309     }
6310    
6311     redo A;
6312     } else {
6313     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6314     line => $self->{line_prev},
6315     column => $self->{column_prev} - 1
6316     - (length $self->{kwd})
6317     + 1 * ($self->{nc} == -1));
6318     $self->{state} = BOGUS_COMMENT_STATE;
6319     ## Reconsume.
6320     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6321     redo A;
6322     }
6323     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6324     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6325     ## "DOCTYPE NOTATION state".
6326    
6327     if ($is_space->{$self->{nc}}) {
6328     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6329     $self->{state} = BEFORE_MD_NAME_STATE;
6330    
6331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6332     $self->{line_prev} = $self->{line};
6333     $self->{column_prev} = $self->{column};
6334     $self->{column}++;
6335     $self->{nc}
6336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6337     } else {
6338     $self->{set_nc}->($self);
6339     }
6340    
6341     redo A;
6342     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6343     $self->{nc} == 0x0025) { # %
6344     ## XML5: Switch to the "DOCTYPE bogus comment state".
6345     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6346     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6347    
6348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6349     $self->{line_prev} = $self->{line};
6350     $self->{column_prev} = $self->{column};
6351     $self->{column}++;
6352     $self->{nc}
6353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6354     } else {
6355     $self->{set_nc}->($self);
6356     }
6357    
6358     redo A;
6359     } elsif ($self->{nc} == -1) {
6360     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6361     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6362     ## Reconsume.
6363     redo A;
6364     } elsif ($self->{nc} == 0x003E) { # >
6365     ## XML5: Switch to the "DOCTYPE bogus comment state".
6366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6367     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6368    
6369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6370     $self->{line_prev} = $self->{line};
6371     $self->{column_prev} = $self->{column};
6372     $self->{column}++;
6373     $self->{nc}
6374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6375     } else {
6376     $self->{set_nc}->($self);
6377     }
6378    
6379     redo A;
6380     } else {
6381     ## XML5: Switch to the "DOCTYPE bogus comment state".
6382     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6383     $self->{state} = BEFORE_MD_NAME_STATE;
6384     redo A;
6385     }
6386     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6387     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6388     ## before state", "DOCTYPE ATTLIST name before state".
6389    
6390     if ($is_space->{$self->{nc}}) {
6391     ## Stay in the state.
6392    
6393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6394     $self->{line_prev} = $self->{line};
6395     $self->{column_prev} = $self->{column};
6396     $self->{column}++;
6397     $self->{nc}
6398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6399     } else {
6400     $self->{set_nc}->($self);
6401     }
6402    
6403     redo A;
6404     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6405     $self->{nc} == 0x0025) { # %
6406     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6407    
6408     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6409     $self->{line_prev} = $self->{line};
6410     $self->{column_prev} = $self->{column};
6411     $self->{column}++;
6412     $self->{nc}
6413     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6414     } else {
6415     $self->{set_nc}->($self);
6416     }
6417    
6418     redo A;
6419     } elsif ($self->{nc} == 0x003E) { # >
6420     ## XML5: Same as "Anything else".
6421     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6422     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6423    
6424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6425     $self->{line_prev} = $self->{line};
6426     $self->{column_prev} = $self->{column};
6427     $self->{column}++;
6428     $self->{nc}
6429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6430     } else {
6431     $self->{set_nc}->($self);
6432     }
6433    
6434     redo A;
6435     } elsif ($self->{nc} == -1) {
6436     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6437     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6438     ## Reconsume.
6439     redo A;
6440     } else {
6441     ## XML5: [ATTLIST] Not defined yet.
6442     $self->{ct}->{name} .= chr $self->{nc};
6443     $self->{state} = MD_NAME_STATE;
6444    
6445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6446     $self->{line_prev} = $self->{line};
6447     $self->{column_prev} = $self->{column};
6448     $self->{column}++;
6449     $self->{nc}
6450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6451     } else {
6452     $self->{set_nc}->($self);
6453     }
6454    
6455     redo A;
6456     }
6457     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6458     if ($is_space->{$self->{nc}}) {
6459     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6460     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6461     $self->{state} = BEFORE_MD_NAME_STATE;
6462 wakaba 1.8
6463 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6464     $self->{line_prev} = $self->{line};
6465     $self->{column_prev} = $self->{column};
6466     $self->{column}++;
6467     $self->{nc}
6468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6469     } else {
6470     $self->{set_nc}->($self);
6471     }
6472    
6473     redo A;
6474     } elsif ($self->{nc} == 0x003E) { # >
6475     ## XML5: Same as "Anything else".
6476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6477     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6478    
6479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6480     $self->{line_prev} = $self->{line};
6481     $self->{column_prev} = $self->{column};
6482     $self->{column}++;
6483     $self->{nc}
6484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6485     } else {
6486     $self->{set_nc}->($self);
6487     }
6488    
6489     redo A;
6490     } elsif ($self->{nc} == -1) {
6491     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6492     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6493     ## Reconsume.
6494     redo A;
6495     } else {
6496     ## XML5: No parse error.
6497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6498     $self->{state} = BOGUS_COMMENT_STATE;
6499     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6500     ## Reconsume.
6501     redo A;
6502     }
6503     } elsif ($self->{state} == MD_NAME_STATE) {
6504     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6505    
6506     if ($is_space->{$self->{nc}}) {
6507 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6508     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6509     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6510 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6511 wakaba 1.16 } else { # ENTITY/NOTATION
6512     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6513     }
6514 wakaba 1.14
6515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6516     $self->{line_prev} = $self->{line};
6517     $self->{column_prev} = $self->{column};
6518     $self->{column}++;
6519     $self->{nc}
6520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6521     } else {
6522     $self->{set_nc}->($self);
6523     }
6524    
6525     redo A;
6526     } elsif ($self->{nc} == 0x003E) { # >
6527     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6528     #
6529     } else {
6530 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6531 wakaba 1.14 }
6532     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6533    
6534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6535     $self->{line_prev} = $self->{line};
6536     $self->{column_prev} = $self->{column};
6537     $self->{column}++;
6538     $self->{nc}
6539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6540     } else {
6541     $self->{set_nc}->($self);
6542     }
6543    
6544     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6545     redo A;
6546     } elsif ($self->{nc} == -1) {
6547     ## XML5: [ATTLIST] No parse error.
6548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6550     ## Reconsume.
6551     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6552     redo A;
6553     } else {
6554     ## XML5: [ATTLIST] Not defined yet.
6555     $self->{ct}->{name} .= chr $self->{nc};
6556     ## Stay in the state.
6557    
6558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6559     $self->{line_prev} = $self->{line};
6560     $self->{column_prev} = $self->{column};
6561     $self->{column}++;
6562     $self->{nc}
6563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6564     } else {
6565     $self->{set_nc}->($self);
6566     }
6567    
6568     redo A;
6569     }
6570     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6571     if ($is_space->{$self->{nc}}) {
6572     ## Stay in the state.
6573    
6574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6575     $self->{line_prev} = $self->{line};
6576     $self->{column_prev} = $self->{column};
6577     $self->{column}++;
6578     $self->{nc}
6579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6580     } else {
6581     $self->{set_nc}->($self);
6582     }
6583    
6584     redo A;
6585     } elsif ($self->{nc} == 0x003E) { # >
6586     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6587    
6588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6589     $self->{line_prev} = $self->{line};
6590     $self->{column_prev} = $self->{column};
6591     $self->{column}++;
6592     $self->{nc}
6593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6594     } else {
6595     $self->{set_nc}->($self);
6596     }
6597    
6598     return ($self->{ct}); # ATTLIST
6599     redo A;
6600     } elsif ($self->{nc} == -1) {
6601     ## XML5: No parse error.
6602     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6603     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6604 wakaba 1.15 return ($self->{ct});
6605 wakaba 1.14 redo A;
6606     } else {
6607     ## XML5: Not defined yet.
6608 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6609     tokens => [],
6610     line => $self->{line}, column => $self->{column}};
6611     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6612    
6613     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6614     $self->{line_prev} = $self->{line};
6615     $self->{column_prev} = $self->{column};
6616     $self->{column}++;
6617     $self->{nc}
6618     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6619     } else {
6620     $self->{set_nc}->($self);
6621     }
6622    
6623     redo A;
6624     }
6625     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6626     if ($is_space->{$self->{nc}}) {
6627     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6628    
6629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6630     $self->{line_prev} = $self->{line};
6631     $self->{column_prev} = $self->{column};
6632     $self->{column}++;
6633     $self->{nc}
6634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6635     } else {
6636     $self->{set_nc}->($self);
6637     }
6638    
6639     redo A;
6640     } elsif ($self->{nc} == 0x003E) { # >
6641     ## XML5: Same as "anything else".
6642     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6643     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6644    
6645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6646     $self->{line_prev} = $self->{line};
6647     $self->{column_prev} = $self->{column};
6648     $self->{column}++;
6649     $self->{nc}
6650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6651     } else {
6652     $self->{set_nc}->($self);
6653     }
6654    
6655     return ($self->{ct}); # ATTLIST
6656     redo A;
6657     } elsif ($self->{nc} == 0x0028) { # (
6658     ## XML5: Same as "anything else".
6659     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6660     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6661    
6662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6663     $self->{line_prev} = $self->{line};
6664     $self->{column_prev} = $self->{column};
6665     $self->{column}++;
6666     $self->{nc}
6667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6668     } else {
6669     $self->{set_nc}->($self);
6670     }
6671    
6672     redo A;
6673     } elsif ($self->{nc} == -1) {
6674     ## XML5: No parse error.
6675     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6676     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6677    
6678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6679     $self->{line_prev} = $self->{line};
6680     $self->{column_prev} = $self->{column};
6681     $self->{column}++;
6682     $self->{nc}
6683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6684     } else {
6685     $self->{set_nc}->($self);
6686     }
6687    
6688     return ($self->{ct}); # ATTLIST
6689     redo A;
6690     } else {
6691     ## XML5: Not defined yet.
6692     $self->{ca}->{name} .= chr $self->{nc};
6693     ## Stay in the state.
6694    
6695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6696     $self->{line_prev} = $self->{line};
6697     $self->{column_prev} = $self->{column};
6698     $self->{column}++;
6699     $self->{nc}
6700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6701     } else {
6702     $self->{set_nc}->($self);
6703     }
6704    
6705 wakaba 1.14 redo A;
6706     }
6707 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6708     if ($is_space->{$self->{nc}}) {
6709     ## Stay in the state.
6710    
6711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712     $self->{line_prev} = $self->{line};
6713     $self->{column_prev} = $self->{column};
6714     $self->{column}++;
6715     $self->{nc}
6716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717     } else {
6718     $self->{set_nc}->($self);
6719     }
6720    
6721     redo A;
6722     } elsif ($self->{nc} == 0x003E) { # >
6723     ## XML5: Same as "anything else".
6724     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6725     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6726    
6727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6728     $self->{line_prev} = $self->{line};
6729     $self->{column_prev} = $self->{column};
6730     $self->{column}++;
6731     $self->{nc}
6732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6733     } else {
6734     $self->{set_nc}->($self);
6735     }
6736    
6737     return ($self->{ct}); # ATTLIST
6738     redo A;
6739     } elsif ($self->{nc} == 0x0028) { # (
6740     ## XML5: Same as "anything else".
6741     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6742    
6743     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6744     $self->{line_prev} = $self->{line};
6745     $self->{column_prev} = $self->{column};
6746     $self->{column}++;
6747     $self->{nc}
6748     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6749     } else {
6750     $self->{set_nc}->($self);
6751     }
6752    
6753     redo A;
6754     } elsif ($self->{nc} == -1) {
6755     ## XML5: No parse error.
6756     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6757     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6758    
6759     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760     $self->{line_prev} = $self->{line};
6761     $self->{column_prev} = $self->{column};
6762     $self->{column}++;
6763     $self->{nc}
6764     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765     } else {
6766     $self->{set_nc}->($self);
6767     }
6768    
6769     return ($self->{ct});
6770     redo A;
6771     } else {
6772     ## XML5: Not defined yet.
6773     $self->{ca}->{type} = chr $self->{nc};
6774     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6775    
6776     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6777     $self->{line_prev} = $self->{line};
6778     $self->{column_prev} = $self->{column};
6779     $self->{column}++;
6780     $self->{nc}
6781     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6782     } else {
6783     $self->{set_nc}->($self);
6784     }
6785    
6786     redo A;
6787     }
6788     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6789     if ($is_space->{$self->{nc}}) {
6790     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6791    
6792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6793     $self->{line_prev} = $self->{line};
6794     $self->{column_prev} = $self->{column};
6795     $self->{column}++;
6796     $self->{nc}
6797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6798     } else {
6799     $self->{set_nc}->($self);
6800     }
6801    
6802     redo A;
6803     } elsif ($self->{nc} == 0x0023) { # #
6804     ## XML5: Same as "anything else".
6805     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6806     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6807    
6808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809     $self->{line_prev} = $self->{line};
6810     $self->{column_prev} = $self->{column};
6811     $self->{column}++;
6812     $self->{nc}
6813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814     } else {
6815     $self->{set_nc}->($self);
6816     }
6817    
6818     redo A;
6819     } elsif ($self->{nc} == 0x0022) { # "
6820     ## XML5: Same as "anything else".
6821     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6822     $self->{ca}->{value} = '';
6823     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6824    
6825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826     $self->{line_prev} = $self->{line};
6827     $self->{column_prev} = $self->{column};
6828     $self->{column}++;
6829     $self->{nc}
6830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831     } else {
6832     $self->{set_nc}->($self);
6833     }
6834    
6835     redo A;
6836     } elsif ($self->{nc} == 0x0027) { # '
6837     ## XML5: Same as "anything else".
6838     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6839     $self->{ca}->{value} = '';
6840     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6841    
6842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6843     $self->{line_prev} = $self->{line};
6844     $self->{column_prev} = $self->{column};
6845     $self->{column}++;
6846     $self->{nc}
6847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6848     } else {
6849     $self->{set_nc}->($self);
6850     }
6851    
6852     redo A;
6853     } elsif ($self->{nc} == 0x003E) { # >
6854     ## XML5: Same as "anything else".
6855     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6856     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6857    
6858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6859     $self->{line_prev} = $self->{line};
6860     $self->{column_prev} = $self->{column};
6861     $self->{column}++;
6862     $self->{nc}
6863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6864     } else {
6865     $self->{set_nc}->($self);
6866     }
6867    
6868     return ($self->{ct}); # ATTLIST
6869     redo A;
6870     } elsif ($self->{nc} == 0x0028) { # (
6871     ## XML5: Same as "anything else".
6872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6873     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6874    
6875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876     $self->{line_prev} = $self->{line};
6877     $self->{column_prev} = $self->{column};
6878     $self->{column}++;
6879     $self->{nc}
6880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881     } else {
6882     $self->{set_nc}->($self);
6883     }
6884    
6885     redo A;
6886     } elsif ($self->{nc} == -1) {
6887     ## XML5: No parse error.
6888     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6889     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6890    
6891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892     $self->{line_prev} = $self->{line};
6893     $self->{column_prev} = $self->{column};
6894     $self->{column}++;
6895     $self->{nc}
6896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897     } else {
6898     $self->{set_nc}->($self);
6899     }
6900    
6901     return ($self->{ct});
6902     redo A;
6903     } else {
6904     ## XML5: Not defined yet.
6905     $self->{ca}->{type} .= chr $self->{nc};
6906     ## Stay in the state.
6907    
6908     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6909     $self->{line_prev} = $self->{line};
6910     $self->{column_prev} = $self->{column};
6911     $self->{column}++;
6912     $self->{nc}
6913     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6914     } else {
6915     $self->{set_nc}->($self);
6916     }
6917    
6918     redo A;
6919     }
6920     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6921     if ($is_space->{$self->{nc}}) {
6922     ## Stay in the state.
6923    
6924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6925     $self->{line_prev} = $self->{line};
6926     $self->{column_prev} = $self->{column};
6927     $self->{column}++;
6928     $self->{nc}
6929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6930     } else {
6931     $self->{set_nc}->($self);
6932     }
6933    
6934     redo A;
6935     } elsif ($self->{nc} == 0x0028) { # (
6936     ## XML5: Same as "anything else".
6937     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6938    
6939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6940     $self->{line_prev} = $self->{line};
6941     $self->{column_prev} = $self->{column};
6942     $self->{column}++;
6943     $self->{nc}
6944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6945     } else {
6946     $self->{set_nc}->($self);
6947     }
6948    
6949     redo A;
6950     } elsif ($self->{nc} == 0x0023) { # #
6951     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6952    
6953     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6954     $self->{line_prev} = $self->{line};
6955     $self->{column_prev} = $self->{column};
6956     $self->{column}++;
6957     $self->{nc}
6958     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6959     } else {
6960     $self->{set_nc}->($self);
6961     }
6962    
6963     redo A;
6964     } elsif ($self->{nc} == 0x0022) { # "
6965     ## XML5: Same as "anything else".
6966     $self->{ca}->{value} = '';
6967     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6968    
6969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6970     $self->{line_prev} = $self->{line};
6971     $self->{column_prev} = $self->{column};
6972     $self->{column}++;
6973     $self->{nc}
6974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6975     } else {
6976     $self->{set_nc}->($self);
6977     }
6978    
6979     redo A;
6980     } elsif ($self->{nc} == 0x0027) { # '
6981     ## XML5: Same as "anything else".
6982     $self->{ca}->{value} = '';
6983     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6984    
6985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6986     $self->{line_prev} = $self->{line};
6987     $self->{column_prev} = $self->{column};
6988     $self->{column}++;
6989     $self->{nc}
6990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6991     } else {
6992     $self->{set_nc}->($self);
6993     }
6994    
6995     redo A;
6996     } elsif ($self->{nc} == 0x003E) { # >
6997     ## XML5: Same as "anything else".
6998     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6999     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7000    
7001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002     $self->{line_prev} = $self->{line};
7003     $self->{column_prev} = $self->{column};
7004     $self->{column}++;
7005     $self->{nc}
7006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007     } else {
7008     $self->{set_nc}->($self);
7009     }
7010    
7011     return ($self->{ct}); # ATTLIST
7012     redo A;
7013     } elsif ($self->{nc} == -1) {
7014     ## XML5: No parse error.
7015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7016     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7017    
7018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7019     $self->{line_prev} = $self->{line};
7020     $self->{column_prev} = $self->{column};
7021     $self->{column}++;
7022     $self->{nc}
7023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7024     } else {
7025     $self->{set_nc}->($self);
7026     }
7027    
7028     return ($self->{ct});
7029     redo A;
7030     } else {
7031     ## XML5: Switch to the "DOCTYPE bogus comment state".
7032     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7033     $self->{ca}->{value} = '';
7034     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7035     ## Reconsume.
7036     redo A;
7037     }
7038     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7039     if ($is_space->{$self->{nc}}) {
7040     ## Stay in the state.
7041    
7042     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7043     $self->{line_prev} = $self->{line};
7044     $self->{column_prev} = $self->{column};
7045     $self->{column}++;
7046     $self->{nc}
7047     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7048     } else {
7049     $self->{set_nc}->($self);
7050     }
7051    
7052     redo A;
7053     } elsif ($self->{nc} == 0x007C) { # |
7054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7055     ## Stay in the state.
7056    
7057     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7058     $self->{line_prev} = $self->{line};
7059     $self->{column_prev} = $self->{column};
7060     $self->{column}++;
7061     $self->{nc}
7062     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7063     } else {
7064     $self->{set_nc}->($self);
7065     }
7066    
7067     redo A;
7068     } elsif ($self->{nc} == 0x0029) { # )
7069     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7070     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7071    
7072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7073     $self->{line_prev} = $self->{line};
7074     $self->{column_prev} = $self->{column};
7075     $self->{column}++;
7076     $self->{nc}
7077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7078     } else {
7079     $self->{set_nc}->($self);
7080     }
7081    
7082     redo A;
7083     } elsif ($self->{nc} == 0x003E) { # >
7084     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7085     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7086    
7087     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7088     $self->{line_prev} = $self->{line};
7089     $self->{column_prev} = $self->{column};
7090     $self->{column}++;
7091     $self->{nc}
7092     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7093     } else {
7094     $self->{set_nc}->($self);
7095     }
7096    
7097     return ($self->{ct}); # ATTLIST
7098     redo A;
7099     } elsif ($self->{nc} == -1) {
7100     ## XML5: No parse error.
7101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7102     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7103    
7104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7105     $self->{line_prev} = $self->{line};
7106     $self->{column_prev} = $self->{column};
7107     $self->{column}++;
7108     $self->{nc}
7109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7110     } else {
7111     $self->{set_nc}->($self);
7112     }
7113    
7114     return ($self->{ct});
7115     redo A;
7116     } else {
7117     push @{$self->{ca}->{tokens}}, chr $self->{nc};
7118     $self->{state} = ALLOWED_TOKEN_STATE;
7119    
7120     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7121     $self->{line_prev} = $self->{line};
7122     $self->{column_prev} = $self->{column};
7123     $self->{column}++;
7124     $self->{nc}
7125     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7126     } else {
7127     $self->{set_nc}->($self);
7128     }
7129    
7130     redo A;
7131     }
7132     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7133     if ($is_space->{$self->{nc}}) {
7134     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7135    
7136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7137     $self->{line_prev} = $self->{line};
7138     $self->{column_prev} = $self->{column};
7139     $self->{column}++;
7140     $self->{nc}
7141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7142     } else {
7143     $self->{set_nc}->($self);
7144     }
7145    
7146     redo A;
7147     } elsif ($self->{nc} == 0x007C) { # |
7148     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7149    
7150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7151     $self->{line_prev} = $self->{line};
7152     $self->{column_prev} = $self->{column};
7153     $self->{column}++;
7154     $self->{nc}
7155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7156     } else {
7157     $self->{set_nc}->($self);
7158     }
7159    
7160     redo A;
7161     } elsif ($self->{nc} == 0x0029) { # )
7162     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7163    
7164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7165     $self->{line_prev} = $self->{line};
7166     $self->{column_prev} = $self->{column};
7167     $self->{column}++;
7168     $self->{nc}
7169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7170     } else {
7171     $self->{set_nc}->($self);
7172     }
7173    
7174     redo A;
7175     } elsif ($self->{nc} == 0x003E) { # >
7176     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7177     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7178    
7179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7180     $self->{line_prev} = $self->{line};
7181     $self->{column_prev} = $self->{column};
7182     $self->{column}++;
7183     $self->{nc}
7184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7185     } else {
7186     $self->{set_nc}->($self);
7187     }
7188    
7189     return ($self->{ct}); # ATTLIST
7190     redo A;
7191     } elsif ($self->{nc} == -1) {
7192     ## XML5: No parse error.
7193     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7194     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7195    
7196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7197     $self->{line_prev} = $self->{line};
7198     $self->{column_prev} = $self->{column};
7199     $self->{column}++;
7200     $self->{nc}
7201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7202     } else {
7203     $self->{set_nc}->($self);
7204     }
7205    
7206     return ($self->{ct});
7207     redo A;
7208     } else {
7209     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7210     ## Stay in the state.
7211    
7212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213     $self->{line_prev} = $self->{line};
7214     $self->{column_prev} = $self->{column};
7215     $self->{column}++;
7216     $self->{nc}
7217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218     } else {
7219     $self->{set_nc}->($self);
7220     }
7221    
7222     redo A;
7223     }
7224     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7225     if ($is_space->{$self->{nc}}) {
7226     ## Stay in the state.
7227    
7228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229     $self->{line_prev} = $self->{line};
7230     $self->{column_prev} = $self->{column};
7231     $self->{column}++;
7232     $self->{nc}
7233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234     } else {
7235     $self->{set_nc}->($self);
7236     }
7237    
7238     redo A;
7239     } elsif ($self->{nc} == 0x007C) { # |
7240     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7241    
7242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7243     $self->{line_prev} = $self->{line};
7244     $self->{column_prev} = $self->{column};
7245     $self->{column}++;
7246     $self->{nc}
7247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7248     } else {
7249     $self->{set_nc}->($self);
7250     }
7251    
7252     redo A;
7253     } elsif ($self->{nc} == 0x0029) { # )
7254     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7255    
7256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7257     $self->{line_prev} = $self->{line};
7258     $self->{column_prev} = $self->{column};
7259     $self->{column}++;
7260     $self->{nc}
7261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7262     } else {
7263     $self->{set_nc}->($self);
7264     }
7265    
7266     redo A;
7267     } elsif ($self->{nc} == 0x003E) { # >
7268     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7269     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7270    
7271     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7272     $self->{line_prev} = $self->{line};
7273     $self->{column_prev} = $self->{column};
7274     $self->{column}++;
7275     $self->{nc}
7276     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7277     } else {
7278     $self->{set_nc}->($self);
7279     }
7280    
7281     return ($self->{ct}); # ATTLIST
7282     redo A;
7283     } elsif ($self->{nc} == -1) {
7284     ## XML5: No parse error.
7285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7286     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7287    
7288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7289     $self->{line_prev} = $self->{line};
7290     $self->{column_prev} = $self->{column};
7291     $self->{column}++;
7292     $self->{nc}
7293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7294     } else {
7295     $self->{set_nc}->($self);
7296     }
7297    
7298     return ($self->{ct});
7299     redo A;
7300     } else {
7301     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7302     line => $self->{line_prev},
7303     column => $self->{column_prev});
7304     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7305     $self->{state} = ALLOWED_TOKEN_STATE;
7306    
7307     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308     $self->{line_prev} = $self->{line};
7309     $self->{column_prev} = $self->{column};
7310     $self->{column}++;
7311     $self->{nc}
7312     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313     } else {
7314     $self->{set_nc}->($self);
7315     }
7316    
7317     redo A;
7318     }
7319     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7320     if ($is_space->{$self->{nc}}) {
7321     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7322    
7323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7324     $self->{line_prev} = $self->{line};
7325     $self->{column_prev} = $self->{column};
7326     $self->{column}++;
7327     $self->{nc}
7328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7329     } else {
7330     $self->{set_nc}->($self);
7331     }
7332    
7333     redo A;
7334     } elsif ($self->{nc} == 0x0023) { # #
7335     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7336     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7337    
7338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339     $self->{line_prev} = $self->{line};
7340     $self->{column_prev} = $self->{column};
7341     $self->{column}++;
7342     $self->{nc}
7343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344     } else {
7345     $self->{set_nc}->($self);
7346     }
7347    
7348     redo A;
7349     } elsif ($self->{nc} == 0x0022) { # "
7350     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7351     $self->{ca}->{value} = '';
7352     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7353    
7354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355     $self->{line_prev} = $self->{line};
7356     $self->{column_prev} = $self->{column};
7357     $self->{column}++;
7358     $self->{nc}
7359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360     } else {
7361     $self->{set_nc}->($self);
7362     }
7363    
7364     redo A;
7365     } elsif ($self->{nc} == 0x0027) { # '
7366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7367     $self->{ca}->{value} = '';
7368     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7369    
7370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371     $self->{line_prev} = $self->{line};
7372     $self->{column_prev} = $self->{column};
7373     $self->{column}++;
7374     $self->{nc}
7375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376     } else {
7377     $self->{set_nc}->($self);
7378     }
7379    
7380     redo A;
7381     } elsif ($self->{nc} == 0x003E) { # >
7382     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7383     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7384    
7385     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7386     $self->{line_prev} = $self->{line};
7387     $self->{column_prev} = $self->{column};
7388     $self->{column}++;
7389     $self->{nc}
7390     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7391     } else {
7392     $self->{set_nc}->($self);
7393     }
7394    
7395     return ($self->{ct}); # ATTLIST
7396     redo A;
7397     } elsif ($self->{nc} == -1) {
7398     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7399     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7400    
7401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402     $self->{line_prev} = $self->{line};
7403     $self->{column_prev} = $self->{column};
7404     $self->{column}++;
7405     $self->{nc}
7406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407     } else {
7408     $self->{set_nc}->($self);
7409     }
7410    
7411     return ($self->{ct});
7412     redo A;
7413     } else {
7414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7415     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7416     ## Reconsume.
7417     redo A;
7418     }
7419     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7420     if ($is_space->{$self->{nc}}) {
7421     ## Stay in the state.
7422    
7423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7424     $self->{line_prev} = $self->{line};
7425     $self->{column_prev} = $self->{column};
7426     $self->{column}++;
7427     $self->{nc}
7428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7429     } else {
7430     $self->{set_nc}->($self);
7431     }
7432    
7433     redo A;
7434     } elsif ($self->{nc} == 0x0023) { # #
7435     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7436    
7437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7438     $self->{line_prev} = $self->{line};
7439     $self->{column_prev} = $self->{column};
7440     $self->{column}++;
7441     $self->{nc}
7442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7443     } else {
7444     $self->{set_nc}->($self);
7445     }
7446    
7447     redo A;
7448     } elsif ($self->{nc} == 0x0022) { # "
7449     $self->{ca}->{value} = '';
7450     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7451    
7452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7453     $self->{line_prev} = $self->{line};
7454     $self->{column_prev} = $self->{column};
7455     $self->{column}++;
7456     $self->{nc}
7457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7458     } else {
7459     $self->{set_nc}->($self);
7460     }
7461    
7462     redo A;
7463     } elsif ($self->{nc} == 0x0027) { # '
7464     $self->{ca}->{value} = '';
7465     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7466    
7467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7468     $self->{line_prev} = $self->{line};
7469     $self->{column_prev} = $self->{column};
7470     $self->{column}++;
7471     $self->{nc}
7472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7473     } else {
7474     $self->{set_nc}->($self);
7475     }
7476    
7477     redo A;
7478     } elsif ($self->{nc} == 0x003E) { # >
7479     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7480     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7481    
7482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7483     $self->{line_prev} = $self->{line};
7484     $self->{column_prev} = $self->{column};
7485     $self->{column}++;
7486     $self->{nc}
7487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7488     } else {
7489     $self->{set_nc}->($self);
7490     }
7491    
7492     return ($self->{ct}); # ATTLIST
7493     redo A;
7494     } elsif ($self->{nc} == -1) {
7495     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7496     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7497    
7498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7499     $self->{line_prev} = $self->{line};
7500     $self->{column_prev} = $self->{column};
7501     $self->{column}++;
7502     $self->{nc}
7503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7504     } else {
7505     $self->{set_nc}->($self);
7506     }
7507    
7508     return ($self->{ct});
7509     redo A;
7510     } else {
7511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7512     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7513     ## Reconsume.
7514     redo A;
7515     }
7516     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7517     if ($is_space->{$self->{nc}}) {
7518     ## XML5: No parse error.
7519     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7520 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7521 wakaba 1.15 ## Reconsume.
7522     redo A;
7523     } elsif ($self->{nc} == 0x0022) { # "
7524     ## XML5: Same as "anything else".
7525     $self->{ca}->{value} = '';
7526     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7527    
7528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529     $self->{line_prev} = $self->{line};
7530     $self->{column_prev} = $self->{column};
7531     $self->{column}++;
7532     $self->{nc}
7533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534     } else {
7535     $self->{set_nc}->($self);
7536     }
7537    
7538     redo A;
7539     } elsif ($self->{nc} == 0x0027) { # '
7540     ## XML5: Same as "anything else".
7541     $self->{ca}->{value} = '';
7542     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7543    
7544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7545     $self->{line_prev} = $self->{line};
7546     $self->{column_prev} = $self->{column};
7547     $self->{column}++;
7548     $self->{nc}
7549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7550     } else {
7551     $self->{set_nc}->($self);
7552     }
7553    
7554     redo A;
7555     } elsif ($self->{nc} == 0x003E) { # >
7556     ## XML5: Same as "anything else".
7557     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7558     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7559    
7560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7561     $self->{line_prev} = $self->{line};
7562     $self->{column_prev} = $self->{column};
7563     $self->{column}++;
7564     $self->{nc}
7565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7566     } else {
7567     $self->{set_nc}->($self);
7568     }
7569    
7570     return ($self->{ct}); # ATTLIST
7571     redo A;
7572     } elsif ($self->{nc} == -1) {
7573     ## XML5: No parse error.
7574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7575     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7576    
7577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7578     $self->{line_prev} = $self->{line};
7579     $self->{column_prev} = $self->{column};
7580     $self->{column}++;
7581     $self->{nc}
7582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7583     } else {
7584     $self->{set_nc}->($self);
7585     }
7586    
7587     return ($self->{ct});
7588     redo A;
7589     } else {
7590     $self->{ca}->{default} = chr $self->{nc};
7591     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7592    
7593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7594     $self->{line_prev} = $self->{line};
7595     $self->{column_prev} = $self->{column};
7596     $self->{column}++;
7597     $self->{nc}
7598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7599     } else {
7600     $self->{set_nc}->($self);
7601     }
7602    
7603     redo A;
7604     }
7605     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7606     if ($is_space->{$self->{nc}}) {
7607     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7608    
7609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7610     $self->{line_prev} = $self->{line};
7611     $self->{column_prev} = $self->{column};
7612     $self->{column}++;
7613     $self->{nc}
7614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7615     } else {
7616     $self->{set_nc}->($self);
7617     }
7618    
7619     redo A;
7620     } elsif ($self->{nc} == 0x0022) { # "
7621     ## XML5: Same as "anything else".
7622     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7623     $self->{ca}->{value} = '';
7624     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7625    
7626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7627     $self->{line_prev} = $self->{line};
7628     $self->{column_prev} = $self->{column};
7629     $self->{column}++;
7630     $self->{nc}
7631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7632     } else {
7633     $self->{set_nc}->($self);
7634     }
7635    
7636     redo A;
7637     } elsif ($self->{nc} == 0x0027) { # '
7638     ## XML5: Same as "anything else".
7639     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7640     $self->{ca}->{value} = '';
7641     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7642    
7643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7644     $self->{line_prev} = $self->{line};
7645     $self->{column_prev} = $self->{column};
7646     $self->{column}++;
7647     $self->{nc}
7648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7649     } else {
7650     $self->{set_nc}->($self);
7651     }
7652    
7653     redo A;
7654     } elsif ($self->{nc} == 0x003E) { # >
7655     ## XML5: Same as "anything else".
7656     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7657     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7658    
7659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7660     $self->{line_prev} = $self->{line};
7661     $self->{column_prev} = $self->{column};
7662     $self->{column}++;
7663     $self->{nc}
7664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7665     } else {
7666     $self->{set_nc}->($self);
7667     }
7668    
7669     return ($self->{ct}); # ATTLIST
7670     redo A;
7671     } elsif ($self->{nc} == -1) {
7672     ## XML5: No parse error.
7673     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7674     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7675     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7676    
7677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678     $self->{line_prev} = $self->{line};
7679     $self->{column_prev} = $self->{column};
7680     $self->{column}++;
7681     $self->{nc}
7682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683     } else {
7684     $self->{set_nc}->($self);
7685     }
7686    
7687     return ($self->{ct});
7688     redo A;
7689     } else {
7690     $self->{ca}->{default} .= chr $self->{nc};
7691     ## Stay in the state.
7692    
7693     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7694     $self->{line_prev} = $self->{line};
7695     $self->{column_prev} = $self->{column};
7696     $self->{column}++;
7697     $self->{nc}
7698     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7699     } else {
7700     $self->{set_nc}->($self);
7701     }
7702    
7703     redo A;
7704     }
7705     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7706     if ($is_space->{$self->{nc}}) {
7707     ## Stay in the state.
7708    
7709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7710     $self->{line_prev} = $self->{line};
7711     $self->{column_prev} = $self->{column};
7712     $self->{column}++;
7713     $self->{nc}
7714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7715     } else {
7716     $self->{set_nc}->($self);
7717     }
7718    
7719     redo A;
7720     } elsif ($self->{nc} == 0x0022) { # "
7721     $self->{ca}->{value} = '';
7722     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7723    
7724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725     $self->{line_prev} = $self->{line};
7726     $self->{column_prev} = $self->{column};
7727     $self->{column}++;
7728     $self->{nc}
7729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730     } else {
7731     $self->{set_nc}->($self);
7732     }
7733    
7734     redo A;
7735     } elsif ($self->{nc} == 0x0027) { # '
7736     $self->{ca}->{value} = '';
7737     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7738    
7739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740     $self->{line_prev} = $self->{line};
7741     $self->{column_prev} = $self->{column};
7742     $self->{column}++;
7743     $self->{nc}
7744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745     } else {
7746     $self->{set_nc}->($self);
7747     }
7748    
7749     redo A;
7750     } elsif ($self->{nc} == 0x003E) { # >
7751     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753    
7754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755     $self->{line_prev} = $self->{line};
7756     $self->{column_prev} = $self->{column};
7757     $self->{column}++;
7758     $self->{nc}
7759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760     } else {
7761     $self->{set_nc}->($self);
7762     }
7763    
7764     return ($self->{ct}); # ATTLIST
7765     redo A;
7766     } elsif ($self->{nc} == -1) {
7767     ## XML5: No parse error.
7768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7769     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7770     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7771    
7772     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7773     $self->{line_prev} = $self->{line};
7774     $self->{column_prev} = $self->{column};
7775     $self->{column}++;
7776     $self->{nc}
7777     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7778     } else {
7779     $self->{set_nc}->($self);
7780     }
7781    
7782     return ($self->{ct});
7783     redo A;
7784     } else {
7785     ## XML5: Not defined yet.
7786     if ($self->{ca}->{default} eq 'FIXED') {
7787     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7788     } else {
7789     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7790     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7791     }
7792     ## Reconsume.
7793     redo A;
7794     }
7795     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7796     if ($is_space->{$self->{nc}} or
7797     $self->{nc} == -1 or
7798     $self->{nc} == 0x003E) { # >
7799     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7800     ## Reconsume.
7801     redo A;
7802     } else {
7803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7804     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7805     ## Reconsume.
7806     redo A;
7807 wakaba 1.16 }
7808 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7809     ## ASCII case-insensitive
7810     if ($self->{nc} == [
7811     undef,
7812     0x0044, # D
7813     0x0041, # A
7814     0x0054, # T
7815     ]->[length $self->{kwd}] or
7816     $self->{nc} == [
7817     undef,
7818     0x0064, # d
7819     0x0061, # a
7820     0x0074, # t
7821     ]->[length $self->{kwd}]) {
7822    
7823     ## Stay in the state.
7824     $self->{kwd} .= chr $self->{nc};
7825    
7826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827     $self->{line_prev} = $self->{line};
7828     $self->{column_prev} = $self->{column};
7829     $self->{column}++;
7830     $self->{nc}
7831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832     } else {
7833     $self->{set_nc}->($self);
7834     }
7835    
7836     redo A;
7837     } elsif ((length $self->{kwd}) == 4 and
7838     ($self->{nc} == 0x0041 or # A
7839     $self->{nc} == 0x0061)) { # a
7840     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7841    
7842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7843     text => 'NDATA',
7844     line => $self->{line_prev},
7845     column => $self->{column_prev} - 4);
7846     } else {
7847    
7848     }
7849     $self->{state} = AFTER_NDATA_STATE;
7850    
7851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7852     $self->{line_prev} = $self->{line};
7853     $self->{column_prev} = $self->{column};
7854     $self->{column}++;
7855     $self->{nc}
7856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7857     } else {
7858     $self->{set_nc}->($self);
7859     }
7860    
7861     redo A;
7862     } else {
7863     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7864     line => $self->{line_prev},
7865     column => $self->{column_prev} + 1
7866     - length $self->{kwd});
7867    
7868     $self->{state} = BOGUS_MD_STATE;
7869     ## Reconsume.
7870     redo A;
7871     }
7872     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7873     if ($is_space->{$self->{nc}}) {
7874     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7875    
7876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7877     $self->{line_prev} = $self->{line};
7878     $self->{column_prev} = $self->{column};
7879     $self->{column}++;
7880     $self->{nc}
7881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7882     } else {
7883     $self->{set_nc}->($self);
7884     }
7885    
7886     redo A;
7887     } elsif ($self->{nc} == 0x003E) { # >
7888     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7889     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7890    
7891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7892     $self->{line_prev} = $self->{line};
7893     $self->{column_prev} = $self->{column};
7894     $self->{column}++;
7895     $self->{nc}
7896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7897     } else {
7898     $self->{set_nc}->($self);
7899     }
7900    
7901     return ($self->{ct}); # ENTITY
7902     redo A;
7903     } elsif ($self->{nc} == -1) {
7904     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7905     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7906    
7907     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7908     $self->{line_prev} = $self->{line};
7909     $self->{column_prev} = $self->{column};
7910     $self->{column}++;
7911     $self->{nc}
7912     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7913     } else {
7914     $self->{set_nc}->($self);
7915     }
7916    
7917     return ($self->{ct}); # ENTITY
7918     redo A;
7919     } else {
7920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7921     line => $self->{line_prev},
7922     column => $self->{column_prev} + 1
7923     - length $self->{kwd});
7924     $self->{state} = BOGUS_MD_STATE;
7925     ## Reconsume.
7926     redo A;
7927     }
7928     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7929     if ($is_space->{$self->{nc}}) {
7930     ## Stay in the state.
7931    
7932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7933     $self->{line_prev} = $self->{line};
7934     $self->{column_prev} = $self->{column};
7935     $self->{column}++;
7936     $self->{nc}
7937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7938     } else {
7939     $self->{set_nc}->($self);
7940     }
7941    
7942     redo A;
7943     } elsif ($self->{nc} == 0x003E) { # >
7944     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7945     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7946    
7947     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7948     $self->{line_prev} = $self->{line};
7949     $self->{column_prev} = $self->{column};
7950     $self->{column}++;
7951     $self->{nc}
7952     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7953     } else {
7954     $self->{set_nc}->($self);
7955     }
7956    
7957     return ($self->{ct}); # ENTITY
7958     redo A;
7959     } elsif ($self->{nc} == -1) {
7960     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7961     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7962    
7963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964     $self->{line_prev} = $self->{line};
7965     $self->{column_prev} = $self->{column};
7966     $self->{column}++;
7967     $self->{nc}
7968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969     } else {
7970     $self->{set_nc}->($self);
7971     }
7972    
7973     return ($self->{ct}); # ENTITY
7974     redo A;
7975     } else {
7976     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7977     $self->{state} = NOTATION_NAME_STATE;
7978    
7979     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7980     $self->{line_prev} = $self->{line};
7981     $self->{column_prev} = $self->{column};
7982     $self->{column}++;
7983     $self->{nc}
7984     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7985     } else {
7986     $self->{set_nc}->($self);
7987     }
7988    
7989     redo A;
7990     }
7991     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7992     if ($is_space->{$self->{nc}}) {
7993 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7994 wakaba 1.18
7995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7996     $self->{line_prev} = $self->{line};
7997     $self->{column_prev} = $self->{column};
7998     $self->{column}++;
7999     $self->{nc}
8000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8001     } else {
8002     $self->{set_nc}->($self);
8003     }
8004    
8005     redo A;
8006     } elsif ($self->{nc} == 0x003E) { # >
8007     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8008    
8009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8010     $self->{line_prev} = $self->{line};
8011     $self->{column_prev} = $self->{column};
8012     $self->{column}++;
8013     $self->{nc}
8014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8015     } else {
8016     $self->{set_nc}->($self);
8017     }
8018    
8019     return ($self->{ct}); # ENTITY
8020     redo A;
8021     } elsif ($self->{nc} == -1) {
8022     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8023     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8024    
8025     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8026     $self->{line_prev} = $self->{line};
8027     $self->{column_prev} = $self->{column};
8028     $self->{column}++;
8029     $self->{nc}
8030     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8031     } else {
8032     $self->{set_nc}->($self);
8033     }
8034    
8035     return ($self->{ct}); # ENTITY
8036     redo A;
8037     } else {
8038     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8039     ## Stay in the state.
8040    
8041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8042     $self->{line_prev} = $self->{line};
8043     $self->{column_prev} = $self->{column};
8044     $self->{column}++;
8045     $self->{nc}
8046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8047     } else {
8048     $self->{set_nc}->($self);
8049     }
8050    
8051     redo A;
8052     }
8053 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8054     if ($self->{nc} == 0x0022) { # "
8055 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
8056 wakaba 1.19
8057     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8058     $self->{line_prev} = $self->{line};
8059     $self->{column_prev} = $self->{column};
8060     $self->{column}++;
8061     $self->{nc}
8062     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8063     } else {
8064     $self->{set_nc}->($self);
8065     }
8066    
8067     redo A;
8068     } elsif ($self->{nc} == 0x0026) { # &
8069     $self->{prev_state} = $self->{state};
8070     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8071     $self->{entity_add} = 0x0022; # "
8072    
8073     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8074     $self->{line_prev} = $self->{line};
8075     $self->{column_prev} = $self->{column};
8076     $self->{column}++;
8077     $self->{nc}
8078     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8079     } else {
8080     $self->{set_nc}->($self);
8081     }
8082    
8083     redo A;
8084     ## TODO: %
8085     } elsif ($self->{nc} == -1) {
8086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8087     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8088     ## Reconsume.
8089     return ($self->{ct}); # ENTITY
8090     redo A;
8091     } else {
8092     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8093    
8094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095     $self->{line_prev} = $self->{line};
8096     $self->{column_prev} = $self->{column};
8097     $self->{column}++;
8098     $self->{nc}
8099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100     } else {
8101     $self->{set_nc}->($self);
8102     }
8103    
8104     redo A;
8105     }
8106     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8107     if ($self->{nc} == 0x0027) { # '
8108 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
8109 wakaba 1.19
8110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8111     $self->{line_prev} = $self->{line};
8112     $self->{column_prev} = $self->{column};
8113     $self->{column}++;
8114     $self->{nc}
8115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8116     } else {
8117     $self->{set_nc}->($self);
8118     }
8119    
8120     redo A;
8121     } elsif ($self->{nc} == 0x0026) { # &
8122     $self->{prev_state} = $self->{state};
8123     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8124     $self->{entity_add} = 0x0027; # '
8125    
8126     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127     $self->{line_prev} = $self->{line};
8128     $self->{column_prev} = $self->{column};
8129     $self->{column}++;
8130     $self->{nc}
8131     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132     } else {
8133     $self->{set_nc}->($self);
8134     }
8135    
8136     redo A;
8137     ## TODO: %
8138     } elsif ($self->{nc} == -1) {
8139     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8140     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141     ## Reconsume.
8142     return ($self->{ct}); # ENTITY
8143     redo A;
8144     } else {
8145     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8146    
8147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8148     $self->{line_prev} = $self->{line};
8149     $self->{column_prev} = $self->{column};
8150     $self->{column}++;
8151     $self->{nc}
8152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8153     } else {
8154     $self->{set_nc}->($self);
8155     }
8156    
8157     redo A;
8158     }
8159     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8160     if ($is_space->{$self->{nc}} or
8161     {
8162     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8163     $self->{entity_add} => 1,
8164     }->{$self->{nc}}) {
8165 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8166     line => $self->{line_prev},
8167     column => $self->{column_prev}
8168     + ($self->{nc} == -1 ? 1 : 0));
8169 wakaba 1.19 ## Don't consume
8170     ## Return nothing.
8171     #
8172     } elsif ($self->{nc} == 0x0023) { # #
8173     $self->{ca} = $self->{ct};
8174     $self->{state} = ENTITY_HASH_STATE;
8175     $self->{kwd} = '#';
8176    
8177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178     $self->{line_prev} = $self->{line};
8179     $self->{column_prev} = $self->{column};
8180     $self->{column}++;
8181     $self->{nc}
8182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183     } else {
8184     $self->{set_nc}->($self);
8185     }
8186    
8187     redo A;
8188     } else {
8189     #
8190     }
8191    
8192     $self->{ct}->{value} .= '&';
8193     $self->{state} = $self->{prev_state};
8194     ## Reconsume.
8195     redo A;
8196 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8197     if ($is_space->{$self->{nc}}) {
8198     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8199    
8200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8201     $self->{line_prev} = $self->{line};
8202     $self->{column_prev} = $self->{column};
8203     $self->{column}++;
8204     $self->{nc}
8205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8206     } else {
8207     $self->{set_nc}->($self);
8208     }
8209    
8210     redo A;
8211     } elsif ($self->{nc} == 0x0028) { # (
8212     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8213     $self->{ct}->{content} = ['('];
8214     $self->{group_depth} = 1;
8215    
8216     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8217     $self->{line_prev} = $self->{line};
8218     $self->{column_prev} = $self->{column};
8219     $self->{column}++;
8220     $self->{nc}
8221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8222     } else {
8223     $self->{set_nc}->($self);
8224     }
8225    
8226     redo A;
8227     } elsif ($self->{nc} == 0x003E) { # >
8228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8229     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8230    
8231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8232     $self->{line_prev} = $self->{line};
8233     $self->{column_prev} = $self->{column};
8234     $self->{column}++;
8235     $self->{nc}
8236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8237     } else {
8238     $self->{set_nc}->($self);
8239     }
8240    
8241     return ($self->{ct}); # ELEMENT
8242     redo A;
8243     } elsif ($self->{nc} == -1) {
8244     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8246    
8247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8248     $self->{line_prev} = $self->{line};
8249     $self->{column_prev} = $self->{column};
8250     $self->{column}++;
8251     $self->{nc}
8252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8253     } else {
8254     $self->{set_nc}->($self);
8255     }
8256    
8257     return ($self->{ct}); # ELEMENT
8258     redo A;
8259     } else {
8260     $self->{ct}->{content} = [chr $self->{nc}];
8261     $self->{state} = CONTENT_KEYWORD_STATE;
8262    
8263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264     $self->{line_prev} = $self->{line};
8265     $self->{column_prev} = $self->{column};
8266     $self->{column}++;
8267     $self->{nc}
8268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269     } else {
8270     $self->{set_nc}->($self);
8271     }
8272    
8273     redo A;
8274     }
8275     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8276     if ($is_space->{$self->{nc}}) {
8277     $self->{state} = AFTER_MD_DEF_STATE;
8278    
8279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280     $self->{line_prev} = $self->{line};
8281     $self->{column_prev} = $self->{column};
8282     $self->{column}++;
8283     $self->{nc}
8284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285     } else {
8286     $self->{set_nc}->($self);
8287     }
8288    
8289     redo A;
8290     } elsif ($self->{nc} == 0x003E) { # >
8291     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8292    
8293     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8294     $self->{line_prev} = $self->{line};
8295     $self->{column_prev} = $self->{column};
8296     $self->{column}++;
8297     $self->{nc}
8298     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8299     } else {
8300     $self->{set_nc}->($self);
8301     }
8302    
8303     return ($self->{ct}); # ELEMENT
8304     redo A;
8305     } elsif ($self->{nc} == -1) {
8306     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8307     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8308    
8309     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8310     $self->{line_prev} = $self->{line};
8311     $self->{column_prev} = $self->{column};
8312     $self->{column}++;
8313     $self->{nc}
8314     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8315     } else {
8316     $self->{set_nc}->($self);
8317     }
8318    
8319     return ($self->{ct}); # ELEMENT
8320     redo A;
8321     } else {
8322     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8323     ## Stay in the state.
8324    
8325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8326     $self->{line_prev} = $self->{line};
8327     $self->{column_prev} = $self->{column};
8328     $self->{column}++;
8329     $self->{nc}
8330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8331     } else {
8332     $self->{set_nc}->($self);
8333     }
8334    
8335     redo A;
8336     }
8337     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8338     if ($is_space->{$self->{nc}}) {
8339     ## Stay in the state.
8340    
8341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342     $self->{line_prev} = $self->{line};
8343     $self->{column_prev} = $self->{column};
8344     $self->{column}++;
8345     $self->{nc}
8346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347     } else {
8348     $self->{set_nc}->($self);
8349     }
8350    
8351     redo A;
8352     } elsif ($self->{nc} == 0x0028) { # (
8353     $self->{group_depth}++;
8354     push @{$self->{ct}->{content}}, chr $self->{nc};
8355     ## Stay in the state.
8356    
8357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358     $self->{line_prev} = $self->{line};
8359     $self->{column_prev} = $self->{column};
8360     $self->{column}++;
8361     $self->{nc}
8362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363     } else {
8364     $self->{set_nc}->($self);
8365     }
8366    
8367     redo A;
8368     } elsif ($self->{nc} == 0x007C or # |
8369     $self->{nc} == 0x002C) { # ,
8370     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8371     ## Stay in the state.
8372    
8373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8374     $self->{line_prev} = $self->{line};
8375     $self->{column_prev} = $self->{column};
8376     $self->{column}++;
8377     $self->{nc}
8378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8379     } else {
8380     $self->{set_nc}->($self);
8381     }
8382    
8383     redo A;
8384     } elsif ($self->{nc} == 0x0029) { # )
8385     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8386     push @{$self->{ct}->{content}}, chr $self->{nc};
8387     $self->{group_depth}--;
8388     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8389    
8390     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8391     $self->{line_prev} = $self->{line};
8392     $self->{column_prev} = $self->{column};
8393     $self->{column}++;
8394     $self->{nc}
8395     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8396     } else {
8397     $self->{set_nc}->($self);
8398     }
8399    
8400     redo A;
8401     } elsif ($self->{nc} == 0x003E) { # >
8402     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8403     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8404     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8405    
8406     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8407     $self->{line_prev} = $self->{line};
8408     $self->{column_prev} = $self->{column};
8409     $self->{column}++;
8410     $self->{nc}
8411     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8412     } else {
8413     $self->{set_nc}->($self);
8414     }
8415    
8416     return ($self->{ct}); # ELEMENT
8417     redo A;
8418     } elsif ($self->{nc} == -1) {
8419     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8420     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8421     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8422    
8423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8424     $self->{line_prev} = $self->{line};
8425     $self->{column_prev} = $self->{column};
8426     $self->{column}++;
8427     $self->{nc}
8428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8429     } else {
8430     $self->{set_nc}->($self);
8431     }
8432    
8433     return ($self->{ct}); # ELEMENT
8434     redo A;
8435     } else {
8436     push @{$self->{ct}->{content}}, chr $self->{nc};
8437     $self->{state} = CM_ELEMENT_NAME_STATE;
8438    
8439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440     $self->{line_prev} = $self->{line};
8441     $self->{column_prev} = $self->{column};
8442     $self->{column}++;
8443     $self->{nc}
8444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445     } else {
8446     $self->{set_nc}->($self);
8447     }
8448    
8449     redo A;
8450     }
8451     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8452     if ($is_space->{$self->{nc}}) {
8453     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8454    
8455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8456     $self->{line_prev} = $self->{line};
8457     $self->{column_prev} = $self->{column};
8458     $self->{column}++;
8459     $self->{nc}
8460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8461     } else {
8462     $self->{set_nc}->($self);
8463     }
8464    
8465     redo A;
8466     } elsif ($self->{nc} == 0x002A or # *
8467     $self->{nc} == 0x002B or # +
8468     $self->{nc} == 0x003F) { # ?
8469     push @{$self->{ct}->{content}}, chr $self->{nc};
8470     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8471    
8472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8473     $self->{line_prev} = $self->{line};
8474     $self->{column_prev} = $self->{column};
8475     $self->{column}++;
8476     $self->{nc}
8477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8478     } else {
8479     $self->{set_nc}->($self);
8480     }
8481    
8482     redo A;
8483     } elsif ($self->{nc} == 0x007C or # |
8484     $self->{nc} == 0x002C) { # ,
8485     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8486     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8487    
8488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8489     $self->{line_prev} = $self->{line};
8490     $self->{column_prev} = $self->{column};
8491     $self->{column}++;
8492     $self->{nc}
8493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8494     } else {
8495     $self->{set_nc}->($self);
8496     }
8497    
8498     redo A;
8499     } elsif ($self->{nc} == 0x0029) { # )
8500     $self->{group_depth}--;
8501     push @{$self->{ct}->{content}}, chr $self->{nc};
8502     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8503    
8504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8505     $self->{line_prev} = $self->{line};
8506     $self->{column_prev} = $self->{column};
8507     $self->{column}++;
8508     $self->{nc}
8509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8510     } else {
8511     $self->{set_nc}->($self);
8512     }
8513    
8514     redo A;
8515     } elsif ($self->{nc} == 0x003E) { # >
8516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8517     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8519    
8520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8521     $self->{line_prev} = $self->{line};
8522     $self->{column_prev} = $self->{column};
8523     $self->{column}++;
8524     $self->{nc}
8525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8526     } else {
8527     $self->{set_nc}->($self);
8528     }
8529    
8530     return ($self->{ct}); # ELEMENT
8531     redo A;
8532     } elsif ($self->{nc} == -1) {
8533     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8534     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8535     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8536    
8537     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8538     $self->{line_prev} = $self->{line};
8539     $self->{column_prev} = $self->{column};
8540     $self->{column}++;
8541     $self->{nc}
8542     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8543     } else {
8544     $self->{set_nc}->($self);
8545     }
8546    
8547     return ($self->{ct}); # ELEMENT
8548     redo A;
8549     } else {
8550     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8551     ## Stay in the state.
8552    
8553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8554     $self->{line_prev} = $self->{line};
8555     $self->{column_prev} = $self->{column};
8556     $self->{column}++;
8557     $self->{nc}
8558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8559     } else {
8560     $self->{set_nc}->($self);
8561     }
8562    
8563     redo A;
8564     }
8565     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8566     if ($is_space->{$self->{nc}}) {
8567     ## Stay in the state.
8568    
8569     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8570     $self->{line_prev} = $self->{line};
8571     $self->{column_prev} = $self->{column};
8572     $self->{column}++;
8573     $self->{nc}
8574     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8575     } else {
8576     $self->{set_nc}->($self);
8577     }
8578    
8579     redo A;
8580     } elsif ($self->{nc} == 0x007C or # |
8581     $self->{nc} == 0x002C) { # ,
8582     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8583     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8584    
8585     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8586     $self->{line_prev} = $self->{line};
8587     $self->{column_prev} = $self->{column};
8588     $self->{column}++;
8589     $self->{nc}
8590     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8591     } else {
8592     $self->{set_nc}->($self);
8593     }
8594    
8595     redo A;
8596     } elsif ($self->{nc} == 0x0029) { # )
8597     $self->{group_depth}--;
8598     push @{$self->{ct}->{content}}, chr $self->{nc};
8599     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8600    
8601     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8602     $self->{line_prev} = $self->{line};
8603     $self->{column_prev} = $self->{column};
8604     $self->{column}++;
8605     $self->{nc}
8606     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8607     } else {
8608     $self->{set_nc}->($self);
8609     }
8610    
8611     redo A;
8612     } elsif ($self->{nc} == 0x003E) { # >
8613     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8614     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8615     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8616    
8617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8618     $self->{line_prev} = $self->{line};
8619     $self->{column_prev} = $self->{column};
8620     $self->{column}++;
8621     $self->{nc}
8622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8623     } else {
8624     $self->{set_nc}->($self);
8625     }
8626    
8627     return ($self->{ct}); # ELEMENT
8628     redo A;
8629     } elsif ($self->{nc} == -1) {
8630     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8631     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8632     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633    
8634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8635     $self->{line_prev} = $self->{line};
8636     $self->{column_prev} = $self->{column};
8637     $self->{column}++;
8638     $self->{nc}
8639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8640     } else {
8641     $self->{set_nc}->($self);
8642     }
8643    
8644     return ($self->{ct}); # ELEMENT
8645     redo A;
8646     } else {
8647     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8648     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8649     $self->{state} = BOGUS_MD_STATE;
8650    
8651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8652     $self->{line_prev} = $self->{line};
8653     $self->{column_prev} = $self->{column};
8654     $self->{column}++;
8655     $self->{nc}
8656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8657     } else {
8658     $self->{set_nc}->($self);
8659     }
8660    
8661     redo A;
8662     }
8663     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8664     if ($is_space->{$self->{nc}}) {
8665     if ($self->{group_depth}) {
8666     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8667     } else {
8668     $self->{state} = AFTER_MD_DEF_STATE;
8669     }
8670    
8671     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8672     $self->{line_prev} = $self->{line};
8673     $self->{column_prev} = $self->{column};
8674     $self->{column}++;
8675     $self->{nc}
8676     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8677     } else {
8678     $self->{set_nc}->($self);
8679     }
8680    
8681     redo A;
8682     } elsif ($self->{nc} == 0x002A or # *
8683     $self->{nc} == 0x002B or # +
8684     $self->{nc} == 0x003F) { # ?
8685     push @{$self->{ct}->{content}}, chr $self->{nc};
8686     if ($self->{group_depth}) {
8687     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8688     } else {
8689     $self->{state} = AFTER_MD_DEF_STATE;
8690     }
8691    
8692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8693     $self->{line_prev} = $self->{line};
8694     $self->{column_prev} = $self->{column};
8695     $self->{column}++;
8696     $self->{nc}
8697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8698     } else {
8699     $self->{set_nc}->($self);
8700     }
8701    
8702     redo A;
8703     } elsif ($self->{nc} == 0x0029) { # )
8704     if ($self->{group_depth}) {
8705     $self->{group_depth}--;
8706     push @{$self->{ct}->{content}}, chr $self->{nc};
8707     ## Stay in the state.
8708    
8709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8710     $self->{line_prev} = $self->{line};
8711     $self->{column_prev} = $self->{column};
8712     $self->{column}++;
8713     $self->{nc}
8714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8715     } else {
8716     $self->{set_nc}->($self);
8717     }
8718    
8719     redo A;
8720     } else {
8721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8722     $self->{state} = BOGUS_MD_STATE;
8723     ## Reconsume.
8724     redo A;
8725     }
8726     } elsif ($self->{nc} == 0x003E) { # >
8727     if ($self->{group_depth}) {
8728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8729     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8730     }
8731     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8732    
8733     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8734     $self->{line_prev} = $self->{line};
8735     $self->{column_prev} = $self->{column};
8736     $self->{column}++;
8737     $self->{nc}
8738     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8739     } else {
8740     $self->{set_nc}->($self);
8741     }
8742    
8743     return ($self->{ct}); # ELEMENT
8744     redo A;
8745     } elsif ($self->{nc} == -1) {
8746     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8747     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8748     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8749    
8750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8751     $self->{line_prev} = $self->{line};
8752     $self->{column_prev} = $self->{column};
8753     $self->{column}++;
8754     $self->{nc}
8755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8756     } else {
8757     $self->{set_nc}->($self);
8758     }
8759    
8760     return ($self->{ct}); # ELEMENT
8761     redo A;
8762     } else {
8763     if ($self->{group_depth}) {
8764     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8765     } else {
8766     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8767     $self->{state} = BOGUS_MD_STATE;
8768     }
8769     ## Reconsume.
8770     redo A;
8771     }
8772     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8773 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8774     ## Stay in the state.
8775    
8776     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8777     $self->{line_prev} = $self->{line};
8778     $self->{column_prev} = $self->{column};
8779     $self->{column}++;
8780     $self->{nc}
8781     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8782     } else {
8783     $self->{set_nc}->($self);
8784     }
8785    
8786     redo A;
8787     } elsif ($self->{nc} == 0x003E) { # >
8788     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8789    
8790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8791     $self->{line_prev} = $self->{line};
8792     $self->{column_prev} = $self->{column};
8793     $self->{column}++;
8794     $self->{nc}
8795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8796     } else {
8797     $self->{set_nc}->($self);
8798     }
8799    
8800 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8801 wakaba 1.18 redo A;
8802     } elsif ($self->{nc} == -1) {
8803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8804     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8805    
8806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8807     $self->{line_prev} = $self->{line};
8808     $self->{column_prev} = $self->{column};
8809     $self->{column}++;
8810     $self->{nc}
8811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8812     } else {
8813     $self->{set_nc}->($self);
8814     }
8815    
8816 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8817 wakaba 1.18 redo A;
8818     } else {
8819 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8820 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8821     ## Reconsume.
8822     redo A;
8823     }
8824 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8825     if ($self->{nc} == 0x003E) { # >
8826     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8827    
8828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8829     $self->{line_prev} = $self->{line};
8830     $self->{column_prev} = $self->{column};
8831     $self->{column}++;
8832     $self->{nc}
8833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8834     } else {
8835     $self->{set_nc}->($self);
8836     }
8837    
8838     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8839     redo A;
8840     } elsif ($self->{nc} == -1) {
8841     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8842     ## Reconsume.
8843     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8844     redo A;
8845     } else {
8846     ## Stay in the state.
8847    
8848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8849     $self->{line_prev} = $self->{line};
8850     $self->{column_prev} = $self->{column};
8851     $self->{column}++;
8852     $self->{nc}
8853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8854     } else {
8855     $self->{set_nc}->($self);
8856     }
8857    
8858     redo A;
8859     }
8860 wakaba 1.1 } else {
8861     die "$0: $self->{state}: Unknown state";
8862     }
8863     } # A
8864    
8865     die "$0: _get_next_token: unexpected case";
8866     } # _get_next_token
8867    
8868     1;
8869 wakaba 1.34 ## $Date: 2009/09/05 10:41:07 $
8870 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24